@@ -3,13 +3,15 @@ gpu.module @test_module {
// Test input: creates 16x32 f16 tiles with order = [0, 1] over a statically
// shaped 512x128 memref with strides [1, 512], then loads and prefetches them.
// The expected output later in this file shows each tile re-created as 32x16
// over a memref.reinterpret_cast result (128x512, strides [512, 1]), with the
// init_tile / update_tile_offset operands in swapped order and a
// xetile.transpose after each load.
33 gpu.func @test_static_memref (%arg0 : memref <512 x128 xf16 , strided <[1 , 512 ], offset :0 >>, %arg1 : index , %arg2 : index ) {
44 %0 = xetile.init_tile %arg0 [%arg1 , %arg2 ] : memref <512 x128 xf16 , strided <[1 , 512 ], offset :0 >> -> !xetile.tile <16 x32 xf16 , #xetile.tile_attr <order =[0 ,1 ]>>
55 %3 = xetile.load_tile %0 : !xetile.tile <16 x32 xf16 , #xetile.tile_attr <order =[0 ,1 ]>> -> vector <16 x32 xf16 >
// Prefetch of the same tile (%0) that was just loaded.
6+ xetile.prefetch_tile %0 : !xetile.tile <16 x32 xf16 , #xetile.tile_attr <order =[0 ,1 ]>>
67 // With static offsets
78 %1 = xetile.init_tile %arg0 [12 , %arg1 ] : memref <512 x128 xf16 , strided <[1 , 512 ], offset :0 >> -> !xetile.tile <16 x32 xf16 , #xetile.tile_attr <order =[0 ,1 ]>>
89 // Update offsets
910 %c16 = arith.constant 16 : index
1011 %c32 = arith.constant 32 : index
1112 %2 = xetile.update_tile_offset %1 , [%c32 , %c16 ] : !xetile.tile <16 x32 xf16 , #xetile.tile_attr <order =[0 ,1 ]>>, index , index -> !xetile.tile <16 x32 xf16 , #xetile.tile_attr <order =[0 ,1 ]>>
1213 %4 = xetile.load_tile %2 : !xetile.tile <16 x32 xf16 , #xetile.tile_attr <order =[0 ,1 ]>> -> vector <16 x32 xf16 >
// Note: this prefetch uses %1 (the tile before the offset update), not %2;
// the expected output accordingly prefetches the second init_tile result.
14+ xetile.prefetch_tile %1 : !xetile.tile <16 x32 xf16 , #xetile.tile_attr <order =[0 ,1 ]>>
1315 gpu.return
1416 }
1517}
@@ -23,22 +25,25 @@ gpu.module @test_module {
2325// CHECK: %[[T0:.*]] = xetile.init_tile %[[RCAST]][%[[ARG2]], %[[ARG1]]] : memref<128x512xf16, strided<[512, 1]>> -> !xetile.tile<32x16xf16, #xetile.tile_attr<>>
2426// CHECK: %[[T1:.*]] = xetile.load_tile %[[T0]] { padding = 0.000000e+00 : f32 } : !xetile.tile<32x16xf16, #xetile.tile_attr<>> -> vector<32x16xf16>
2527// CHECK: %[[T2:.*]] = xetile.transpose %[[T1]], [1, 0] : vector<32x16xf16> -> vector<16x32xf16>
28+ // CHECK: xetile.prefetch_tile %[[T0]] : !xetile.tile<32x16xf16, #xetile.tile_attr<>>
2629// CHECK: %[[RCAST0:.*]] = memref.reinterpret_cast %[[ARG0]] to offset: [0], sizes: [128, 512], strides: [512, 1] : memref<512x128xf16, strided<[1, 512]>> to memref<128x512xf16, strided<[512, 1]>>
2730// CHECK: %[[T3:.*]] = xetile.init_tile %[[RCAST0]][%[[ARG1]], 12] : memref<128x512xf16, strided<[512, 1]>> -> !xetile.tile<32x16xf16, #xetile.tile_attr<>>
2831// CHECK: %[[T4:.*]] = xetile.update_tile_offset %[[T3]], [%[[C16]], %[[C32]]] : !xetile.tile<32x16xf16, #xetile.tile_attr<>>, index, index -> !xetile.tile<32x16xf16, #xetile.tile_attr<>>
2932// CHECK: %[[T5:.*]] = xetile.load_tile %[[T4]] { padding = 0.000000e+00 : f32 } : !xetile.tile<32x16xf16, #xetile.tile_attr<>> -> vector<32x16xf16>
3033// CHECK: %[[T6:.*]] = xetile.transpose %[[T5]], [1, 0] : vector<32x16xf16> -> vector<16x32xf16>
34+ // CHECK: xetile.prefetch_tile %[[T3]] : !xetile.tile<32x16xf16, #xetile.tile_attr<>>
3135
3236// -----
// Test input: same load / update-offset / prefetch sequence as the static
// memref case, but on a dynamically shaped memref<?x?xf16>.
3337gpu.module @test_module {
3438 gpu.func @test_dynamic_memref (%arg0 : memref <?x?xf16 >, %arg1 : index , %arg2 : index , %arg3 : index , %arg4 : index , %arg5 : index , %arg6 : index ) {
// init_tile takes three index pairs here; the first is the tile offsets.
// NOTE(review): the second and third pairs are presumably the dynamic shape
// and strides of %arg0 -- confirm against the xetile.init_tile op definition.
3539 %0 = xetile.init_tile %arg0 [%arg1 , %arg2 ], [%arg3 , %arg4 ], [%arg5 , %arg6 ] : memref <?x?xf16 > -> !xetile.tile <16 x32 xf16 , #xetile.tile_attr <order =[0 ,1 ]>>
// NOTE(review): the next two lines are the removed ('-') and added ('+')
// forms of the same load in this diff rendering; the added form drops the
// `%1 =` result binding.
36- %1 = xetile.load_tile %0 : !xetile.tile <16 x32 xf16 , #xetile.tile_attr <order =[0 ,1 ]>> -> vector <16 x32 xf16 >
40+ xetile.load_tile %0 : !xetile.tile <16 x32 xf16 , #xetile.tile_attr <order =[0 ,1 ]>> -> vector <16 x32 xf16 >
3741 // Update offsets
3842 %c16 = arith.constant 16 : index
3943 %c32 = arith.constant 32 : index
4044 %2 = xetile.update_tile_offset %0 , [%c32 , %c16 ] : !xetile.tile <16 x32 xf16 , #xetile.tile_attr <order =[0 ,1 ]>>, index , index -> !xetile.tile <16 x32 xf16 , #xetile.tile_attr <order =[0 ,1 ]>>
4145 %3 = xetile.load_tile %2 : !xetile.tile <16 x32 xf16 , #xetile.tile_attr <order =[0 ,1 ]>> -> vector <16 x32 xf16 >
// Unlike the static case, this prefetch uses %2, the offset-updated tile.
46+ xetile.prefetch_tile %2 : !xetile.tile <16 x32 xf16 , #xetile.tile_attr <order =[0 ,1 ]>>
4247 gpu.return
4348 }
4449}
@@ -59,6 +64,7 @@ gpu.module @test_module {
5964// CHECK: %[[T3:.*]] = xetile.update_tile_offset %[[T0]], [%[[C16]], %[[C32]]] : !xetile.tile<32x16xf16, #xetile.tile_attr<>>, index, index -> !xetile.tile<32x16xf16, #xetile.tile_attr<>>
6065// CHECK: %[[T4:.*]] = xetile.load_tile %[[T3]] { padding = 0.000000e+00 : f32 } : !xetile.tile<32x16xf16, #xetile.tile_attr<>> -> vector<32x16xf16>
6166// CHECK: %[[T5:.*]] = xetile.transpose %[[T4]], [1, 0] : vector<32x16xf16> -> vector<16x32xf16>
67+ // CHECK: xetile.prefetch_tile %[[T3]] : !xetile.tile<32x16xf16, #xetile.tile_attr<>>
6268
6369// -----
6470gpu.module @test_module {
@@ -272,10 +278,37 @@ gpu.module @test_module {
272278 }
273279}
274280
281+ // CHECK-LABEL: @test_multireduction_1
282+ // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: vector<64x256xf32>, %[[ARG1:[a-zA-Z0-9]+]]: vector<256xf32>) -> vector<256xf32>
283+ // CHECK: %[[T0:.*]] = xetile.reduction <add>, %[[ARG0]] [0] : vector<64x256xf32> -> vector<1x256xf32>
284+ // CHECK: %[[T1:.*]] = vector.shape_cast %[[T0]] : vector<1x256xf32> to vector<256xf32>
285+ // CHECK: %[[T2:.*]] = arith.addf %[[T1]], %[[ARG1]] : vector<256xf32>
286+ // CHECK: gpu.return %[[T2]] : vector<256xf32>
287+
275288// -----
// Test input: an <add> vector.multi_reduction over dimension 0 of a 64x256 i8
// vector, with %arg1 as the accumulator operand. The expected output that
// follows shows a xetile.reduction producing vector<1x256xi8>, a
// vector.shape_cast to vector<256xi8>, and an arith.addi with %arg1.
276289gpu.module @test_module {
277290 gpu.func @test_multireduction_2 (%arg0 : vector <64 x256 xi8 >, %arg1 : vector <256 xi8 >) -> vector <256 xi8 > {
278291 %0 = vector.multi_reduction <add >, %arg0 , %arg1 [0 ] : vector <64 x256 xi8 > to vector <256 xi8 >
279292 gpu.return %0 : vector <256 xi8 >
280293 }
281294}
295+
296+ // CHECK-LABEL: @test_multireduction_2
297+ // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: vector<64x256xi8>, %[[ARG1:[a-zA-Z0-9]+]]: vector<256xi8>) -> vector<256xi8>
298+ // CHECK: %[[T0:.*]] = xetile.reduction <add>, %[[ARG0]] [0] : vector<64x256xi8> -> vector<1x256xi8>
299+ // CHECK: %[[T1:.*]] = vector.shape_cast %[[T0]] : vector<1x256xi8> to vector<256xi8>
300+ // CHECK: %[[T2:.*]] = arith.addi %[[T1]], %[[ARG1]] : vector<256xi8>
301+ // CHECK: gpu.return %[[T2]] : vector<256xi8>
302+
303+ // -----
// Test input: a vector.transpose with permutation [1, 0] turning a 16x32 f32
// vector into a 32x16 one. The expected output that follows shows it
// rewritten to xetile.transpose with the same permutation and types.
304+ gpu.module @test_module {
305+ gpu.func @test_transpose_1 (%arg0 : vector <16 x32 xf32 >) -> vector <32 x16 xf32 > {
306+ %0 = vector.transpose %arg0 , [1 , 0 ] : vector <16 x32 xf32 > to vector <32 x16 xf32 >
307+ gpu.return %0 : vector <32 x16 xf32 >
308+ }
309+ }
310+
311+ // CHECK-LABEL: @test_transpose_1
312+ // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: vector<16x32xf32>) -> vector<32x16xf32>
313+ // CHECK: %[[T0:.*]] = xetile.transpose %[[ARG0]], [1, 0] : vector<16x32xf32> -> vector<32x16xf32>
314+ // CHECK: gpu.return %[[T0]] : vector<32x16xf32>
0 commit comments