Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 3rdparty/composable_kernel
234 changes: 117 additions & 117 deletions aiter/configs/a8w8_blockscale_tuned_gemm.csv

Large diffs are not rendered by default.

118 changes: 118 additions & 0 deletions aiter/configs/a8w8_blockscale_wpreshuffle_tuned_gemm.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
M,N,K,kernelId,splitK,us,kernelName
16,1536,7168,8,0,20.3733,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
16,3072,1536,8,0,7.2748,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
16,576,7168,8,0,19.6829,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
16,7168,256,7,0,3.808,a8w8_blockscale_wpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1
16,7168,2048,8,0,10.7236,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
16,4608,7168,8,0,20.9385,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
16,7168,2304,8,0,11.3236,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
16,512,7168,8,0,19.6857,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
16,4096,512,8,0,3.6836,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
32,1536,7168,8,0,20.6333,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
32,3072,1536,13,0,8.5716,a8w8_blockscale_wpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1
32,576,7168,8,0,19.6709,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
32,7168,256,13,0,4.62,a8w8_blockscale_wpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1
32,7168,2048,13,0,13.9641,a8w8_blockscale_wpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1
32,4608,7168,13,0,24.8581,a8w8_blockscale_wpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1
32,7168,2304,13,0,14.4225,a8w8_blockscale_wpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1
32,512,7168,8,0,19.5977,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
32,4096,512,8,0,5.0272,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
64,1536,7168,8,0,24.1181,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
64,3072,1536,8,0,10.6444,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
64,576,7168,8,0,19.6377,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
64,7168,256,7,0,7.3096,a8w8_blockscale_wpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1
64,7168,2048,13,0,18.8161,a8w8_blockscale_wpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1
64,4608,7168,18,0,35.849,a8w8_blockscale_wpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x2_intrawave_v1
64,7168,2304,13,0,19.6389,a8w8_blockscale_wpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1
64,512,7168,8,0,19.6893,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
64,4096,512,8,0,7.0608,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
128,1536,7168,13,0,33.8481,a8w8_blockscale_wpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1
128,3072,1536,13,0,14.1409,a8w8_blockscale_wpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1
128,576,7168,8,0,19.6941,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
128,7168,256,13,0,9.0728,a8w8_blockscale_wpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1
128,7168,2048,12,0,31.4937,a8w8_blockscale_wpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x2_intrawave_v1
128,4608,7168,18,0,61.6147,a8w8_blockscale_wpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x2_intrawave_v1
128,7168,2304,10,0,32.3685,a8w8_blockscale_wpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v1
128,512,7168,8,0,19.7121,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
128,4096,512,8,0,8.9284,a8w8_blockscale_wpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1
256,1536,7168,13,0,50.8314,a8w8_blockscale_wpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1
256,3072,1536,13,0,21.5577,a8w8_blockscale_wpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1
256,576,7168,13,0,24.1441,a8w8_blockscale_wpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1
256,7168,256,7,0,13.3049,a8w8_blockscale_wpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1
256,7168,2048,9,0,56.027,a8w8_blockscale_wpreshuffle_1x128x128_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v1
256,4608,7168,14,0,109.4776,a8w8_blockscale_wpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v1
256,7168,2304,15,0,56.3062,a8w8_blockscale_wpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v1
256,512,7168,13,0,23.8285,a8w8_blockscale_wpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1
256,4096,512,7,0,12.9837,a8w8_blockscale_wpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1
512,1536,7168,13,0,80.5739,a8w8_blockscale_wpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1
512,3072,1536,10,0,37.0022,a8w8_blockscale_wpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v1
512,576,7168,12,0,34.7126,a8w8_blockscale_wpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x2_intrawave_v1
512,7168,256,7,0,21.8937,a8w8_blockscale_wpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1
512,7168,2048,0,0,93.6516,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
512,4608,7168,0,0,179.194,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
512,7168,2304,0,0,87.4988,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
512,512,7168,18,0,35.4666,a8w8_blockscale_wpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x2_intrawave_v1
512,4096,512,13,0,20.5525,a8w8_blockscale_wpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1
1024,1536,7168,18,0,140.8366,a8w8_blockscale_wpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x2_intrawave_v1
1024,3072,1536,9,0,65.4555,a8w8_blockscale_wpreshuffle_1x128x128_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v1
1024,576,7168,12,0,55.5282,a8w8_blockscale_wpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x2_intrawave_v1
1024,7168,256,7,0,38.4886,a8w8_blockscale_wpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1
1024,7168,2048,0,0,170.8535,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
1024,4608,7168,0,0,361.3259,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
1024,7168,2304,0,0,161.2487,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
1024,512,7168,12,0,59.4271,a8w8_blockscale_wpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x2_intrawave_v1
1024,4096,512,12,0,34.851,a8w8_blockscale_wpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x2_intrawave_v1
1536,1536,7168,0,0,185.4252,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
1536,3072,1536,0,0,83.9684,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
1536,576,7168,12,0,82.9763,a8w8_blockscale_wpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x2_intrawave_v1
1536,7168,256,4,0,54.707,a8w8_blockscale_wpreshuffle_1x128x128_256x16x256x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1
1536,7168,2048,0,0,253.9371,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
1536,4608,7168,0,0,537.6935,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
1536,7168,2304,0,0,237.1646,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
1536,512,7168,13,0,81.02,a8w8_blockscale_wpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1
1536,4096,512,12,0,49.8538,a8w8_blockscale_wpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x2_intrawave_v1
2048,1536,7168,9,0,265.9643,a8w8_blockscale_wpreshuffle_1x128x128_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v1
2048,3072,1536,0,0,105.3044,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
2048,576,7168,12,0,105.8376,a8w8_blockscale_wpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x2_intrawave_v1
2048,7168,256,4,0,70.4467,a8w8_blockscale_wpreshuffle_1x128x128_256x16x256x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1
2048,7168,2048,0,0,331.0246,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
2048,4608,7168,0,0,705.4126,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
2048,7168,2304,0,0,312.3609,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
2048,512,7168,14,0,108.9081,a8w8_blockscale_wpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v1
2048,4096,512,12,0,63.5987,a8w8_blockscale_wpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x2_intrawave_v1
4096,1536,7168,0,0,457.5092,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
4096,3072,1536,0,0,198.6353,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
4096,576,7168,0,0,195.5016,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
4096,7168,256,4,0,133.113,a8w8_blockscale_wpreshuffle_1x128x128_256x16x256x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1
4096,7168,2048,0,0,623.5339,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
4096,4608,7168,0,0,1308.8472,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
4096,7168,2304,0,0,592.0273,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
4096,512,7168,0,0,180.0848,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
4096,4096,512,12,0,120.5713,a8w8_blockscale_wpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x2_intrawave_v1
8192,1536,7168,0,0,891.405,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
8192,3072,1536,0,0,387.0061,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
8192,576,7168,0,0,379.1368,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
8192,7168,256,4,0,258.6827,a8w8_blockscale_wpreshuffle_1x128x128_256x16x256x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1
8192,7168,2048,0,0,1208.9008,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
8192,4608,7168,0,0,2517.7124,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
8192,7168,2304,0,0,1154.5041,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
8192,512,7168,12,0,352.4495,a8w8_blockscale_wpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x2_intrawave_v1
8192,4096,512,12,0,233.8998,a8w8_blockscale_wpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x2_intrawave_v1
16384,1536,7168,0,0,1768.6828,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
16384,3072,1536,0,0,749.7908,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
16384,576,7168,0,0,743.4744,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
16384,7168,256,7,0,532.4307,a8w8_blockscale_wpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1
16384,7168,2048,0,0,2454.5874,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
16384,4608,7168,0,0,5013.9511,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
16384,7168,2304,0,0,2296.1438,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
16384,512,7168,0,0,649.6352,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
16384,4096,512,0,0,455.798,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
20480,1536,7168,0,0,2140.6012,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
20480,3072,1536,0,0,923.4112,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
20480,576,7168,0,0,926.0608,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
20480,7168,256,4,0,635.9159,a8w8_blockscale_wpreshuffle_1x128x128_256x16x256x128_8x16_16x16_16x16x1_8x32x1_1x16x1x16_8_1x2_intrawave_v1
20480,7168,2048,0,0,2992.9348,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
20480,4608,7168,0,0,6238.7055,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
20480,7168,2304,0,0,2895.9425,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
20480,512,7168,0,0,753.5264,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
20480,4096,512,0,0,564.9256,a8w8_blockscale_wpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x2_intrawave_v3
234 changes: 234 additions & 0 deletions aiter/configs/a8w8_blockscale_wpreshuffle_untuned_gemm.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
M,N,K
16, 1536, 7168

16, 3072, 1536

16, 576, 7168

16, 7168, 256

16, 7168, 2048

16, 4608, 7168

16, 7168, 2304

16, 512, 7168

16, 4096, 512

32, 1536, 7168

32, 3072, 1536

32, 576, 7168

32, 7168, 256

32, 7168, 2048

32, 4608, 7168

32, 7168, 2304

32, 512, 7168

32, 4096, 512

64, 1536, 7168

64, 3072, 1536

64, 576, 7168

64, 7168, 256

64, 7168, 2048

64, 4608, 7168

64, 7168, 2304

64, 512, 7168

64, 4096, 512

128, 1536, 7168

128, 3072, 1536

128, 576, 7168

128, 7168, 256

128, 7168, 2048

128, 4608, 7168

128, 7168, 2304

128, 512, 7168

128, 4096, 512

256, 1536, 7168

256, 3072, 1536

256, 576, 7168

256, 7168, 256

256, 7168, 2048

256, 4608, 7168

256, 7168, 2304

256, 512, 7168

256, 4096, 512

512, 1536, 7168

512, 3072, 1536

512, 576, 7168

512, 7168, 256

512, 7168, 2048

512, 4608, 7168

512, 7168, 2304

512, 512, 7168

512, 4096, 512

1024, 1536, 7168

1024, 3072, 1536

1024, 576, 7168

1024, 7168, 256

1024, 7168, 2048

1024, 4608, 7168

1024, 7168, 2304

1024, 512, 7168

1024, 4096, 512

1536, 1536, 7168

1536, 3072, 1536

1536, 576, 7168

1536, 7168, 256

1536, 7168, 2048

1536, 4608, 7168

1536, 7168, 2304

1536, 512, 7168

1536, 4096, 512

2048, 1536, 7168

2048, 3072, 1536

2048, 576, 7168

2048, 7168, 256

2048, 7168, 2048

2048, 4608, 7168

2048, 7168, 2304

2048, 512, 7168

2048, 4096, 512

4096, 1536, 7168

4096, 3072, 1536

4096, 576, 7168

4096, 7168, 256

4096, 7168, 2048

4096, 4608, 7168

4096, 7168, 2304

4096, 512, 7168

4096, 4096, 512

8192, 1536, 7168

8192, 3072, 1536

8192, 576, 7168

8192, 7168, 256

8192, 7168, 2048

8192, 4608, 7168

8192, 7168, 2304

8192, 512, 7168

8192, 4096, 512

16384, 1536, 7168

16384, 3072, 1536

16384, 576, 7168

16384, 7168, 256

16384, 7168, 2048

16384, 4608, 7168

16384, 7168, 2304

16384, 512, 7168

16384, 4096, 512

20480, 1536, 7168

20480, 3072, 1536

20480, 576, 7168

20480, 7168, 256

20480, 7168, 2048

20480, 4608, 7168

20480, 7168, 2304

20480, 512, 7168

20480, 4096, 512
30 changes: 30 additions & 0 deletions aiter/jit/optCompilerConfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,36 @@
"verbose": "False",
"blob_gen_cmd": "f'{AITER_CSRC_DIR}/ck_gemm_a8w8_blockscale/gen_instances.py --working_path {{}} --tune'"
},
"module_gemm_a8w8_blockscale_wpreshuffle": {
"srcs": [
"f'{AITER_CSRC_DIR}/ck_gemm_a8w8_blockscale_wpreshuffle/include'",
"f'{AITER_CSRC_DIR}/pybind/gemm_a8w8_blockscale_wpreshuffle_pybind.cu'",
"f'{AITER_CSRC_DIR}/ck_gemm_a8w8_blockscale_wpreshuffle/gemm_a8w8_blockscale_wpreshuffle.cu'"
],
"flags_extra_cc": [],
"flags_extra_hip": [],
"extra_ldflags": "None",
"extra_include": [],
"verbose": "False",
"is_python_module": "True",
"is_standalone": "False",
"blob_gen_cmd": "f'{AITER_CSRC_DIR}/ck_gemm_a8w8_blockscale_wpreshuffle/gen_instances.py --working_path {{}} --tune_file {AITER_CORE_DIR}/aiter/configs/a8w8_blockscale_wpreshuffle_tuned_gemm.csv'"
},
"module_gemm_a8w8_blockscale_wpreshuffle_tune": {
"srcs": [
"f'{AITER_CSRC_DIR}/pybind/gemm_a8w8_blockscale_wpreshuffle_tune_pybind.cu'",
"f'{AITER_CSRC_DIR}/ck_gemm_a8w8_blockscale_wpreshuffle/gemm_a8w8_blockscale_wpreshuffle_tune.cu'",
"f'{AITER_CSRC_DIR}/ck_gemm_a8w8_blockscale_wpreshuffle/include'"
],
"flags_extra_cc": [],
"flags_extra_hip": [],
"extra_ldflags": "None",
"extra_include": [],
"verbose": "False",
"is_python_module": "True",
"is_standalone": "False",
"blob_gen_cmd": "f'{AITER_CSRC_DIR}/ck_gemm_a8w8_blockscale_wpreshuffle/gen_instances.py --working_path {{}} --tune'"
},
"module_aiter_operator": {
"srcs": [
"f'{AITER_CSRC_DIR}/pybind/aiter_operator_pybind.cu'",
Expand Down
Loading