From 14516d53509822cbbc1f9ab0e82440318d8a74d1 Mon Sep 17 00:00:00 2001 From: Giorgos Mermigkis Date: Tue, 26 Mar 2024 15:16:24 +0200 Subject: [PATCH 1/9] first try to use NEON intrinsics for DCT, WIP --- dct_neon.c | 197 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 dct_neon.c diff --git a/dct_neon.c b/dct_neon.c new file mode 100644 index 0000000..8bd5173 --- /dev/null +++ b/dct_neon.c @@ -0,0 +1,197 @@ +#include +#include +#include +#include +#include +#include +#include + +//d contains the pixel values of a 4x4 block that needs to be transformed +//this function computes the dct on the input data (d) and stores the result +//into d +//REF: https://code.videolan.org/videolan/x264/-/blob/master/common/dct.c?ref_type=heads +static void dct4x4dc( int d[16] ) +{ + //hold the intermediate results + int tmp[16]; + + //iterate over each row of the 4x4 block (phase 1) + for( int i = 0; i < 4; i++ ) + { + + /* + printf("Row %d:\n", i); + printf("d[%d] = %d, d[%d] = %d\n", i * 4 + 0, d[i * 4 + 0], i * 4 + 1, d[i * 4 + 1]); + printf("d[%d] = %d, d[%d] = %d\n", i * 4 + 2, d[i * 4 + 2], i * 4 + 3, d[i * 4 + 3]); + */ + + int s01 = d[i*4+0] + d[i*4+1]; //sum of the 1st and 2nd elements in the row + int d01 = d[i*4+0] - d[i*4+1]; //diff between the 1st and the 2nd elements in the row + int s23 = d[i*4+2] + d[i*4+3]; //sum of the 3d and 4th elements in the row + int d23 = d[i*4+2] - d[i*4+3]; //diff between the 3d and the 4th elements in the row + + + tmp[0*4+i] = s01 + s23; //1st element of the row + tmp[1*4+i] = s01 - s23; //2nd element of the row + tmp[2*4+i] = d01 - d23; //3d element of the row + tmp[3*4+i] = d01 + d23; //4th element of the row + + printf("s01 = %d, s23 = %d\n", s01, s23); + printf("d01 = %d, d23 = %d\n", d01, d23); + printf("tmp[%d]=%d, tmp[%d]=%d, tmp[%d]=%d, tmp[%d]=%d\n", 0*4+i, tmp[0*4+i], 1*4+i, tmp[1*4+i],2*4+i, tmp[2*4+i], 3*4+i,tmp[3*4+i] ); + printf("\n"); + } + + + //iterates over each row of the 4x4 block (phase 2) + for( int i = 0; i < 4; i++ ) + { + int s01 = tmp[i*4+0] + tmp[i*4+1]; + int d01 = tmp[i*4+0] - tmp[i*4+1]; + int s23 = tmp[i*4+2] + tmp[i*4+3]; + int d23 = tmp[i*4+2] - tmp[i*4+3]; + + //The DCT coefficients are scaled by adding 1 and then right-shifting + //by 1 (equivalent to integer division by 2) for rounding. + d[i*4+0] = ( s01 + s23 + 1 ) >> 1; + d[i*4+1] = ( s01 - s23 + 1 ) >> 1; + d[i*4+2] = ( d01 - d23 + 1 ) >> 1; + d[i*4+3] = ( d01 + d23 + 1 ) >> 1; + + } +} + +// NEON version +static void dct4x4dc_neon( int *d ) +{ + //hold the intermediate results + int *tmp = (int *)aligned_alloc(16, 16 * sizeof(int)); + printf("\n"); + //phase 1 + for( int i = 0; i < 4; i++ ){ + + int32x4_t input_row = vld1q_s32(&d[i*4]); + + //s01, s23: s01+s23 -> tmp[0] and s01-s23 ->tmp[1] + int32x2_t sum_halves = vpadd_s32(vget_low_s32(input_row), vget_high_s32(input_row)); + tmp[0*4+i] = vaddvq_s32(input_row); //vget_lane_s32(sum_halves, 0)+vget_lane_s32(sum_halves, 1); + tmp[1*4+i] = vget_lane_s32(sum_halves, 0)-vget_lane_s32(sum_halves, 1); + printf("s01 = %d, s23 = %d\n", vget_lane_s32(sum_halves, 0), vget_lane_s32(sum_halves, 1)); + + //d01, d23: d01-d23 -> tmp[2] and d01+d23 -> tmp[3] + int32x2_t sub_halves = vsub_s32(vget_low_s32(input_row), vget_high_s32(input_row)); + tmp[2*4+i] = vget_lane_s32(sub_halves, 0)-vget_lane_s32(sub_halves, 1); + tmp[3*4+i] = vget_lane_s32(sub_halves, 0)+vget_lane_s32(sub_halves, 1); + printf("d01 = %d, d23 = %d\n", vget_lane_s32(sub_halves, 0), vget_lane_s32(sub_halves, 1)); + + printf("tmp[%d]=%d, tmp[%d]=%d, tmp[%d]=%d, tmp[%d]=%d\n", 0*4+i, tmp[0*4+i], 1*4+i, tmp[1*4+i],2*4+i, tmp[2*4+i], 3*4+i,tmp[3*4+i] ); + printf("\n"); + + + + } + + //iterates over each row of the 4x4 block (phase 2) + for( int i = 0; i < 4; i++ ) + { + int s01 = tmp[i*4+0] + tmp[i*4+1]; + int d01 = tmp[i*4+0] - tmp[i*4+1]; + int s23 = tmp[i*4+2] + tmp[i*4+3]; + int d23 = tmp[i*4+2] - tmp[i*4+3]; + + //The DCT coefficients are scaled by adding 1 and then right-shifting + //by 1 (equivalent to integer division by 2) for rounding. + d[i*4+0] = ( s01 + s23 + 1 ) >> 1; + d[i*4+1] = ( s01 - s23 + 1 ) >> 1; + d[i*4+2] = ( d01 - d23 + 1 ) >> 1; + d[i*4+3] = ( d01 + d23 + 1 ) >> 1; + + } +} + + +int main(int argc, char **argv) { + + //handle user's arguement + long int LOOPS = 1; + + if (argc == 2) { + char *endptr; + LOOPS = strtol(argv[1], &endptr, 10); + + // check for conversion errors + if (*endptr != '\0' || argv[1][0] == '0') { + fprintf(stderr, "Error: Invalid input\n"); + return EXIT_FAILURE; + } + } + + + //seed, times & memory allignment + srand(time(NULL)); + struct timeval tv1, tv2, tv3, tv4, diff1, diff2; + + + int *d = (int *)aligned_alloc(16, 16 * sizeof(int)); + int *dd = (int *)aligned_alloc(16, 16 * sizeof(int)); + if (d == NULL || dd == NULL) { + printf("Memory allocation failed\n"); + return 1; + } + + //initialize original matrix d + for (int i = 0; i < 16; i++) { + int random_value = rand() % 256; //integers between 0-255 + d[i] = random_value; + dd[i] = random_value; + } + + //call SCALAR function + gettimeofday(&tv1, NULL); + for (int loops = 0; loops < LOOPS; loops++) { + dct4x4dc(d); + } + gettimeofday(&tv2, NULL); + + + //print the transformed matrix + printf("Transformed Matrix (dct) from Scalar function:\n"); + for (int i = 0; i < 16; i++) { + printf("%3d ", d[i]); + if ((i + 1) % 4 == 0) + printf("\n"); + } + + printf("--------------------------------------\n"); + + //call NEON function + gettimeofday(&tv3, NULL); + for (int loops = 0; loops < LOOPS; loops++) { + dct4x4dc_neon(dd); + } + gettimeofday(&tv4, NULL); + + + //print the transformed matrix + printf("Transformed Matrix (dct) from NEON function:\n"); + for (int i = 0; i < 16; i++) { + printf("%3d ", dd[i]); + if ((i + 1) % 4 == 0) + printf("\n"); + } + + printf("\n"); + diff1.tv_sec = tv2.tv_sec - tv1.tv_sec; + diff1.tv_usec = tv2.tv_usec + (1000000 - tv1.tv_usec); + diff2.tv_sec = tv4.tv_sec - tv3.tv_sec; + diff2.tv_usec = tv4.tv_usec + (1000000 - tv3.tv_usec); + + printf("Scalar DCT: %ld sec, usec: %d\n", diff1.tv_sec, + diff1.tv_usec); + printf("NEON DCT: %ld sec, usec: %d\n", diff2.tv_sec, + diff2.tv_usec); + + free(d); + free(dd); + return 0; +} \ No newline at end of file From b862fdde093077722e051f2aed8bb69c4140ed4b Mon Sep 17 00:00:00 2001 From: Giorgos Mermigkis Date: Tue, 26 Mar 2024 18:59:13 +0200 Subject: [PATCH 2/9] correct results from the first loop, need to optimize the second loop --- dct_neon.c | 343 ++++++++++++++++++++++++++--------------------------- 1 file changed, 167 insertions(+), 176 deletions(-) diff --git a/dct_neon.c b/dct_neon.c index 8bd5173..f05da02 100644 --- a/dct_neon.c +++ b/dct_neon.c @@ -3,195 +3,186 @@ #include #include #include -#include #include +#include -//d contains the pixel values of a 4x4 block that needs to be transformed -//this function computes the dct on the input data (d) and stores the result -//into d -//REF: https://code.videolan.org/videolan/x264/-/blob/master/common/dct.c?ref_type=heads -static void dct4x4dc( int d[16] ) -{ - //hold the intermediate results - int tmp[16]; - - //iterate over each row of the 4x4 block (phase 1) - for( int i = 0; i < 4; i++ ) - { - - /* - printf("Row %d:\n", i); - printf("d[%d] = %d, d[%d] = %d\n", i * 4 + 0, d[i * 4 + 0], i * 4 + 1, d[i * 4 + 1]); - printf("d[%d] = %d, d[%d] = %d\n", i * 4 + 2, d[i * 4 + 2], i * 4 + 3, d[i * 4 + 3]); - */ - - int s01 = d[i*4+0] + d[i*4+1]; //sum of the 1st and 2nd elements in the row - int d01 = d[i*4+0] - d[i*4+1]; //diff between the 1st and the 2nd elements in the row - int s23 = d[i*4+2] + d[i*4+3]; //sum of the 3d and 4th elements in the row - int d23 = d[i*4+2] - d[i*4+3]; //diff between the 3d and the 4th elements in the row - - - tmp[0*4+i] = s01 + s23; //1st element of the row - tmp[1*4+i] = s01 - s23; //2nd element of the row - tmp[2*4+i] = d01 - d23; //3d element of the row - tmp[3*4+i] = d01 + d23; //4th element of the row - - printf("s01 = %d, s23 = %d\n", s01, s23); - printf("d01 = %d, d23 = %d\n", d01, d23); - printf("tmp[%d]=%d, tmp[%d]=%d, tmp[%d]=%d, tmp[%d]=%d\n", 0*4+i, tmp[0*4+i], 1*4+i, tmp[1*4+i],2*4+i, tmp[2*4+i], 3*4+i,tmp[3*4+i] ); - printf("\n"); - } - - - //iterates over each row of the 4x4 block (phase 2) - for( int i = 0; i < 4; i++ ) - { - int s01 = tmp[i*4+0] + tmp[i*4+1]; - int d01 = tmp[i*4+0] - tmp[i*4+1]; - int s23 = tmp[i*4+2] + tmp[i*4+3]; - int d23 = tmp[i*4+2] - tmp[i*4+3]; - - //The DCT coefficients are scaled by adding 1 and then right-shifting - //by 1 (equivalent to integer division by 2) for rounding. - d[i*4+0] = ( s01 + s23 + 1 ) >> 1; - d[i*4+1] = ( s01 - s23 + 1 ) >> 1; - d[i*4+2] = ( d01 - d23 + 1 ) >> 1; - d[i*4+3] = ( d01 + d23 + 1 ) >> 1; - - } +// d contains the pixel values of a 4x4 block that needs to be transformed +// this function computes the dct on the input data (d) and stores the result +// into d +// REF: +// https://code.videolan.org/videolan/x264/-/blob/master/common/dct.c?ref_type=heads +static void dct4x4dc(int d[16]) { + // hold the intermediate results + int tmp[16]; + + // iterate over each row of the 4x4 block (phase 1) + for (int i = 0; i < 4; i++) { + int s01 = d[i * 4 + 0] + + d[i * 4 + 1]; // sum of the 1st and 2nd elements in the row + int d01 = + d[i * 4 + 0] - + d[i * 4 + 1]; // diff between the 1st and the 2nd elements in the row + int s23 = d[i * 4 + 2] + + d[i * 4 + 3]; // sum of the 3d and 4th elements in the row + int d23 = + d[i * 4 + 2] - + d[i * 4 + 3]; // diff between the 3d and the 4th elements in the row + + tmp[0 * 4 + i] = s01 + s23; // 1st element of the row + tmp[1 * 4 + i] = s01 - s23; // 2nd element of the row + tmp[2 * 4 + i] = d01 - d23; // 3d element of the row + tmp[3 * 4 + i] = d01 + d23; // 4th element of the row + } + + // iterates over each row of the 4x4 block (phase 2) + for (int i = 0; i < 4; i++) { + + int s01 = tmp[i * 4 + 0] + tmp[i * 4 + 1]; + int d01 = tmp[i * 4 + 0] - tmp[i * 4 + 1]; + int s23 = tmp[i * 4 + 2] + tmp[i * 4 + 3]; + int d23 = tmp[i * 4 + 2] - tmp[i * 4 + 3]; + + // The DCT coefficients are scaled by adding 1 and then right-shifting + // by 1 (equivalent to integer division by 2) for rounding. + d[i * 4 + 0] = (s01 + s23 + 1) >> 1; + d[i * 4 + 1] = (s01 - s23 + 1) >> 1; + d[i * 4 + 2] = (d01 - d23 + 1) >> 1; + d[i * 4 + 3] = (d01 + d23 + 1) >> 1; + } } -// NEON version -static void dct4x4dc_neon( int *d ) -{ - //hold the intermediate results - int *tmp = (int *)aligned_alloc(16, 16 * sizeof(int)); - printf("\n"); - //phase 1 - for( int i = 0; i < 4; i++ ){ - - int32x4_t input_row = vld1q_s32(&d[i*4]); - - //s01, s23: s01+s23 -> tmp[0] and s01-s23 ->tmp[1] - int32x2_t sum_halves = vpadd_s32(vget_low_s32(input_row), vget_high_s32(input_row)); - tmp[0*4+i] = vaddvq_s32(input_row); //vget_lane_s32(sum_halves, 0)+vget_lane_s32(sum_halves, 1); - tmp[1*4+i] = vget_lane_s32(sum_halves, 0)-vget_lane_s32(sum_halves, 1); - printf("s01 = %d, s23 = %d\n", vget_lane_s32(sum_halves, 0), vget_lane_s32(sum_halves, 1)); - - //d01, d23: d01-d23 -> tmp[2] and d01+d23 -> tmp[3] - int32x2_t sub_halves = vsub_s32(vget_low_s32(input_row), vget_high_s32(input_row)); - tmp[2*4+i] = vget_lane_s32(sub_halves, 0)-vget_lane_s32(sub_halves, 1); - tmp[3*4+i] = vget_lane_s32(sub_halves, 0)+vget_lane_s32(sub_halves, 1); - printf("d01 = %d, d23 = %d\n", vget_lane_s32(sub_halves, 0), vget_lane_s32(sub_halves, 1)); - - printf("tmp[%d]=%d, tmp[%d]=%d, tmp[%d]=%d, tmp[%d]=%d\n", 0*4+i, tmp[0*4+i], 1*4+i, tmp[1*4+i],2*4+i, tmp[2*4+i], 3*4+i,tmp[3*4+i] ); - printf("\n"); - - - - } - - //iterates over each row of the 4x4 block (phase 2) - for( int i = 0; i < 4; i++ ) - { - int s01 = tmp[i*4+0] + tmp[i*4+1]; - int d01 = tmp[i*4+0] - tmp[i*4+1]; - int s23 = tmp[i*4+2] + tmp[i*4+3]; - int d23 = tmp[i*4+2] - tmp[i*4+3]; - - //The DCT coefficients are scaled by adding 1 and then right-shifting - //by 1 (equivalent to integer division by 2) for rounding. - d[i*4+0] = ( s01 + s23 + 1 ) >> 1; - d[i*4+1] = ( s01 - s23 + 1 ) >> 1; - d[i*4+2] = ( d01 - d23 + 1 ) >> 1; - d[i*4+3] = ( d01 + d23 + 1 ) >> 1; - - } +// NEON version +static void dct4x4dc_neon(int *d) { + // hold the intermediate results + int tmp[16]; + + // phase 1 + for (int i = 0; i < 4; i++) { + // Load the 4 elements of the current row into NEON registers + int32x4_t row = vld1q_s32(&d[i * 4]); + + // Compute sum of adjacent pairs (s01, s23) + int32x2_t sum_halves = vpadd_s32(vget_low_s32(row), vget_high_s32(row)); + tmp[0 * 4 + i] = vaddvq_s32(row); + tmp[1 * 4 + i] = + vget_lane_s32(sum_halves, 0) - vget_lane_s32(sum_halves, 1); + + // Compute differences between adjacent elements + int32x2_t d01 = vsub_s32(vget_low_s32(row), + vext_s32(vget_low_s32(row), vget_low_s32(row), 1)); + int32x2_t d23 = + vsub_s32(vget_high_s32(row), + vext_s32(vget_high_s32(row), vget_high_s32(row), 1)); + tmp[2 * 4 + i] = vget_lane_s32(d01, 0) - vget_lane_s32(d23, 0); + tmp[3 * 4 + i] = vget_lane_s32(d01, 0) + vget_lane_s32(d23, 0); + } + + // iterates over each row of the 4x4 block (phase 2) + for (int i = 0; i < 4; i++) { + int32x4_t tmp_row = vld1q_s32(&d[i * 4]); + + // contains s01, s23 + int32x2_t sum_halves = + vpadd_s32(vget_low_s32(tmp_row), vget_high_s32(tmp_row)); + // contains d01, d23 + int32x2_t sub_halves = + vsub_s32(vget_low_s32(tmp_row), vget_high_s32(tmp_row)); + + /* + int s01 = tmp[i*4+0] + tmp[i*4+1]; //tmp[0]+tmp[1] + int d01 = tmp[i*4+0] - tmp[i*4+1]; //tmp[0]-tmp[1] + int s23 = tmp[i*4+2] + tmp[i*4+3]; //tmp[2]+tmp[3] + int d23 = tmp[i*4+2] - tmp[i*4+3]; //tmp[2]-tmp[3] + */ + + // The DCT coefficients are scaled by adding 1 and then right-shifting + // by 1 (equivalent to integer division by 2) for rounding. + d[i * 4 + 0] = (s01 + s23 + 1) >> 1; + d[i * 4 + 1] = (s01 - s23 + 1) >> 1; + d[i * 4 + 2] = (d01 - d23 + 1) >> 1; + d[i * 4 + 3] = (d01 + d23 + 1) >> 1; + } } - int main(int argc, char **argv) { - //handle user's arguement - long int LOOPS = 1; + // handle user's arguement + long int LOOPS = 10000000000; - if (argc == 2) { - char *endptr; - LOOPS = strtol(argv[1], &endptr, 10); + if (argc == 2) { + char *endptr; + LOOPS = strtol(argv[1], &endptr, 10); - // check for conversion errors - if (*endptr != '\0' || argv[1][0] == '0') { - fprintf(stderr, "Error: Invalid input\n"); - return EXIT_FAILURE; - } + // check for conversion errors + if (*endptr != '\0' || argv[1][0] == '0') { + fprintf(stderr, "Error: Invalid input\n"); + return EXIT_FAILURE; } - - - //seed, times & memory allignment - srand(time(NULL)); - struct timeval tv1, tv2, tv3, tv4, diff1, diff2; - - - int *d = (int *)aligned_alloc(16, 16 * sizeof(int)); - int *dd = (int *)aligned_alloc(16, 16 * sizeof(int)); - if (d == NULL || dd == NULL) { - printf("Memory allocation failed\n"); - return 1; - } - - //initialize original matrix d + } + + // seed, times, arrays + srand(time(NULL)); + struct timeval tv1, tv2, tv3, tv4, diff1, diff2; + + int d[16]; + int *dd = NULL; + if (posix_memalign((void **)&dd, 16, 16 * sizeof(int)) != 0) { + perror("posix_memalign failed"); + exit(EXIT_FAILURE); + } + int random_value[16]; + + // initialize original matrix d + for (int i = 0; i < 16; i++) { + random_value[i] = rand() % 256; // integers between 0-255 + } + + // call SCALAR function + gettimeofday(&tv1, NULL); + for (int loops = 0; loops < LOOPS; loops++) { for (int i = 0; i < 16; i++) { - int random_value = rand() % 256; //integers between 0-255 - d[i] = random_value; - dd[i] = random_value; + d[i] = random_value[i]; } - - //call SCALAR function - gettimeofday(&tv1, NULL); - for (int loops = 0; loops < LOOPS; loops++) { - dct4x4dc(d); - } - gettimeofday(&tv2, NULL); - - - //print the transformed matrix - printf("Transformed Matrix (dct) from Scalar function:\n"); - for (int i = 0; i < 16; i++) { - printf("%3d ", d[i]); - if ((i + 1) % 4 == 0) - printf("\n"); - } - - printf("--------------------------------------\n"); - - //call NEON function - gettimeofday(&tv3, NULL); - for (int loops = 0; loops < LOOPS; loops++) { - dct4x4dc_neon(dd); - } - gettimeofday(&tv4, NULL); - - - //print the transformed matrix - printf("Transformed Matrix (dct) from NEON function:\n"); + dct4x4dc(d); + } + gettimeofday(&tv2, NULL); + + // print the transformed matrix + printf("Transformed Matrix (dct) from Scalar function:\n"); + for (int i = 0; i < 16; i++) { + printf("%3d ", d[i]); + if ((i + 1) % 4 == 0) + printf("\n"); + } + + printf("--------------------------------------\n"); + + // call NEON function + gettimeofday(&tv3, NULL); + for (int loops = 0; loops < LOOPS; loops++) { for (int i = 0; i < 16; i++) { - printf("%3d ", dd[i]); - if ((i + 1) % 4 == 0) - printf("\n"); + dd[i] = random_value[i]; } - - printf("\n"); - diff1.tv_sec = tv2.tv_sec - tv1.tv_sec; - diff1.tv_usec = tv2.tv_usec + (1000000 - tv1.tv_usec); - diff2.tv_sec = tv4.tv_sec - tv3.tv_sec; - diff2.tv_usec = tv4.tv_usec + (1000000 - tv3.tv_usec); - - printf("Scalar DCT: %ld sec, usec: %d\n", diff1.tv_sec, - diff1.tv_usec); - printf("NEON DCT: %ld sec, usec: %d\n", diff2.tv_sec, - diff2.tv_usec); - - free(d); - free(dd); - return 0; + dct4x4dc_neon(dd); + } + gettimeofday(&tv4, NULL); + + // print the transformed matrix + printf("Transformed Matrix (dct) from NEON function:\n"); + for (int i = 0; i < 16; i++) { + printf("%3d ", dd[i]); + if ((i + 1) % 4 == 0) + printf("\n"); + } + + printf("\n"); + diff1.tv_sec = tv2.tv_sec - tv1.tv_sec; + diff1.tv_usec = tv2.tv_usec + (1000000 - tv1.tv_usec); + diff2.tv_sec = tv4.tv_sec - tv3.tv_sec; + diff2.tv_usec = tv4.tv_usec + (1000000 - tv3.tv_usec); + + printf("Scalar DCT: %ld sec, usec: %d\n", diff1.tv_sec, diff1.tv_usec); + printf("NEON DCT: %ld sec, usec: %d\n", diff2.tv_sec, diff2.tv_usec); + + return 0; } \ No newline at end of file From 3c2be5522092eeaa5768fff52affb209c06f9440 Mon Sep 17 00:00:00 2001 From: Giorgos Mermigkis Date: Tue, 26 Mar 2024 20:40:08 +0200 Subject: [PATCH 3/9] results are correct, added loops that are given by the user, timing is bad --- dct_neon.c | 82 +++++++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 44 deletions(-) diff --git a/dct_neon.c b/dct_neon.c index f05da02..83e4e1b 100644 --- a/dct_neon.c +++ b/dct_neon.c @@ -11,22 +11,16 @@ // into d // REF: // https://code.videolan.org/videolan/x264/-/blob/master/common/dct.c?ref_type=heads -static void dct4x4dc(int d[16]) { +static void dct4x4dc_c(int d[16]) { // hold the intermediate results int tmp[16]; // iterate over each row of the 4x4 block (phase 1) for (int i = 0; i < 4; i++) { - int s01 = d[i * 4 + 0] + - d[i * 4 + 1]; // sum of the 1st and 2nd elements in the row - int d01 = - d[i * 4 + 0] - - d[i * 4 + 1]; // diff between the 1st and the 2nd elements in the row - int s23 = d[i * 4 + 2] + - d[i * 4 + 3]; // sum of the 3d and 4th elements in the row - int d23 = - d[i * 4 + 2] - - d[i * 4 + 3]; // diff between the 3d and the 4th elements in the row + int s01 = d[i * 4 + 0] + d[i * 4 + 1]; // sum of the 1st and 2nd elements in the row + int d01 = d[i * 4 + 0] - d[i * 4 + 1]; // diff between the 1st and the 2nd elements in the row + int s23 = d[i * 4 + 2] + d[i * 4 + 3]; // sum of the 3d and 4th elements in the row + int d23 = d[i * 4 + 2] - d[i * 4 + 3]; // diff between the 3d and the 4th elements in the row tmp[0 * 4 + i] = s01 + s23; // 1st element of the row tmp[1 * 4 + i] = s01 - s23; // 2nd element of the row @@ -61,46 +55,46 @@ static void dct4x4dc_neon(int *d) { // Load the 4 elements of the current row into NEON registers int32x4_t row = vld1q_s32(&d[i * 4]); - // Compute sum of adjacent pairs (s01, s23) - int32x2_t sum_halves = vpadd_s32(vget_low_s32(row), vget_high_s32(row)); + //s01+s23 = d1+d2+d3+d4 tmp[0 * 4 + i] = vaddvq_s32(row); - tmp[1 * 4 + i] = - vget_lane_s32(sum_halves, 0) - vget_lane_s32(sum_halves, 1); - - // Compute differences between adjacent elements - int32x2_t d01 = vsub_s32(vget_low_s32(row), - vext_s32(vget_low_s32(row), vget_low_s32(row), 1)); - int32x2_t d23 = - vsub_s32(vget_high_s32(row), - vext_s32(vget_high_s32(row), vget_high_s32(row), 1)); + + //first element is s01, second element is s23 and then I perform s01-s23 + int32x2_t sum_halves = vpadd_s32(vget_low_s32(row), vget_high_s32(row)); + tmp[1 * 4 + i] = vget_lane_s32(sum_halves, 0) - vget_lane_s32(sum_halves, 1); + + //Extracts the low half of the elements from the row, which is [d1,d2] + //extends the low half of the row, shifting the elements by 1 position to the right, so it becomes [d2, 0] + //Subtracts the extended low half [d2, 0] from the original low half [d1, d2], element-wise + //d01 = [d1 - d2, d2 - 0] = [d1 - d2, d2] + //same for d23 + + int32x2_t d01 = vsub_s32(vget_low_s32(row), vext_s32(vget_low_s32(row), vget_low_s32(row), 1)); + int32x2_t d23 = vsub_s32(vget_high_s32(row),vext_s32(vget_high_s32(row), vget_high_s32(row), 1)); tmp[2 * 4 + i] = vget_lane_s32(d01, 0) - vget_lane_s32(d23, 0); tmp[3 * 4 + i] = vget_lane_s32(d01, 0) + vget_lane_s32(d23, 0); } - // iterates over each row of the 4x4 block (phase 2) + // phase 2 for (int i = 0; i < 4; i++) { - int32x4_t tmp_row = vld1q_s32(&d[i * 4]); - // contains s01, s23 - int32x2_t sum_halves = - vpadd_s32(vget_low_s32(tmp_row), vget_high_s32(tmp_row)); - // contains d01, d23 - int32x2_t sub_halves = - vsub_s32(vget_low_s32(tmp_row), vget_high_s32(tmp_row)); - - /* - int s01 = tmp[i*4+0] + tmp[i*4+1]; //tmp[0]+tmp[1] - int d01 = tmp[i*4+0] - tmp[i*4+1]; //tmp[0]-tmp[1] - int s23 = tmp[i*4+2] + tmp[i*4+3]; //tmp[2]+tmp[3] - int d23 = tmp[i*4+2] - tmp[i*4+3]; //tmp[2]-tmp[3] - */ + //follow the same logic as phase 1 + int32x4_t tmp_row = vld1q_s32(&tmp[i * 4]); + int32x4_t d_vector = vdupq_n_s32(0); + int32x4_t one_vector = vdupq_n_s32(1); - // The DCT coefficients are scaled by adding 1 and then right-shifting - // by 1 (equivalent to integer division by 2) for rounding. - d[i * 4 + 0] = (s01 + s23 + 1) >> 1; - d[i * 4 + 1] = (s01 - s23 + 1) >> 1; - d[i * 4 + 2] = (d01 - d23 + 1) >> 1; - d[i * 4 + 3] = (d01 + d23 + 1) >> 1; + // contains s01, s23 + int32x2_t sum_halves = vpadd_s32(vget_low_s32(tmp_row), vget_high_s32(tmp_row)); + d_vector = vsetq_lane_s32(vaddvq_s32(tmp_row), d_vector, 0); + d_vector = vsetq_lane_s32(vget_lane_s32(sum_halves, 0) - vget_lane_s32(sum_halves, 1), d_vector, 1); + + int32x2_t d01 = vsub_s32(vget_low_s32(tmp_row), vext_s32(vget_low_s32(tmp_row), vget_low_s32(tmp_row), 1)); + int32x2_t d23 = vsub_s32(vget_high_s32(tmp_row),vext_s32(vget_high_s32(tmp_row), vget_high_s32(tmp_row), 1)); + d_vector = vsetq_lane_s32(vget_lane_s32(d01, 0) - vget_lane_s32(d23, 0), d_vector, 2); + d_vector = vsetq_lane_s32(vget_lane_s32(d01, 0) + vget_lane_s32(d23, 0), d_vector, 3); + + //add 1 to each element and devide by 2 with the use of shifting right + d_vector=vshrq_n_s32(vaddq_s32(d_vector, one_vector),1); + vst1q_s32(&d[i * 4], d_vector); } } @@ -143,7 +137,7 @@ int main(int argc, char **argv) { for (int i = 0; i < 16; i++) { d[i] = random_value[i]; } - dct4x4dc(d); + dct4x4dc_c(d); } gettimeofday(&tv2, NULL); From 6c75b4946a839a68dbc41127b93019fcf0d11668 Mon Sep 17 00:00:00 2001 From: Giorgos Mermigkis Date: Wed, 27 Mar 2024 13:53:52 +0200 Subject: [PATCH 4/9] adding elements in vertical way, achieved speedup from the last commit, not ready yet --- dct_neon.c | 87 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 31 deletions(-) diff --git a/dct_neon.c b/dct_neon.c index 83e4e1b..97972e7 100644 --- a/dct_neon.c +++ b/dct_neon.c @@ -21,7 +21,7 @@ static void dct4x4dc_c(int d[16]) { int d01 = d[i * 4 + 0] - d[i * 4 + 1]; // diff between the 1st and the 2nd elements in the row int s23 = d[i * 4 + 2] + d[i * 4 + 3]; // sum of the 3d and 4th elements in the row int d23 = d[i * 4 + 2] - d[i * 4 + 3]; // diff between the 3d and the 4th elements in the row - + //printf("s01: %d, s23: %d, d01: %d, d23: %d\n", s01, s23, d01, d23); tmp[0 * 4 + i] = s01 + s23; // 1st element of the row tmp[1 * 4 + i] = s01 - s23; // 2nd element of the row tmp[2 * 4 + i] = d01 - d23; // 3d element of the row @@ -45,54 +45,79 @@ static void dct4x4dc_c(int d[16]) { } } +void print_int32x4(const char* label, int32x4_t vector) { + int32_t data[4]; + vst1q_s32(data, vector); + printf("%s: [%d %d %d %d]\n", label, data[0], data[1], data[2], data[3]); +} + // NEON version static void dct4x4dc_neon(int *d) { // hold the intermediate results int tmp[16]; + int32x4_t one_vector = vdupq_n_s32(1); - // phase 1 + // iterate over each row of the 4x4 block (phase 1) for (int i = 0; i < 4; i++) { - // Load the 4 elements of the current row into NEON registers - int32x4_t row = vld1q_s32(&d[i * 4]); - //s01+s23 = d1+d2+d3+d4 - tmp[0 * 4 + i] = vaddvq_s32(row); + //[a b c d] + int32x4_t row1 = vld1q_s32(&d[i * 4]); + + //[b c d a] by shuffling the first row + int32x4_t shuffled_row = vextq_s32(row1, row1, 1); + + //[s01 rand s23 rand] + int32x4_t result_add = vaddq_s32(row1, shuffled_row); + //[s23 rand s01 rand] + int32x4_t result_add_shuff = vextq_s32(result_add, result_add, 2); + - //first element is s01, second element is s23 and then I perform s01-s23 - int32x2_t sum_halves = vpadd_s32(vget_low_s32(row), vget_high_s32(row)); - tmp[1 * 4 + i] = vget_lane_s32(sum_halves, 0) - vget_lane_s32(sum_halves, 1); + //[d01 rand d23 rand] + int32x4_t result_sub = vsubq_s32(row1, shuffled_row); + //[d23 rand d01 rand] + int32x4_t result_sub_shuff = vextq_s32(result_sub, result_sub, 2); - //Extracts the low half of the elements from the row, which is [d1,d2] - //extends the low half of the row, shifting the elements by 1 position to the right, so it becomes [d2, 0] - //Subtracts the extended low half [d2, 0] from the original low half [d1, d2], element-wise - //d01 = [d1 - d2, d2 - 0] = [d1 - d2, d2] - //same for d23 + int32x4_t tmp0 =vaddq_s32(result_add, result_add_shuff); + int32x4_t tmp1 =vsubq_s32(result_add, result_add_shuff); + int32x4_t tmp2 =vsubq_s32(result_sub, result_sub_shuff); + int32x4_t tmp3 =vaddq_s32(result_sub, result_sub_shuff); - int32x2_t d01 = vsub_s32(vget_low_s32(row), vext_s32(vget_low_s32(row), vget_low_s32(row), 1)); - int32x2_t d23 = vsub_s32(vget_high_s32(row),vext_s32(vget_high_s32(row), vget_high_s32(row), 1)); - tmp[2 * 4 + i] = vget_lane_s32(d01, 0) - vget_lane_s32(d23, 0); - tmp[3 * 4 + i] = vget_lane_s32(d01, 0) + vget_lane_s32(d23, 0); + tmp[0 * 4 + i] = vgetq_lane_s32(tmp0, 0); + tmp[1 * 4 + i] = vgetq_lane_s32(tmp1, 0); + tmp[2 * 4 + i] = vgetq_lane_s32(tmp2, 0); + tmp[3 * 4 + i] = vgetq_lane_s32(tmp3, 0); + } - // phase 2 + // iterates over each row of the 4x4 block (phase 2) for (int i = 0; i < 4; i++) { - //follow the same logic as phase 1 - int32x4_t tmp_row = vld1q_s32(&tmp[i * 4]); int32x4_t d_vector = vdupq_n_s32(0); - int32x4_t one_vector = vdupq_n_s32(1); + + //[tmp0 tmp1 tmp2 tmp3] + int32x4_t row1 = vld1q_s32(&tmp[i * 4]); - // contains s01, s23 - int32x2_t sum_halves = vpadd_s32(vget_low_s32(tmp_row), vget_high_s32(tmp_row)); - d_vector = vsetq_lane_s32(vaddvq_s32(tmp_row), d_vector, 0); - d_vector = vsetq_lane_s32(vget_lane_s32(sum_halves, 0) - vget_lane_s32(sum_halves, 1), d_vector, 1); + //[tmp1 tmp2 tmp3 tmp0] by shuffling the first row + int32x4_t shuffled_row = vextq_s32(row1, row1, 1); - int32x2_t d01 = vsub_s32(vget_low_s32(tmp_row), vext_s32(vget_low_s32(tmp_row), vget_low_s32(tmp_row), 1)); - int32x2_t d23 = vsub_s32(vget_high_s32(tmp_row),vext_s32(vget_high_s32(tmp_row), vget_high_s32(tmp_row), 1)); - d_vector = vsetq_lane_s32(vget_lane_s32(d01, 0) - vget_lane_s32(d23, 0), d_vector, 2); - d_vector = vsetq_lane_s32(vget_lane_s32(d01, 0) + vget_lane_s32(d23, 0), d_vector, 3); + //element 1 = s01, element 3 = s23 + int32x4_t result_add = vaddq_s32(row1, shuffled_row); + int32x4_t result_add_shuff = vextq_s32(result_add, result_add, 2); + + //element 1 = d01, element 3 = d23 + int32x4_t result_sub = vsubq_s32(row1, shuffled_row); + int32x4_t result_sub_shuff = vextq_s32(result_sub, result_sub, 2); + + int32x4_t tmp0 =vaddq_s32(result_add, result_add_shuff); + int32x4_t tmp1 =vsubq_s32(result_add, result_add_shuff); + int32x4_t tmp2 =vsubq_s32(result_sub, result_sub_shuff); + int32x4_t tmp3 =vaddq_s32(result_sub, result_sub_shuff); + + d_vector= vsetq_lane_s32(vgetq_lane_s32(tmp0, 0),d_vector,0); + d_vector=vsetq_lane_s32(vgetq_lane_s32(tmp1, 0),d_vector,1); + d_vector=vsetq_lane_s32(vgetq_lane_s32(tmp2, 0),d_vector,2); + d_vector=vsetq_lane_s32(vgetq_lane_s32(tmp3, 0),d_vector,3); - //add 1 to each element and devide by 2 with the use of shifting right d_vector=vshrq_n_s32(vaddq_s32(d_vector, one_vector),1); vst1q_s32(&d[i * 4], d_vector); } From 6653146422f0a153b4e91d737b8481a0226c6ee2 Mon Sep 17 00:00:00 2001 From: Giorgos Mermigkis Date: Wed, 27 Mar 2024 22:42:54 +0200 Subject: [PATCH 5/9] changed my whole thinking --- dct_neon.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/dct_neon.c b/dct_neon.c index 97972e7..608660c 100644 --- a/dct_neon.c +++ b/dct_neon.c @@ -26,22 +26,25 @@ static void dct4x4dc_c(int d[16]) { tmp[1 * 4 + i] = s01 - s23; // 2nd element of the row tmp[2 * 4 + i] = d01 - d23; // 3d element of the row tmp[3 * 4 + i] = d01 + d23; // 4th element of the row + //printf("tmp[0]: %d, tmp[1]: %d, tmp[2]: %d, tmp[3]: %d\n", tmp[0 * 4 + i], tmp[1 * 4 + i], tmp[2 * 4 + i], tmp[3 * 4 + i]); + } // iterates over each row of the 4x4 block (phase 2) for (int i = 0; i < 4; i++) { - + int s01 = tmp[i * 4 + 0] + tmp[i * 4 + 1]; int d01 = tmp[i * 4 + 0] - tmp[i * 4 + 1]; int s23 = tmp[i * 4 + 2] + tmp[i * 4 + 3]; int d23 = tmp[i * 4 + 2] - tmp[i * 4 + 3]; - + //printf("AFTER: s01: %d, s23: %d, d01: %d, d23: %d\n", s01, s23, d01, d23); // The DCT coefficients are scaled by adding 1 and then right-shifting // by 1 (equivalent to integer division by 2) for rounding. d[i * 4 + 0] = (s01 + s23 + 1) >> 1; d[i * 4 + 1] = (s01 - s23 + 1) >> 1; d[i * 4 + 2] = (d01 - d23 + 1) >> 1; d[i * 4 + 3] = (d01 + d23 + 1) >> 1; + } } @@ -52,7 +55,7 @@ void print_int32x4(const char* label, int32x4_t vector) { } // NEON version -static void dct4x4dc_neon(int *d) { +static void dct4x4dc_neon_real(int *d) { // hold the intermediate results int tmp[16]; int32x4_t one_vector = vdupq_n_s32(1); @@ -123,6 +126,65 @@ static void dct4x4dc_neon(int *d) { } } + +static void dct4x4dc_neon(int *d) { + int32x4x4_t input; + + input = vld4q_s32(d); + + int32x4_t result_add_s01 = vaddq_s32(input.val[0], input.val[1]); + int32x4_t result_add_s23 = vaddq_s32(input.val[2], input.val[3]); + int32x4_t result_sub_d01 = vsubq_s32(input.val[0], input.val[1]); + int32x4_t result_sub_d23 = vsubq_s32(input.val[2], input.val[3]); + + input.val[0] = vaddq_s32(result_add_s01, result_add_s23); + input.val[1] = vsubq_s32(result_add_s01, result_add_s23); + input.val[2] = vsubq_s32(result_sub_d01, result_sub_d23); + input.val[3] = vaddq_s32(result_sub_d01, result_sub_d23); + + int32x4x4_t input_transposed; + + input_transposed.val[0] = vsetq_lane_s32(vgetq_lane_s32(input.val[0], 0), input_transposed.val[0], 0); + input_transposed.val[0] = vsetq_lane_s32(vgetq_lane_s32(input.val[1], 0), input_transposed.val[0], 1); + input_transposed.val[0] = vsetq_lane_s32(vgetq_lane_s32(input.val[2], 0), input_transposed.val[0], 2); + input_transposed.val[0] = vsetq_lane_s32(vgetq_lane_s32(input.val[3], 0), input_transposed.val[0], 3); + + input_transposed.val[1] = vsetq_lane_s32(vgetq_lane_s32(input.val[0], 1), input_transposed.val[1], 0); + input_transposed.val[1] = vsetq_lane_s32(vgetq_lane_s32(input.val[1], 1), input_transposed.val[1], 1); + input_transposed.val[1] = vsetq_lane_s32(vgetq_lane_s32(input.val[2], 1), input_transposed.val[1], 2); + input_transposed.val[1] = vsetq_lane_s32(vgetq_lane_s32(input.val[3], 1), input_transposed.val[1], 3); + + input_transposed.val[2] = vsetq_lane_s32(vgetq_lane_s32(input.val[0], 2), input_transposed.val[2], 0); + input_transposed.val[2] = vsetq_lane_s32(vgetq_lane_s32(input.val[1], 2), input_transposed.val[2], 1); + input_transposed.val[2] = vsetq_lane_s32(vgetq_lane_s32(input.val[2], 2), input_transposed.val[2], 2); + input_transposed.val[2] = vsetq_lane_s32(vgetq_lane_s32(input.val[3], 2), input_transposed.val[2], 3); + + input_transposed.val[3] = vsetq_lane_s32(vgetq_lane_s32(input.val[0], 3), input_transposed.val[3], 0); + input_transposed.val[3] = vsetq_lane_s32(vgetq_lane_s32(input.val[1], 3), input_transposed.val[3], 1); + input_transposed.val[3] = vsetq_lane_s32(vgetq_lane_s32(input.val[2], 3), input_transposed.val[3], 2); + input_transposed.val[3] = vsetq_lane_s32(vgetq_lane_s32(input.val[3], 3), input_transposed.val[3], 3); + + int32x4_t result_add_s01_after = vaddq_s32(input_transposed.val[0], input_transposed.val[1]); + int32x4_t result_add_s23_after = vaddq_s32(input_transposed.val[2], input_transposed.val[3]); + int32x4_t result_sub_d01_after = vsubq_s32(input_transposed.val[0], input_transposed.val[1]); + int32x4_t result_sub_d23_after = vsubq_s32(input_transposed.val[2], input_transposed.val[3]); + + int32x4_t result_add_all_tmp0_after = vaddq_s32(result_add_s01_after, result_add_s23_after); + int32x4_t result_sub_all_tmp1_after = vsubq_s32(result_add_s01_after, result_add_s23_after); + int32x4_t result_sub_all_tmp2_after = vsubq_s32(result_sub_d01_after, result_sub_d23_after); + int32x4_t result_add_all_tmp3_after = vaddq_s32(result_sub_d01_after, result_sub_d23_after); + + int32x4_t one_vector = vdupq_n_s32(1); + input.val[0] = vshrq_n_s32(vaddq_s32(result_add_all_tmp0_after, one_vector), 1); + input.val[1] = vshrq_n_s32(vaddq_s32(result_sub_all_tmp1_after, one_vector), 1); + input.val[2] = vshrq_n_s32(vaddq_s32(result_sub_all_tmp2_after, one_vector), 1); + input.val[3] = vshrq_n_s32(vaddq_s32(result_add_all_tmp3_after, one_vector), 1); + + vst4q_s32(d, input); +} + + + int main(int argc, char **argv) { // handle user's arguement From 7e221401d7a1b8fa909fa909718d636447f7e799 Mon Sep 17 00:00:00 2001 From: Giorgos Mermigkis Date: Thu, 28 Mar 2024 15:06:38 +0200 Subject: [PATCH 6/9] changed the way I tranpose the input --- dct_neon.c | 153 ++++++++++++++++------------------------------------- 1 file changed, 46 insertions(+), 107 deletions(-) diff --git a/dct_neon.c b/dct_neon.c index 608660c..79a1086 100644 --- a/dct_neon.c +++ b/dct_neon.c @@ -54,132 +54,71 @@ void print_int32x4(const char* label, int32x4_t vector) { printf("%s: [%d %d %d %d]\n", label, data[0], data[1], data[2], data[3]); } -// NEON version -static void dct4x4dc_neon_real(int *d) { - // hold the intermediate results - int tmp[16]; - int32x4_t one_vector = vdupq_n_s32(1); - - // iterate over each row of the 4x4 block (phase 1) - for (int i = 0; i < 4; i++) { - - //[a b c d] - int32x4_t row1 = vld1q_s32(&d[i * 4]); - - //[b c d a] by shuffling the first row - int32x4_t shuffled_row = vextq_s32(row1, row1, 1); - - //[s01 rand s23 rand] - int32x4_t result_add = vaddq_s32(row1, shuffled_row); - //[s23 rand s01 rand] - int32x4_t result_add_shuff = vextq_s32(result_add, result_add, 2); - - - //[d01 rand d23 rand] - int32x4_t result_sub = vsubq_s32(row1, shuffled_row); - //[d23 rand d01 rand] - int32x4_t result_sub_shuff = vextq_s32(result_sub, result_sub, 2); - - int32x4_t tmp0 =vaddq_s32(result_add, result_add_shuff); - int32x4_t tmp1 =vsubq_s32(result_add, result_add_shuff); - int32x4_t tmp2 =vsubq_s32(result_sub, result_sub_shuff); - int32x4_t tmp3 =vaddq_s32(result_sub, result_sub_shuff); - - tmp[0 * 4 + i] = vgetq_lane_s32(tmp0, 0); - tmp[1 * 4 + i] = vgetq_lane_s32(tmp1, 0); - tmp[2 * 4 + i] = vgetq_lane_s32(tmp2, 0); - tmp[3 * 4 + i] = vgetq_lane_s32(tmp3, 0); - - } - - // iterates over each row of the 4x4 block (phase 2) - for (int i = 0; i < 4; i++) { - - int32x4_t d_vector = vdupq_n_s32(0); - - //[tmp0 tmp1 tmp2 tmp3] - int32x4_t row1 = vld1q_s32(&tmp[i * 4]); - - //[tmp1 tmp2 tmp3 tmp0] by shuffling the first row - int32x4_t shuffled_row = vextq_s32(row1, row1, 1); - - //element 1 = s01, element 3 = s23 - int32x4_t result_add = vaddq_s32(row1, shuffled_row); - int32x4_t result_add_shuff = vextq_s32(result_add, result_add, 2); - - //element 1 = d01, element 3 = d23 - int32x4_t result_sub = vsubq_s32(row1, shuffled_row); - int32x4_t result_sub_shuff = vextq_s32(result_sub, result_sub, 2); - - int32x4_t tmp0 =vaddq_s32(result_add, result_add_shuff); - int32x4_t tmp1 =vsubq_s32(result_add, result_add_shuff); - int32x4_t tmp2 =vsubq_s32(result_sub, result_sub_shuff); - int32x4_t tmp3 =vaddq_s32(result_sub, result_sub_shuff); - - d_vector= vsetq_lane_s32(vgetq_lane_s32(tmp0, 0),d_vector,0); - d_vector=vsetq_lane_s32(vgetq_lane_s32(tmp1, 0),d_vector,1); - d_vector=vsetq_lane_s32(vgetq_lane_s32(tmp2, 0),d_vector,2); - d_vector=vsetq_lane_s32(vgetq_lane_s32(tmp3, 0),d_vector,3); - - d_vector=vshrq_n_s32(vaddq_s32(d_vector, one_vector),1); - vst1q_s32(&d[i * 4], d_vector); - } -} static void dct4x4dc_neon(int *d) { int32x4x4_t input; + //PHASE 1--------------------------------------------------------------------------- input = vld4q_s32(d); + //all s01 int32x4_t result_add_s01 = vaddq_s32(input.val[0], input.val[1]); + //all s23 int32x4_t result_add_s23 = vaddq_s32(input.val[2], input.val[3]); + //all d01 int32x4_t result_sub_d01 = vsubq_s32(input.val[0], input.val[1]); + //all d23 int32x4_t result_sub_d23 = vsubq_s32(input.val[2], input.val[3]); + //s01+s23 all input.val[0] = vaddq_s32(result_add_s01, result_add_s23); + //s01-s23 all input.val[1] = vsubq_s32(result_add_s01, result_add_s23); + //d01-d23 all input.val[2] = vsubq_s32(result_sub_d01, result_sub_d23); + //d01+d23 all input.val[3] = vaddq_s32(result_sub_d01, result_sub_d23); - int32x4x4_t input_transposed; - - input_transposed.val[0] = vsetq_lane_s32(vgetq_lane_s32(input.val[0], 0), input_transposed.val[0], 0); - input_transposed.val[0] = vsetq_lane_s32(vgetq_lane_s32(input.val[1], 0), input_transposed.val[0], 1); - input_transposed.val[0] = vsetq_lane_s32(vgetq_lane_s32(input.val[2], 0), input_transposed.val[0], 2); - input_transposed.val[0] = vsetq_lane_s32(vgetq_lane_s32(input.val[3], 0), input_transposed.val[0], 3); - - input_transposed.val[1] = vsetq_lane_s32(vgetq_lane_s32(input.val[0], 1), input_transposed.val[1], 0); - input_transposed.val[1] = vsetq_lane_s32(vgetq_lane_s32(input.val[1], 1), input_transposed.val[1], 1); - input_transposed.val[1] = vsetq_lane_s32(vgetq_lane_s32(input.val[2], 1), input_transposed.val[1], 2); - input_transposed.val[1] = vsetq_lane_s32(vgetq_lane_s32(input.val[3], 1), input_transposed.val[1], 3); - - input_transposed.val[2] = vsetq_lane_s32(vgetq_lane_s32(input.val[0], 2), input_transposed.val[2], 0); - input_transposed.val[2] = vsetq_lane_s32(vgetq_lane_s32(input.val[1], 2), input_transposed.val[2], 1); - input_transposed.val[2] = vsetq_lane_s32(vgetq_lane_s32(input.val[2], 2), input_transposed.val[2], 2); - input_transposed.val[2] = vsetq_lane_s32(vgetq_lane_s32(input.val[3], 2), input_transposed.val[2], 3); - - input_transposed.val[3] = vsetq_lane_s32(vgetq_lane_s32(input.val[0], 3), input_transposed.val[3], 0); - input_transposed.val[3] = vsetq_lane_s32(vgetq_lane_s32(input.val[1], 3), input_transposed.val[3], 1); - input_transposed.val[3] = vsetq_lane_s32(vgetq_lane_s32(input.val[2], 3), input_transposed.val[3], 2); - input_transposed.val[3] = vsetq_lane_s32(vgetq_lane_s32(input.val[3], 3), input_transposed.val[3], 3); - - int32x4_t result_add_s01_after = vaddq_s32(input_transposed.val[0], input_transposed.val[1]); - int32x4_t result_add_s23_after = vaddq_s32(input_transposed.val[2], input_transposed.val[3]); - int32x4_t result_sub_d01_after = vsubq_s32(input_transposed.val[0], input_transposed.val[1]); - int32x4_t result_sub_d23_after = vsubq_s32(input_transposed.val[2], input_transposed.val[3]); - - int32x4_t result_add_all_tmp0_after = vaddq_s32(result_add_s01_after, result_add_s23_after); - int32x4_t result_sub_all_tmp1_after = vsubq_s32(result_add_s01_after, result_add_s23_after); - int32x4_t result_sub_all_tmp2_after = vsubq_s32(result_sub_d01_after, result_sub_d23_after); - int32x4_t result_add_all_tmp3_after = vaddq_s32(result_sub_d01_after, result_sub_d23_after); + + //BEFORE GOING TO PHASE 2, I NEED TO TRANPOSE + int32x4x2_t temp_transposed1 = vtrnq_s32(input.val[0], input.val[1]); + int32x4x2_t temp_transposed2 = vtrnq_s32(input.val[2], input.val[3]); + + input.val[0] = vcombine_s32(vget_low_s32(temp_transposed1.val[0]), vget_low_s32(temp_transposed2.val[0])); + input.val[1] = vcombine_s32(vget_low_s32(temp_transposed1.val[1]), vget_low_s32(temp_transposed2.val[1])); + input.val[2] = vcombine_s32(vget_high_s32(temp_transposed1.val[0]), vget_high_s32(temp_transposed2.val[0])); + input.val[3] = vcombine_s32(vget_high_s32(temp_transposed1.val[1]), vget_high_s32(temp_transposed2.val[1])); + + + //PHASE 2--------------------------------------------------------------------------- + //all s01 after + result_add_s01 = vaddq_s32(input.val[0], input.val[1]); + //all s23 after + result_add_s23 = vaddq_s32(input.val[2], input.val[3]); + //all d01 after + result_sub_d01 = vsubq_s32(input.val[0], input.val[1]); + //all d23 after + result_sub_d23 = vsubq_s32(input.val[2], input.val[3]); + + //s01+s23 all after + input.val[0] = vaddq_s32(result_add_s01, result_add_s23); + //s01-s23 all after + input.val[1] = vsubq_s32(result_add_s01, result_add_s23); + //d01-d23 all after + input.val[2] = vsubq_s32(result_sub_d01, result_sub_d23); + //d01+d23 all after + input.val[3] = vaddq_s32(result_sub_d01, result_sub_d23); + //+1 and shift int32x4_t one_vector = vdupq_n_s32(1); - input.val[0] = vshrq_n_s32(vaddq_s32(result_add_all_tmp0_after, one_vector), 1); - input.val[1] = vshrq_n_s32(vaddq_s32(result_sub_all_tmp1_after, one_vector), 1); - input.val[2] = vshrq_n_s32(vaddq_s32(result_sub_all_tmp2_after, one_vector), 1); - input.val[3] = vshrq_n_s32(vaddq_s32(result_add_all_tmp3_after, one_vector), 1); + + input.val[0] = vshrq_n_s32(vaddq_s32(input.val[0], one_vector), 1); + input.val[1] = vshrq_n_s32(vaddq_s32(input.val[1], one_vector), 1); + input.val[2] = vshrq_n_s32(vaddq_s32(input.val[2], one_vector), 1); + input.val[3] = vshrq_n_s32(vaddq_s32(input.val[3], one_vector), 1); + //store back vst4q_s32(d, input); } @@ -215,7 +154,7 @@ int main(int argc, char **argv) { // initialize original matrix d for (int i = 0; i < 16; i++) { - random_value[i] = rand() % 256; // integers between 0-255 + random_value[i] = rand() & 0xFF; } // call SCALAR function From 83bcec98d36eb02515b833092932879580f9aa5c Mon Sep 17 00:00:00 2001 From: Giorgos Mermigkis Date: Thu, 28 Mar 2024 20:57:45 +0200 Subject: [PATCH 7/9] changed the way I load and store the input/results --- dct_neon.c | 72 +++++++++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/dct_neon.c b/dct_neon.c index 79a1086..96bb991 100644 --- a/dct_neon.c +++ b/dct_neon.c @@ -21,7 +21,7 @@ static void dct4x4dc_c(int d[16]) { int d01 = d[i * 4 + 0] - d[i * 4 + 1]; // diff between the 1st and the 2nd elements in the row int s23 = d[i * 4 + 2] + d[i * 4 + 3]; // sum of the 3d and 4th elements in the row int d23 = d[i * 4 + 2] - d[i * 4 + 3]; // diff between the 3d and the 4th elements in the row - //printf("s01: %d, s23: %d, d01: %d, d23: %d\n", s01, s23, d01, d23); + //printf("s01: %d, s23: %d, d01: %d, d23: %d\n", d[i * 4 + 0], d[i * 4 + 1], d[i * 4 + 2], d[i * 4 + 3]); tmp[0 * 4 + i] = s01 + s23; // 1st element of the row tmp[1 * 4 + i] = s01 - s23; // 2nd element of the row tmp[2 * 4 + i] = d01 - d23; // 3d element of the row @@ -57,69 +57,75 @@ void print_int32x4(const char* label, int32x4_t vector) { static void dct4x4dc_neon(int *d) { - int32x4x4_t input; - - //PHASE 1--------------------------------------------------------------------------- - input = vld4q_s32(d); + + int32x4_t input0 = vld1q_s32(d); + int32x4_t input1 = vld1q_s32(d + 4); + int32x4_t input2 = vld1q_s32(d + 8); + int32x4_t input3 = vld1q_s32(d + 12); + //all s01 - int32x4_t result_add_s01 = vaddq_s32(input.val[0], input.val[1]); + int32x4_t result_add_s01 = vaddq_s32(input0, input1); //all s23 - int32x4_t result_add_s23 = vaddq_s32(input.val[2], input.val[3]); + int32x4_t result_add_s23 = vaddq_s32(input2, input3); //all d01 - int32x4_t result_sub_d01 = vsubq_s32(input.val[0], input.val[1]); + int32x4_t result_sub_d01 = vsubq_s32(input0, input1); //all d23 - int32x4_t result_sub_d23 = vsubq_s32(input.val[2], input.val[3]); + int32x4_t result_sub_d23 = vsubq_s32(input2, input3); //s01+s23 all - input.val[0] = vaddq_s32(result_add_s01, result_add_s23); + input0 = vaddq_s32(result_add_s01, result_add_s23); //s01-s23 all - input.val[1] = vsubq_s32(result_add_s01, result_add_s23); + input1 = vsubq_s32(result_add_s01, result_add_s23); //d01-d23 all - input.val[2] = vsubq_s32(result_sub_d01, result_sub_d23); + input2 = vsubq_s32(result_sub_d01, result_sub_d23); //d01+d23 all - input.val[3] = vaddq_s32(result_sub_d01, result_sub_d23); + input3 = vaddq_s32(result_sub_d01, result_sub_d23); + - //BEFORE GOING TO PHASE 2, I NEED TO TRANPOSE - int32x4x2_t temp_transposed1 = vtrnq_s32(input.val[0], input.val[1]); - int32x4x2_t temp_transposed2 = vtrnq_s32(input.val[2], input.val[3]); + int32x4x2_t temp_transposed1 = vtrnq_s32(input0, input1); + int32x4x2_t temp_transposed2 = vtrnq_s32(input2, input3); - input.val[0] = vcombine_s32(vget_low_s32(temp_transposed1.val[0]), vget_low_s32(temp_transposed2.val[0])); - input.val[1] = vcombine_s32(vget_low_s32(temp_transposed1.val[1]), vget_low_s32(temp_transposed2.val[1])); - input.val[2] = vcombine_s32(vget_high_s32(temp_transposed1.val[0]), vget_high_s32(temp_transposed2.val[0])); - input.val[3] = vcombine_s32(vget_high_s32(temp_transposed1.val[1]), vget_high_s32(temp_transposed2.val[1])); + input0 = vcombine_s32(vget_low_s32(temp_transposed1.val[0]), vget_low_s32(temp_transposed2.val[0])); + input1 = vcombine_s32(vget_low_s32(temp_transposed1.val[1]), vget_low_s32(temp_transposed2.val[1])); + input2 = vcombine_s32(vget_high_s32(temp_transposed1.val[0]), vget_high_s32(temp_transposed2.val[0])); + input3 = vcombine_s32(vget_high_s32(temp_transposed1.val[1]), vget_high_s32(temp_transposed2.val[1])); //PHASE 2--------------------------------------------------------------------------- //all s01 after - result_add_s01 = vaddq_s32(input.val[0], input.val[1]); + result_add_s01 = vaddq_s32(input0, input1); //all s23 after - result_add_s23 = vaddq_s32(input.val[2], input.val[3]); + result_add_s23 = vaddq_s32(input2, input3); //all d01 after - result_sub_d01 = vsubq_s32(input.val[0], input.val[1]); + result_sub_d01 = vsubq_s32(input0, input1); //all d23 after - result_sub_d23 = vsubq_s32(input.val[2], input.val[3]); + result_sub_d23 = vsubq_s32(input2, input3); //s01+s23 all after - input.val[0] = vaddq_s32(result_add_s01, result_add_s23); + input0 = vaddq_s32(result_add_s01, result_add_s23); //s01-s23 all after - input.val[1] = vsubq_s32(result_add_s01, result_add_s23); + input1 = vsubq_s32(result_add_s01, result_add_s23); //d01-d23 all after - input.val[2] = vsubq_s32(result_sub_d01, result_sub_d23); + input2 = vsubq_s32(result_sub_d01, result_sub_d23); //d01+d23 all after - input.val[3] = vaddq_s32(result_sub_d01, result_sub_d23); + input3 = vaddq_s32(result_sub_d01, result_sub_d23); //+1 and shift int32x4_t one_vector = vdupq_n_s32(1); - input.val[0] = vshrq_n_s32(vaddq_s32(input.val[0], one_vector), 1); - input.val[1] = vshrq_n_s32(vaddq_s32(input.val[1], one_vector), 1); - input.val[2] = vshrq_n_s32(vaddq_s32(input.val[2], one_vector), 1); - input.val[3] = vshrq_n_s32(vaddq_s32(input.val[3], one_vector), 1); + input0 = vshrq_n_s32(vaddq_s32(input0, one_vector), 1); + input1 = vshrq_n_s32(vaddq_s32(input1, one_vector), 1); + input2 = vshrq_n_s32(vaddq_s32(input2, one_vector), 1); + input3 = vshrq_n_s32(vaddq_s32(input3, one_vector), 1); //store back - vst4q_s32(d, input); + vst1q_s32(d, input0); + vst1q_s32(d + 4, input1); + vst1q_s32(d + 8, input2); + vst1q_s32(d + 12, input3); + } From 78a23771ca37cb8caccd55fb60fee93a53619707 Mon Sep 17 00:00:00 2001 From: Giorgos Mermigkis Date: Fri, 29 Mar 2024 12:24:59 +0200 Subject: [PATCH 8/9] changed the tranpose, did clang format --- dct_neon.c | 178 +++++++++++++++++++++++++++-------------------------- 1 file changed, 90 insertions(+), 88 deletions(-) diff --git a/dct_neon.c b/dct_neon.c index 96bb991..d8a5135 100644 --- a/dct_neon.c +++ b/dct_neon.c @@ -17,119 +17,121 @@ static void dct4x4dc_c(int d[16]) { // iterate over each row of the 4x4 block (phase 1) for (int i = 0; i < 4; i++) { - int s01 = d[i * 4 + 0] + d[i * 4 + 1]; // sum of the 1st and 2nd elements in the row - int d01 = d[i * 4 + 0] - d[i * 4 + 1]; // diff between the 1st and the 2nd elements in the row - int s23 = d[i * 4 + 2] + d[i * 4 + 3]; // sum of the 3d and 4th elements in the row - int d23 = d[i * 4 + 2] - d[i * 4 + 3]; // diff between the 3d and the 4th elements in the row - //printf("s01: %d, s23: %d, d01: %d, d23: %d\n", d[i * 4 + 0], d[i * 4 + 1], d[i * 4 + 2], d[i * 4 + 3]); + int s01 = d[i * 4 + 0] + + d[i * 4 + 1]; // sum of the 1st and 2nd elements in the row + int d01 = + d[i * 4 + 0] - + d[i * 4 + 1]; // diff between the 1st and the 2nd elements in the row + int s23 = d[i * 4 + 2] + + d[i * 4 + 3]; // sum of the 3d and 4th elements in the row + int d23 = + d[i * 4 + 2] - + d[i * 4 + 3]; // diff between the 3d and the 4th elements in the row + // printf("s01: %d, s23: %d, d01: %d, d23: %d\n", d[i * 4 + 0], d[i * 4 + + // 1], d[i * 4 + 2], d[i * 4 + 3]); tmp[0 * 4 + i] = s01 + s23; // 1st element of the row tmp[1 * 4 + i] = s01 - s23; // 2nd element of the row tmp[2 * 4 + i] = d01 - d23; // 3d element of the row tmp[3 * 4 + i] = d01 + d23; // 4th element of the row - //printf("tmp[0]: %d, tmp[1]: %d, tmp[2]: %d, tmp[3]: %d\n", tmp[0 * 4 + i], tmp[1 * 4 + i], tmp[2 * 4 + i], tmp[3 * 4 + i]); - + // printf("tmp[0]: %d, tmp[1]: %d, tmp[2]: %d, tmp[3]: %d\n", tmp[0 * 4 + + // i], tmp[1 * 4 + i], tmp[2 * 4 + i], tmp[3 * 4 + i]); } // iterates over each row of the 4x4 block (phase 2) for (int i = 0; i < 4; i++) { - + int s01 = tmp[i * 4 + 0] + tmp[i * 4 + 1]; int d01 = tmp[i * 4 + 0] - tmp[i * 4 + 1]; int s23 = tmp[i * 4 + 2] + tmp[i * 4 + 3]; int d23 = tmp[i * 4 + 2] - tmp[i * 4 + 3]; - //printf("AFTER: s01: %d, s23: %d, d01: %d, d23: %d\n", s01, s23, d01, d23); - // The DCT coefficients are scaled by adding 1 and then right-shifting - // by 1 (equivalent to integer division by 2) for rounding. + // printf("AFTER: s01: %d, s23: %d, d01: %d, d23: %d\n", s01, s23, d01, + // d23); + // The DCT coefficients are scaled by adding 1 and then right-shifting + // by 1 (equivalent to integer division by 2) for rounding. d[i * 4 + 0] = (s01 + s23 + 1) >> 1; d[i * 4 + 1] = (s01 - s23 + 1) >> 1; d[i * 4 + 2] = (d01 - d23 + 1) >> 1; d[i * 4 + 3] = (d01 + d23 + 1) >> 1; - } } -void print_int32x4(const char* label, int32x4_t vector) { - int32_t data[4]; - vst1q_s32(data, vector); - printf("%s: [%d %d %d %d]\n", label, data[0], data[1], data[2], data[3]); +void print_int32x4(const char *label, int32x4_t vector) { + int32_t data[4]; + vst1q_s32(data, vector); + printf("%s: [%d %d %d %d]\n", label, data[0], data[1], data[2], data[3]); } - - static void dct4x4dc_neon(int *d) { - - int32x4_t input0 = vld1q_s32(d); - int32x4_t input1 = vld1q_s32(d + 4); - int32x4_t input2 = vld1q_s32(d + 8); - int32x4_t input3 = vld1q_s32(d + 12); - - - //all s01 - int32x4_t result_add_s01 = vaddq_s32(input0, input1); - //all s23 - int32x4_t result_add_s23 = vaddq_s32(input2, input3); - //all d01 - int32x4_t result_sub_d01 = vsubq_s32(input0, input1); - //all d23 - int32x4_t result_sub_d23 = vsubq_s32(input2, input3); - - //s01+s23 all - input0 = vaddq_s32(result_add_s01, result_add_s23); - //s01-s23 all - input1 = vsubq_s32(result_add_s01, result_add_s23); - //d01-d23 all - input2 = vsubq_s32(result_sub_d01, result_sub_d23); - //d01+d23 all - input3 = vaddq_s32(result_sub_d01, result_sub_d23); - - - //BEFORE GOING TO PHASE 2, I NEED TO TRANPOSE - int32x4x2_t temp_transposed1 = vtrnq_s32(input0, input1); - int32x4x2_t temp_transposed2 = vtrnq_s32(input2, input3); - - input0 = vcombine_s32(vget_low_s32(temp_transposed1.val[0]), vget_low_s32(temp_transposed2.val[0])); - input1 = vcombine_s32(vget_low_s32(temp_transposed1.val[1]), vget_low_s32(temp_transposed2.val[1])); - input2 = vcombine_s32(vget_high_s32(temp_transposed1.val[0]), vget_high_s32(temp_transposed2.val[0])); - input3 = vcombine_s32(vget_high_s32(temp_transposed1.val[1]), vget_high_s32(temp_transposed2.val[1])); - - - //PHASE 2--------------------------------------------------------------------------- - //all s01 after - result_add_s01 = vaddq_s32(input0, input1); - //all s23 after - result_add_s23 = vaddq_s32(input2, input3); - //all d01 after - result_sub_d01 = vsubq_s32(input0, input1); - //all d23 after - result_sub_d23 = vsubq_s32(input2, input3); - - //s01+s23 all after - input0 = vaddq_s32(result_add_s01, result_add_s23); - //s01-s23 all after - input1 = vsubq_s32(result_add_s01, result_add_s23); - //d01-d23 all after - input2 = vsubq_s32(result_sub_d01, result_sub_d23); - //d01+d23 all after - input3 = vaddq_s32(result_sub_d01, result_sub_d23); - - //+1 and shift - int32x4_t one_vector = vdupq_n_s32(1); - - input0 = vshrq_n_s32(vaddq_s32(input0, one_vector), 1); - input1 = vshrq_n_s32(vaddq_s32(input1, one_vector), 1); - input2 = vshrq_n_s32(vaddq_s32(input2, one_vector), 1); - input3 = vshrq_n_s32(vaddq_s32(input3, one_vector), 1); - - //store back - vst1q_s32(d, input0); - vst1q_s32(d + 4, input1); - vst1q_s32(d + 8, input2); - vst1q_s32(d + 12, input3); + int32x4_t input0 = vld1q_s32(d); + int32x4_t input1 = vld1q_s32(d + 4); + int32x4_t input2 = vld1q_s32(d + 8); + int32x4_t input3 = vld1q_s32(d + 12); + + // all s01 + int32x4_t result_add_s01 = vaddq_s32(input0, input1); + // all s23 + int32x4_t result_add_s23 = vaddq_s32(input2, input3); + // all d01 + int32x4_t result_sub_d01 = vsubq_s32(input0, input1); + // all d23 + int32x4_t result_sub_d23 = vsubq_s32(input2, input3); + + // s01+s23 all + input0 = vaddq_s32(result_add_s01, result_add_s23); + // s01-s23 all + input1 = vsubq_s32(result_add_s01, result_add_s23); + // d01-d23 all + input2 = vsubq_s32(result_sub_d01, result_sub_d23); + // d01+d23 all + input3 = vaddq_s32(result_sub_d01, result_sub_d23); + + // BEFORE GOING TO PHASE 2, I NEED TO TRANPOSE + int32x4_t temp_trans0 = vtrn1q_s32(input0, input1); + int32x4_t temp_trans1 = vtrn2q_s32(input0, input1); + int32x4_t temp_trans2 = vtrn1q_s32(input2, input3); + int32x4_t temp_trans3 = vtrn2q_s32(input2, input3); + + input0 = vcombine_s32(vget_low_s32(temp_trans0), vget_low_s32(temp_trans2)); + input1 = vcombine_s32(vget_low_s32(temp_trans1), vget_low_s32(temp_trans3)); + input2 = vcombine_s32(vget_high_s32(temp_trans0), vget_high_s32(temp_trans2)); + input3 = vcombine_s32(vget_high_s32(temp_trans1), vget_high_s32(temp_trans3)); + + // PHASE + // 2--------------------------------------------------------------------------- + // all s01 after + result_add_s01 = vaddq_s32(input0, input1); + // all s23 after + result_add_s23 = vaddq_s32(input2, input3); + // all d01 after + result_sub_d01 = vsubq_s32(input0, input1); + // all d23 after + result_sub_d23 = vsubq_s32(input2, input3); + + // s01+s23 all after + input0 = vaddq_s32(result_add_s01, result_add_s23); + // s01-s23 all after + input1 = vsubq_s32(result_add_s01, result_add_s23); + // d01-d23 all after + input2 = vsubq_s32(result_sub_d01, result_sub_d23); + // d01+d23 all after + input3 = vaddq_s32(result_sub_d01, result_sub_d23); + + //+1 and shift + int32x4_t one_vector = vdupq_n_s32(1); + + input0 = vshrq_n_s32(vaddq_s32(input0, one_vector), 1); + input1 = vshrq_n_s32(vaddq_s32(input1, one_vector), 1); + input2 = vshrq_n_s32(vaddq_s32(input2, one_vector), 1); + input3 = vshrq_n_s32(vaddq_s32(input3, one_vector), 1); + + // store back + vst1q_s32(d, input0); + vst1q_s32(d + 4, input1); + vst1q_s32(d + 8, input2); + vst1q_s32(d + 12, input3); } - - int main(int argc, char **argv) { // handle user's arguement From 56e7fd3acd6e03497d16a5bc33843c58582b1915 Mon Sep 17 00:00:00 2001 From: Giorgos Mermigkis Date: Fri, 29 Mar 2024 15:50:36 +0200 Subject: [PATCH 9/9] implemented the same function for uint16 --- dct_neon.c | 91 ++++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 47 deletions(-) diff --git a/dct_neon.c b/dct_neon.c index d8a5135..a0d6c2d 100644 --- a/dct_neon.c +++ b/dct_neon.c @@ -11,43 +11,30 @@ // into d // REF: // https://code.videolan.org/videolan/x264/-/blob/master/common/dct.c?ref_type=heads -static void dct4x4dc_c(int d[16]) { +static void dct4x4dc_c(uint16_t d[16]) { // hold the intermediate results int tmp[16]; // iterate over each row of the 4x4 block (phase 1) for (int i = 0; i < 4; i++) { - int s01 = d[i * 4 + 0] + - d[i * 4 + 1]; // sum of the 1st and 2nd elements in the row - int d01 = - d[i * 4 + 0] - - d[i * 4 + 1]; // diff between the 1st and the 2nd elements in the row - int s23 = d[i * 4 + 2] + - d[i * 4 + 3]; // sum of the 3d and 4th elements in the row - int d23 = - d[i * 4 + 2] - - d[i * 4 + 3]; // diff between the 3d and the 4th elements in the row - // printf("s01: %d, s23: %d, d01: %d, d23: %d\n", d[i * 4 + 0], d[i * 4 + - // 1], d[i * 4 + 2], d[i * 4 + 3]); - tmp[0 * 4 + i] = s01 + s23; // 1st element of the row - tmp[1 * 4 + i] = s01 - s23; // 2nd element of the row - tmp[2 * 4 + i] = d01 - d23; // 3d element of the row - tmp[3 * 4 + i] = d01 + d23; // 4th element of the row - // printf("tmp[0]: %d, tmp[1]: %d, tmp[2]: %d, tmp[3]: %d\n", tmp[0 * 4 + - // i], tmp[1 * 4 + i], tmp[2 * 4 + i], tmp[3 * 4 + i]); + + int s01 = d[i * 4 + 0] + d[i * 4 + 1]; + int d01 = d[i * 4 + 0] - d[i * 4 + 1]; + int s23 = d[i * 4 + 2] + d[i * 4 + 3]; + int d23 = d[i * 4 + 2] - d[i * 4 + 3]; + tmp[0 * 4 + i] = s01 + s23; + tmp[1 * 4 + i] = s01 - s23; + tmp[2 * 4 + i] = d01 - d23; + tmp[3 * 4 + i] = d01 + d23; } // iterates over each row of the 4x4 block (phase 2) for (int i = 0; i < 4; i++) { - int s01 = tmp[i * 4 + 0] + tmp[i * 4 + 1]; int d01 = tmp[i * 4 + 0] - tmp[i * 4 + 1]; int s23 = tmp[i * 4 + 2] + tmp[i * 4 + 3]; int d23 = tmp[i * 4 + 2] - tmp[i * 4 + 3]; - // printf("AFTER: s01: %d, s23: %d, d01: %d, d23: %d\n", s01, s23, d01, - // d23); - // The DCT coefficients are scaled by adding 1 and then right-shifting - // by 1 (equivalent to integer division by 2) for rounding. + d[i * 4 + 0] = (s01 + s23 + 1) >> 1; d[i * 4 + 1] = (s01 - s23 + 1) >> 1; d[i * 4 + 2] = (d01 - d23 + 1) >> 1; @@ -55,20 +42,32 @@ static void dct4x4dc_c(int d[16]) { } } +void print_uint16x4(const char *label, uint16x4_t vector) { + uint16_t data[4]; + vst1_u16(data, vector); + printf("%s: [%hu %hu %hu %hu]\n", label, data[0], data[1], data[2], data[3]); +} + void print_int32x4(const char *label, int32x4_t vector) { int32_t data[4]; vst1q_s32(data, vector); printf("%s: [%d %d %d %d]\n", label, data[0], data[1], data[2], data[3]); } -static void dct4x4dc_neon(int *d) { +static void dct4x4dc_neon(uint16_t *d) { + + uint16x4_t input0_low = vld1_u16(d); + uint16x4_t input1_low = vld1_u16(d + 4); + uint16x4_t input2_low = vld1_u16(d + 8); + uint16x4_t input3_low = vld1_u16(d + 12); - int32x4_t input0 = vld1q_s32(d); - int32x4_t input1 = vld1q_s32(d + 4); - int32x4_t input2 = vld1q_s32(d + 8); - int32x4_t input3 = vld1q_s32(d + 12); + int32x4_t input0 = vreinterpretq_s32_u32(vmovl_u16(input0_low)); + int32x4_t input1 = vreinterpretq_s32_u32(vmovl_u16(input1_low)); + int32x4_t input2 = vreinterpretq_s32_u32(vmovl_u16(input2_low)); + int32x4_t input3 = vreinterpretq_s32_u32(vmovl_u16(input3_low)); - // all s01 + // PHASE 1 + // all s01 int32x4_t result_add_s01 = vaddq_s32(input0, input1); // all s23 int32x4_t result_add_s23 = vaddq_s32(input2, input3); @@ -97,9 +96,8 @@ static void dct4x4dc_neon(int *d) { input2 = vcombine_s32(vget_high_s32(temp_trans0), vget_high_s32(temp_trans2)); input3 = vcombine_s32(vget_high_s32(temp_trans1), vget_high_s32(temp_trans3)); - // PHASE - // 2--------------------------------------------------------------------------- - // all s01 after + // PHASE 2 + // all s01 after result_add_s01 = vaddq_s32(input0, input1); // all s23 after result_add_s23 = vaddq_s32(input2, input3); @@ -117,7 +115,6 @@ static void dct4x4dc_neon(int *d) { // d01+d23 all after input3 = vaddq_s32(result_sub_d01, result_sub_d23); - //+1 and shift int32x4_t one_vector = vdupq_n_s32(1); input0 = vshrq_n_s32(vaddq_s32(input0, one_vector), 1); @@ -125,16 +122,16 @@ static void dct4x4dc_neon(int *d) { input2 = vshrq_n_s32(vaddq_s32(input2, one_vector), 1); input3 = vshrq_n_s32(vaddq_s32(input3, one_vector), 1); - // store back - vst1q_s32(d, input0); - vst1q_s32(d + 4, input1); - vst1q_s32(d + 8, input2); - vst1q_s32(d + 12, input3); + // Store the results back to the memory + vst1_u16(d, vmovn_u32(vreinterpretq_u32_s32(input0))); + vst1_u16(d + 4, vmovn_u32(vreinterpretq_u32_s32(input1))); + vst1_u16(d + 8, vmovn_u32(vreinterpretq_u32_s32(input2))); + vst1_u16(d + 12, vmovn_u32(vreinterpretq_u32_s32(input3))); } int main(int argc, char **argv) { - // handle user's arguement + // handle user's argument long int LOOPS = 10000000000; if (argc == 2) { @@ -152,13 +149,13 @@ int main(int argc, char **argv) { srand(time(NULL)); struct timeval tv1, tv2, tv3, tv4, diff1, diff2; - int d[16]; - int *dd = NULL; - if (posix_memalign((void **)&dd, 16, 16 * sizeof(int)) != 0) { + uint16_t d[16]; + uint16_t *dd = NULL; + if (posix_memalign((void **)&dd, 16, 16 * sizeof(uint16_t)) != 0) { perror("posix_memalign failed"); exit(EXIT_FAILURE); } - int random_value[16]; + uint16_t random_value[16]; // initialize original matrix d for (int i = 0; i < 16; i++) { @@ -178,7 +175,7 @@ int main(int argc, char **argv) { // print the transformed matrix printf("Transformed Matrix (dct) from Scalar function:\n"); for (int i = 0; i < 16; i++) { - printf("%3d ", d[i]); + printf("%5d ", d[i]); if ((i + 1) % 4 == 0) printf("\n"); } @@ -198,7 +195,7 @@ int main(int argc, char **argv) { // print the transformed matrix printf("Transformed Matrix (dct) from NEON function:\n"); for (int i = 0; i < 16; i++) { - printf("%3d ", dd[i]); + printf("%5d ", dd[i]); if ((i + 1) % 4 == 0) printf("\n"); } @@ -213,4 +210,4 @@ int main(int argc, char **argv) { printf("NEON DCT: %ld sec, usec: %d\n", diff2.tv_sec, diff2.tv_usec); return 0; -} \ No newline at end of file +}