diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 9320af9224..cfc8a893be 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -1475,7 +1475,7 @@ endif () "#define L2_ASSOCIATIVE 4\n") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 8) - set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_M 4) set(DGEMM_UNROLL_N 8) set(CGEMM_UNROLL_M 8) set(CGEMM_UNROLL_N 8) diff --git a/getarch_2nd.c b/getarch_2nd.c index 2085556bd6..744ad5ee7d 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -103,7 +103,55 @@ int main(int argc, char **argv) { printf("XGEMM3M_UNROLL_N=%d\n", QGEMM_DEFAULT_UNROLL_N); #endif +#ifdef ARCH_RISCV64 +#ifdef STRMM_DEFAULT_UNROLL_M + printf("STRMM_UNROLL_M=%d\n", STRMM_DEFAULT_UNROLL_M); +#else + printf("STRMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); +#endif + +#ifdef STRMM_DEFAULT_UNROLL_N + printf("STRMM_UNROLL_N=%d\n", STRMM_DEFAULT_UNROLL_N); +#else + printf("STRMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); +#endif + +#ifdef DTRMM_DEFAULT_UNROLL_M + printf("DTRMM_UNROLL_M=%d\n", DTRMM_DEFAULT_UNROLL_M); +#else + printf("DTRMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); +#endif + +#ifdef DTRMM_DEFAULT_UNROLL_N + printf("DTRMM_UNROLL_N=%d\n", DTRMM_DEFAULT_UNROLL_N); +#else + printf("DTRMM_UNROLL_N=%d\n", DGEMM_DEFAULT_UNROLL_N); +#endif +#ifdef CTRMM_DEFAULT_UNROLL_M + printf("CTRMM_UNROLL_M=%d\n", CTRMM_DEFAULT_UNROLL_M); +#else + printf("CTRMM_UNROLL_M=%d\n", CGEMM_DEFAULT_UNROLL_M); +#endif + +#ifdef CTRMM_DEFAULT_UNROLL_N + printf("CTRMM_UNROLL_N=%d\n", CTRMM_DEFAULT_UNROLL_N); +#else + printf("CTRMM_UNROLL_N=%d\n", CGEMM_DEFAULT_UNROLL_N); +#endif + +#ifdef ZTRMM_DEFAULT_UNROLL_M + printf("ZTRMM_UNROLL_M=%d\n", ZTRMM_DEFAULT_UNROLL_M); +#else + printf("ZTRMM_UNROLL_M=%d\n", ZGEMM_DEFAULT_UNROLL_M); +#endif + +#ifdef ZTRMM_DEFAULT_UNROLL_N + printf("ZTRMM_UNROLL_N=%d\n", ZTRMM_DEFAULT_UNROLL_N); +#else + printf("ZTRMM_UNROLL_N=%d\n", ZGEMM_DEFAULT_UNROLL_N); +#endif +#endif /* ARCH_RISCV64 */ } diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B index ad7db5622e..038d58e8f5 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL128B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B @@ -95,10 +95,10 @@ DGEMVTKERNEL = gemv_t_rvv.c CGEMVTKERNEL = zgemv_t_rvv.c ZGEMVTKERNEL = zgemv_t_rvv.c -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c +SGEMMKERNEL = sgemm_kernel_zvl128b.c ifneq ($(filter $(SGEMM_UNROLL_N),4 8 16),) SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c -SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv_max.c else SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c @@ -109,7 +109,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) ifneq ($(filter $(SGEMM_UNROLL_M),4 8 16),) SGEMMINCOPY = gemm_ncopy_$(SGEMM_UNROLL_M)_rvv.c -SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv.c +SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv_max.c else SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c @@ -118,10 +118,10 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c +DGEMMKERNEL = dgemm_kernel_zvl128b.c DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c ifneq ($(filter $(DGEMM_UNROLL_N),4 8 16),) -DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMOTCOPY = 
gemm_tcopy_$(DGEMM_UNROLL_N)_rvv_max.c else DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif @@ -131,7 +131,7 @@ DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c ifneq ($(filter $(DGEMM_UNROLL_M),4 8 16),) -DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv.c +DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv_max.c else DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif @@ -139,55 +139,55 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c +CGEMMKERNEL = cgemm_kernel_zvl128b.c CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = zgemm_tcopy_$(CGEMM_UNROLL_N)_rvv_max.c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = zgemm_tcopy_$(CGEMM_UNROLL_M)_rvv_max.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c +ZGEMMKERNEL = zgemm_kernel_zvl128b.c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = zgemm_tcopy_$(ZGEMM_UNROLL_N)_rvv_max.c ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = zgemm_tcopy_$(ZGEMM_UNROLL_M)_rvv_max.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c -STRMMUNCOPY_M = ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c -STRMMLNCOPY_M = ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c -STRMMUTCOPY_M = ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c -STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c - -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c -DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c -DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c -DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c -DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c - -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c -CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c -CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c -CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c -CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c - -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c -ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c -ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c -ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c -ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c +STRMMKERNEL = strmm_kernel_$(STRMM_UNROLL_M)x$(STRMM_UNROLL_N)_zvl128b.c +STRMMUNCOPY_M = ../generic/trmm_uncopy_$(STRMM_UNROLL_M).c +STRMMLNCOPY_M = ../generic/trmm_lncopy_$(STRMM_UNROLL_M).c +STRMMUTCOPY_M = ../generic/trmm_utcopy_$(STRMM_UNROLL_M).c 
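Note on the TRMM variables introduced above: $(STRMM_UNROLL_M), $(DTRMM_UNROLL_M), $(CTRMM_UNROLL_M) and $(ZTRMM_UNROLL_M) are emitted by the getarch_2nd.c hunk at the top of this patch, which falls back to the corresponding GEMM unroll factor whenever a target defines no TRMM-specific value. That fallback is what lets the GEMM unrolls change (e.g. DGEMM_UNROLL_M 8 -> 4 for ZVL128B in cmake/prebuild.cmake) while the fixed-shape TRMM kernels keep their old geometry. A minimal, self-contained C sketch of the fallback rule; the concrete values below are assumptions for illustration, not taken from the patch:

    #include <stdio.h>

    /* Assumed values, for illustration only. */
    #define DGEMM_DEFAULT_UNROLL_M 4   /* the new VLEN-agnostic GEMM shape */
    #define DTRMM_DEFAULT_UNROLL_M 8   /* TRMM keeps the old kernel shape */

    int main(void) {
    #ifdef DTRMM_DEFAULT_UNROLL_M
        printf("DTRMM_UNROLL_M=%d\n", DTRMM_DEFAULT_UNROLL_M); /* prints 8 */
    #else
        printf("DTRMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); /* GEMM fallback */
    #endif
        return 0;
    }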
+STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(STRMM_UNROLL_M).c + +DTRMMKERNEL = dtrmm_kernel_$(DTRMM_UNROLL_M)x$(DTRMM_UNROLL_N)_zvl128b.c +DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DTRMM_UNROLL_M).c +DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DTRMM_UNROLL_M).c +DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DTRMM_UNROLL_M).c +DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DTRMM_UNROLL_M).c + +CTRMMKERNEL = ctrmm_kernel_$(CTRMM_UNROLL_M)x$(CTRMM_UNROLL_N)_zvl128b.c +CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CTRMM_UNROLL_M).c +CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CTRMM_UNROLL_M).c +CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CTRMM_UNROLL_M).c +CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CTRMM_UNROLL_M).c + +ZTRMMKERNEL = ztrmm_kernel_$(ZTRMM_UNROLL_M)x$(ZTRMM_UNROLL_N)_zvl128b.c +ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZTRMM_UNROLL_M).c +ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZTRMM_UNROLL_M).c +ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZTRMM_UNROLL_M).c +ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZTRMM_UNROLL_M).c STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B index c48095bb21..b95f7ab3e3 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -90,15 +90,15 @@ DGEMVTKERNEL = gemv_t_vector.c CGEMVTKERNEL = zgemv_t_vector.c ZGEMVTKERNEL = zgemv_t_vector.c -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c +STRMMKERNEL = strmm_kernel_$(STRMM_UNROLL_M)x$(STRMM_UNROLL_N)_zvl256b.c +DTRMMKERNEL = dtrmm_kernel_$(DTRMM_UNROLL_M)x$(DTRMM_UNROLL_N)_zvl256b.c +CTRMMKERNEL = ctrmm_kernel_$(CTRMM_UNROLL_M)x$(CTRMM_UNROLL_N)_zvl256b.c +ZTRMMKERNEL = ztrmm_kernel_$(ZTRMM_UNROLL_M)x$(ZTRMM_UNROLL_N)_zvl256b.c -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c +SGEMMKERNEL = sgemm_kernel_zvl256b.c ifneq ($(filter $(SGEMM_UNROLL_N),4 8 16),) SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c -SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv_max.c else SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c @@ -108,7 +108,7 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) ifneq ($(filter $(SGEMM_UNROLL_M),4 8 16),) SGEMMINCOPY = gemm_ncopy_$(SGEMM_UNROLL_M)_rvv.c -SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv.c +SGEMMITCOPY = gemm_tcopy_$(SGEMM_UNROLL_M)_rvv_max.c else SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c @@ -117,19 +117,20 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c +DGEMMKERNEL = dgemm_kernel_zvl256b.c DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c ifneq ($(filter $(DGEMM_UNROLL_N),4 8 16),) -DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv_max.c else DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c endif + DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) DGEMMINCOPY = 
../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c ifneq ($(filter $(DGEMM_UNROLL_M),4 8 16),) -DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv.c +DGEMMITCOPY = gemm_tcopy_$(DGEMM_UNROLL_M)_rvv_max.c else DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c endif @@ -137,28 +138,28 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c +CGEMMKERNEL = cgemm_kernel_zvl256b.c CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = zgemm_tcopy_$(CGEMM_UNROLL_N)_rvv_max.c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = zgemm_tcopy_$(CGEMM_UNROLL_M)_rvv_max.c CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c +ZGEMMKERNEL = zgemm_kernel_zvl256b.c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = zgemm_tcopy_$(ZGEMM_UNROLL_N)_rvv_max.c ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = zgemm_tcopy_$(ZGEMM_UNROLL_M)_rvv_max.c ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif diff --git a/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c deleted file mode 100644 index bd615389c8..0000000000 --- a/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c +++ /dev/null @@ -1,996 +0,0 @@ -/* - -AUTOGENERATED KERNEL -Script: ./kernel/riscv64/generate_kernel.py -Settings: - LMUL=2 - M=8 - M_tail_scalar_from=2 - N=4 - __riscv_='__riscv_' - complex=True - conjugate=False - cpu='zvl128b' - force_acc_double=False - index_type='BLASLONG' - op='gemm' - param_precision='float' - reg_width_bits=128 - tail_policy='' - trace=False - -Derived: - ELEN_ACC=32 - ELEN_PARAM=32 - LMUL_ACC=2 - VFMACC='__riscv_vfmacc_vf_f32m2' - VFMUL='__riscv_vfmul_vf_f32m2' - VLEV='__riscv_vle32_v_f32m2' - VLSEV='__riscv_vlse32_v_f32m2' - VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' - VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' - VSETVL='__riscv_vsetvl_e32m2' - VSEV='__riscv_vse32_v_f32m2' - VSSEV='__riscv_vsse32_v_f32m2' - acc_vector_t='vfloat32m2_t' - output='cgemm_kernel_8x4_zvl128b.c' - param_scalar_t='float' - param_vector_t='vfloat32m2_t' - -*/ - -#include "common.h" - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define S0 1 -#define S1 -1 -#define S2 1 -#define S3 1 -#define VFMACC_RR __riscv_vfmsac -#define VFMACC_RI __riscv_vfmacc -#endif -#if defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define S0 1 -#define S1 1 -#define S2 1 -#define S3 -1 -#define VFMACC_RR __riscv_vfmacc -#define VFMACC_RI __riscv_vfmsac -#endif -#if defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define S0 1 -#define S1 1 -#define S2 -1 -#define S3 1 -#define VFMACC_RR __riscv_vfmacc -#define VFMACC_RI __riscv_vfnmsac -#endif -#if defined(RR) || defined(RC) || defined(CR) || defined(CC) -#define S0 1 -#define S1 
-1 -#define S2 -1 -#define S3 -1 -#define VFMACC_RR __riscv_vfmsac -#define VFMACC_RI __riscv_vfnmacc -#endif - -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) - -{ - BLASLONG gvl = 0; - BLASLONG m_top = 0; - BLASLONG n_top = 0; - - // -- MAIN PASS - - for (BLASLONG j = 0; j < N / 4; j += 1) { - m_top = 0; - BLASLONG gvl = __riscv_vsetvl_e32m2(8); - - for (BLASLONG i = 0; i < M / 8; i += 1) { - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - float B1r = B[bi + 1 * 2 + 0]; - float B1i = B[bi + 1 * 2 + 1]; - float B2r = B[bi + 2 * 2 + 0]; - float B2i = B[bi + 2 * 2 + 1]; - float B3r = B[bi + 3 * 2 + 0]; - float B3i = B[bi + 3 * 2 + 1]; - bi += 4 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k - // leaving 6 vector registers for temporaries - // performing 2 operations between reuses of temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - vfloat32m2_t ACC1r = tmp1r; - vfloat32m2_t ACC1i = tmp1i; - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); - tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); - vfloat32m2_t ACC2r = tmp0r; - vfloat32m2_t ACC2i = tmp0i; - vfloat32m2_t ACC3r = tmp1r; - vfloat32m2_t ACC3i = tmp1i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - B1r = B[bi + 1 * 2 + 0]; - B1i = B[bi + 1 * 2 + 1]; - B2r = B[bi + 2 * 2 + 0]; - B2i = B[bi + 2 * 2 + 1]; - B3r = B[bi + 3 * 2 + 0]; - B3i = B[bi + 3 * 2 + 1]; - bi += 4 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); - ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); - tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); - 
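A reading aid for the sign macros defined just above (S0..S3 and the VFMACC_RR/VFMACC_RI pair), which this deleted kernel and its shared replacement both rely on: together they select one of the four conjugation variants of the complex product. In the NN/NT/TN/TT case, S1 = -1 gives the real part ar*br - ai*bi; the vector code realizes this by computing the B-imaginary terms first and then folding in the B-real terms with vfmsac/vfmacc. A scalar sketch of that case, with made-up inputs:

    #include <stdio.h>

    /* Scalar model of one NN-case accumulation from the kernel above:
       acc += a*b with S0=1, S1=-1, S2=1, S3=1 (the standard complex
       product). Inputs are made up for illustration. */
    int main(void) {
        float ar = 1.0f, ai = 2.0f;  /* one element of the packed A panel */
        float br = 3.0f, bi = 4.0f;  /* one scalar broadcast from B */
        float acc_r = 0.0f, acc_i = 0.0f;

        acc_r += ar * br - ai * bi;  /* S0*ar*br + S1*ai*bi = -5 */
        acc_i += ai * br + ar * bi;  /* S2*ai*br + S3*ar*bi = 10 */

        printf("(%g, %g)\n", acc_r, acc_i);  /* (1+2i)*(3+4i) = -5+10i */
        return 0;
    }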
tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); - ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); - ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); - ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); - ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); - C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); - C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); - C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); - C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); - C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); - C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); - C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); - C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); - C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); - C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); - - m_top += 8; - } - - // -- tails for main pass - - if (M & 4) { - gvl = __riscv_vsetvl_e32m2(4); - - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - float B1r = B[bi + 1 * 2 + 0]; - float B1i = B[bi + 1 * 2 + 1]; - float B2r = B[bi + 2 * 2 + 0]; - float B2i = B[bi + 2 * 2 + 1]; - float B3r = B[bi + 3 * 2 + 0]; - float B3i = B[bi + 3 * 2 + 1]; - bi += 4 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k - // leaving 6 vector registers for temporaries - // performing 2 operations between reuses of temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, 
gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - vfloat32m2_t ACC1r = tmp1r; - vfloat32m2_t ACC1i = tmp1i; - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); - tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); - vfloat32m2_t ACC2r = tmp0r; - vfloat32m2_t ACC2i = tmp0i; - vfloat32m2_t ACC3r = tmp1r; - vfloat32m2_t ACC3i = tmp1i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - B1r = B[bi + 1 * 2 + 0]; - B1i = B[bi + 1 * 2 + 1]; - B2r = B[bi + 2 * 2 + 0]; - B2i = B[bi + 2 * 2 + 1]; - B3r = B[bi + 3 * 2 + 0]; - B3i = B[bi + 3 * 2 + 1]; - bi += 4 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); - ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); - tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); - ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); - ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); - ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); - ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); - C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); - C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); - C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); - C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); - C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, 
gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); - C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); - C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); - C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); - C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); - C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); - - m_top += 4; - } - - if (M & 2) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - float result4 = 0; - float result5 = 0; - float result6 = 0; - float result7 = 0; - float result8 = 0; - float result9 = 0; - float result10 = 0; - float result11 = 0; - float result12 = 0; - float result13 = 0; - float result14 = 0; - float result15 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; - result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; - result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; - result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; - result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; - result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; - result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; - result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; - result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; - result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; - result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; - result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; - result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; - result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; - ai += 2 * 2; - bi += 4 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 0) * 2 
+ 1]; - Cr += result4 * alphar; - Ci += result5 * alphar; - Cr -= result5 * alphai; - Ci += result4 * alphai; - C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; - Cr += result6 * alphar; - Ci += result7 * alphar; - Cr -= result7 * alphai; - Ci += result6 * alphai; - C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; - Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; - Cr += result8 * alphar; - Ci += result9 * alphar; - Cr -= result9 * alphai; - Ci += result8 * alphai; - C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; - Cr += result10 * alphar; - Ci += result11 * alphar; - Cr -= result11 * alphai; - Ci += result10 * alphai; - C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; - Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; - Cr += result12 * alphar; - Ci += result13 * alphar; - Cr -= result13 * alphai; - Ci += result12 * alphai; - C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; - Cr += result14 * alphar; - Ci += result15 * alphar; - Cr -= result15 * alphai; - Ci += result14 * alphai; - C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; - m_top += 2; - } - - if (M & 1) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - float result4 = 0; - float result5 = 0; - float result6 = 0; - float result7 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; - result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; - result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; - result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; - result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; - result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; - ai += 1 * 2; - bi += 4 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; - Cr += result4 * alphar; - Ci += result5 * alphar; - Cr -= result5 * alphai; - Ci += result4 * alphai; - C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; - Cr += result6 * alphar; - Ci += result7 * alphar; - Cr -= result7 * alphai; - Ci += result6 * alphai; - C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 
3 * ldc + 0) * 2 + 1] = Ci; - m_top += 1; - } - - n_top += 4; - } - - // -- tails for N=2 - - if (N & 2) { - gvl = __riscv_vsetvl_e32m2(8); - m_top = 0; - - for (BLASLONG i = 0; i < M / 8; i += 1) { - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - float B1r = B[bi + 1 * 2 + 0]; - float B1i = B[bi + 1 * 2 + 1]; - bi += 2 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k - // leaving 10 vector registers for temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - vfloat32m2_t ACC1r = tmp1r; - vfloat32m2_t ACC1i = tmp1i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - B1r = B[bi + 1 * 2 + 0]; - B1i = B[bi + 1 * 2 + 1]; - bi += 2 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); - ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); - C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); - C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); - - m_top += 8; - } - - if (M & 4) { - gvl = __riscv_vsetvl_e32m2(4); - - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - float B1r = B[bi + 1 * 2 + 0]; - 
float B1i = B[bi + 1 * 2 + 1]; - bi += 2 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k - // leaving 10 vector registers for temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - vfloat32m2_t ACC1r = tmp1r; - vfloat32m2_t ACC1i = tmp1i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - B1r = B[bi + 1 * 2 + 0]; - B1i = B[bi + 1 * 2 + 1]; - bi += 2 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); - tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); - tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); - ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - ci += ldc - gvl * 0; - vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); - C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); - C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - ci += ldc - gvl * 0; - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); - - m_top += 4; - } - - if (M & 2) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - float result4 = 0; - float result5 = 0; - float result6 = 0; - float result7 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; - result3 += S2 * A[ai + 2 + 1] * 
B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; - result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; - result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; - result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; - result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; - ai += 2 * 2; - bi += 2 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; - Cr += result4 * alphar; - Ci += result5 * alphar; - Cr -= result5 * alphai; - Ci += result4 * alphai; - C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; - Cr += result6 * alphar; - Ci += result7 * alphar; - Cr -= result7 * alphai; - Ci += result6 * alphai; - C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; - m_top += 2; - } - - if (M & 1) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; - result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; - ai += 1 * 2; - bi += 2 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; - m_top += 1; - } - - n_top += 2; - } - - // -- tails for N=1 - - if (N & 1) { - gvl = __riscv_vsetvl_e32m2(8); - m_top = 0; - - for (BLASLONG i = 0; i < M / 8; i += 1) { - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - bi += 1 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k - // leaving 12 vector registers for temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - vfloat32m2_t 
ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - bi += 1 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 8 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - - m_top += 8; - } - - if (M & 4) { - gvl = __riscv_vsetvl_e32m2(4); - - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - float B0r = B[bi + 0 * 2 + 0]; - float B0i = B[bi + 0 * 2 + 1]; - bi += 1 * 2; - - vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k - // leaving 12 vector registers for temporaries - vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - vfloat32m2_t ACC0r = tmp0r; - vfloat32m2_t ACC0i = tmp0i; - - for (BLASLONG k = 1; k < K; k++) { - B0r = B[bi + 0 * 2 + 0]; - B0i = B[bi + 0 * 2 + 1]; - bi += 1 * 2; - - A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); - A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); - ai += 4 * 2; - - tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); - tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); - tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); - tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); - ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); - ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); - } - - BLASLONG ci = n_top * ldc + m_top; - - vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); - vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); - - C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); - C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); - C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); - C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); - - ci = n_top * ldc + m_top; - - __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); - __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); - - m_top += 4; - } - - if (M & 2) { - float result0 = 0; - float result1 = 0; - float result2 = 0; - float result3 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 
1]; - result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; - result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; - ai += 2 * 2; - bi += 1 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; - Cr += result2 * alphar; - Ci += result3 * alphar; - Cr -= result3 * alphai; - Ci += result2 * alphai; - C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; - m_top += 2; - } - - if (M & 1) { - float result0 = 0; - float result1 = 0; - BLASLONG ai = m_top * K * 2; - BLASLONG bi = n_top * K * 2; - - for (BLASLONG k = 0; k < K; k++) { - result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; - result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; - ai += 1 * 2; - bi += 1 * 2; - } - - BLASLONG ci = n_top * ldc + m_top; - float Cr, Ci; - Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; - Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; - Cr += result0 * alphar; - Ci += result1 * alphar; - Cr -= result1 * alphai; - Ci += result0 * alphai; - C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; - C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; - m_top += 1; - } - - n_top += 1; - } - - return 0; -} diff --git a/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c b/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c deleted file mode 100644 index 7980c029a4..0000000000 --- a/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c +++ /dev/null @@ -1,1931 +0,0 @@ -/* - -AUTOGENERATED KERNEL -Settings: - LMUL=1 - M=8 - M_tail_scalar_from=1 - N=8 - __riscv_='__riscv_' - complex=True - conjugate=False - cpu='zvl256b' - force_acc_double=False - index_type='BLASLONG' - op='gemm' - param_precision='float' - reg_width_bits=256 - tail_policy='' - trace=False - -Derived: - ELEN_ACC=32 - ELEN_PARAM=32 - LMUL_ACC=1 - VFMACC='__riscv_vfmacc_vf_f32m1' - VFMUL='__riscv_vfmul_vf_f32m1' - VLEV='__riscv_vle32_v_f32m1' - VLSEV='__riscv_vlse32_v_f32m1' - VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1' - VMUL_TO_ACC='__riscv_vfmul_vf_f32m1' - VSETVL='__riscv_vsetvl_e32m1' - VSEV='__riscv_vse32_v_f32m1' - VSSEV='__riscv_vsse32_v_f32m1' - acc_vector_t='vfloat32m1_t' - output='cgemm_kernel_8x8_zvl256b.c' - param_scalar_t='float' - param_vector_t='vfloat32m1_t' - -*/ - -#include "common.h" - - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define S0 1 - #define S1 -1 - #define S2 1 - #define S3 1 - #define VFMACC_RR __riscv_vfmsac - #define VFMACC_RI __riscv_vfmacc -#endif -#if defined(NR) || defined(NC) || defined(TR) || defined(TC) - #define S0 1 - #define S1 1 - #define S2 1 - #define S3 -1 - #define VFMACC_RR __riscv_vfmacc - #define VFMACC_RI __riscv_vfmsac -#endif -#if defined(RN) || defined(RT) || defined(CN) || defined(CT) - #define S0 1 - #define S1 1 - #define S2 -1 - #define S3 1 - #define VFMACC_RR __riscv_vfmacc - #define VFMACC_RI __riscv_vfnmsac -#endif -#if defined(RR) || defined(RC) || defined(CR) || defined(CC) - #define S0 1 - #define S1 -1 - #define S2 -1 - #define S3 -1 - #define VFMACC_RR __riscv_vfmsac - #define VFMACC_RI __riscv_vfnmacc -#endif - -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc) - -{ - BLASLONG gvl = 0; - BLASLONG m_top = 0; - 
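The four-statement update repeated throughout the scalar tails above (Cr += result0*alphar; Ci += result1*alphar; Cr -= result1*alphai; Ci += result0*alphai) is the complex scaling C += alpha*acc split into real and imaginary parts. A standalone check with made-up values:

    #include <stdio.h>

    /* C += alpha*acc for complex alpha and acc, in the same statement
       order as the kernels above. Values are made up for illustration. */
    int main(void) {
        float alphar = 2.0f, alphai = 1.0f;
        float acc_r = -5.0f, acc_i = 10.0f;   /* result0, result1 */
        float Cr = 0.0f, Ci = 0.0f;

        Cr += acc_r * alphar;
        Ci += acc_i * alphar;
        Cr -= acc_i * alphai;
        Ci += acc_r * alphai;

        printf("(%g, %g)\n", Cr, Ci);  /* (2+i)*(-5+10i) = -20+15i */
        return 0;
    }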
BLASLONG n_top = 0; - - - // -- MAIN PASS - - for (BLASLONG j=0; j 4 + if (N & 4) + n_top = kernel_column (M, N, K, alpha, A, B, C, ldc, n_top, 4); +#endif +#if N_BLOCKSIZE > 2 + if (N & 2) + n_top = kernel_column (M, N, K, alpha, A, B, C, ldc, n_top, 2); +#endif +#if N_BLOCKSIZE > 1 + if (N & 1) + kernel_column (M, N, K, alpha, A, B, C, ldc, n_top, 1); +#endif + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_16_rvv_max.c b/kernel/riscv64/gemm_tcopy_16_rvv_max.c new file mode 100644 index 0000000000..85975e28be --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_16_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#else +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#endif + +#define BLOCKSIZE 16 + +#include "gemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/gemm_tcopy_4_rvv_max.c b/kernel/riscv64/gemm_tcopy_4_rvv_max.c new file mode 100644 index 0000000000..15af02bd86 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_4_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#else +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#endif + +#define BLOCKSIZE 4 + +#include "gemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/gemm_tcopy_8_rvv_max.c b/kernel/riscv64/gemm_tcopy_8_rvv_max.c new file mode 100644 index 0000000000..0aa0358369 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_8_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#endif + +#define BLOCKSIZE 8 + +#include "gemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/gemm_tcopy_rvv_max_common.h b/kernel/riscv64/gemm_tcopy_rvv_max_common.h new file mode 100644 index 0000000000..425650cb35 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_rvv_max_common.h @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define xstr(s) str(s) +#define str(s) #s + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + IFLOAT *aoffset = a; + IFLOAT *boffset = b; + IFLOAT *boffset2 = b + m * (n & ~(BLOCKSIZE - 1)); + + //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", xstr(CNAME), m, n, lda); + + for(BLASLONG j = m; j > 0; j--) { + IFLOAT *aoffset1 = aoffset; + IFLOAT *boffset1 = boffset; + + aoffset += lda; + boffset += BLOCKSIZE; + + for(BLASLONG i = n / BLOCKSIZE; i > 0; i--) { + size_t vl = BLOCKSIZE; + + FLOAT_V_T v = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v, vl); + + aoffset1 += BLOCKSIZE; + boffset1 += BLOCKSIZE * m; + } + + if (n & (BLOCKSIZE - 1)) { + size_t vl = n & (BLOCKSIZE - 1); + + FLOAT_V_T v = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v, vl); + + boffset2 += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/sgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/sgemm_kernel_16x8_zvl256b.c deleted file mode 100644 index e22df34f99..0000000000 --- a/kernel/riscv64/sgemm_kernel_16x8_zvl256b.c +++ /dev/null @@ -1,1081 +0,0 @@ -/* - -AUTOGENERATED KERNEL -Settings: - LMUL=1 - M=16 - M_tail_scalar_from=2 - N=8 - __riscv_='__riscv_' - complex=False - conjugate=False - cpu='zvl256b' - force_acc_double=False - index_type='BLASLONG' - op='gemm' - param_precision='float' - reg_width_bits=256 - tail_policy='' - trace=False - -Derived: - ELEN_ACC=32 - ELEN_PARAM=32 - LMUL_ACC=1 - VFMACC='__riscv_vfmacc_vf_f32m1' - VFMUL='__riscv_vfmul_vf_f32m1' - VLEV='__riscv_vle32_v_f32m1' - VLSEV='__riscv_vlse32_v_f32m1' - VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1' - VMUL_TO_ACC='__riscv_vfmul_vf_f32m1' - VSETVL='__riscv_vsetvl_e32m1' - VSEV='__riscv_vse32_v_f32m1' - VSSEV='__riscv_vsse32_v_f32m1' - acc_vector_t='vfloat32m1_t' - output='sgemm_kernel_16x8_zvl256b.c' - param_scalar_t='float' - param_vector_t='vfloat32m1_t' - -*/ - -#include "common.h" - - -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc) - -{ - BLASLONG gvl = 0; - BLASLONG m_top = 0; - BLASLONG n_top = 0; - - - // -- MAIN PASS - - for (BLASLONG j=0; j<N/8; j+=1) { +#if N_BLOCKSIZE > 4 + if (N & 4) + n_top = kernel_column (M, N, K, alphar, alphai, A, B, C, ldc, n_top, 4); +#endif +#if N_BLOCKSIZE > 2 + if (N & 2) + n_top = kernel_column (M, N, K, alphar, alphai, A, B, C, ldc, n_top, 2); +#endif +#if N_BLOCKSIZE > 1 + if (N & 1) + kernel_column (M, N, K, alphar, alphai, A, B, C, ldc, n_top, 1); +#endif + return 0; +} diff --git a/kernel/riscv64/zgemm_kernel_zvl128b.c b/kernel/riscv64/zgemm_kernel_zvl128b.c new file mode 100644 index 0000000000..ad8773cc4b --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_zvl128b.c @@ -0,0 +1,52 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3.
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define M_BLOCKSIZE ZGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE ZGEMM_DEFAULT_UNROLL_N + +#if M_BLOCKSIZE == 2 +#define RVV_MUL __riscv_vfmul_vf_f64m1 +#define RVV_LOAD __riscv_vlse64_v_f64m1 +#define RVV_STORE __riscv_vsse64_v_f64m1 +#define VECTOR_T vfloat64m1_t +#elif M_BLOCKSIZE == 4 +#define RVV_MUL __riscv_vfmul_vf_f64m2 +#define RVV_LOAD __riscv_vlse64_v_f64m2 +#define RVV_STORE __riscv_vsse64_v_f64m2 +#define VECTOR_T vfloat64m2_t +#elif M_BLOCKSIZE == 8 +#define RVV_MUL __riscv_vfmul_vf_f64m4 +#define RVV_LOAD __riscv_vlse64_v_f64m4 +#define RVV_STORE __riscv_vsse64_v_f64m4 +#define VECTOR_T vfloat64m4_t +#else +#error "Unsupported M_BLOCKSIZE value" +#endif + +#include "zgemm_kernel_rvv_vlv_common.h" diff --git a/kernel/riscv64/zgemm_kernel_zvl256b.c b/kernel/riscv64/zgemm_kernel_zvl256b.c new file mode 100644 index 0000000000..084bedb080 --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_zvl256b.c @@ -0,0 +1,52 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define M_BLOCKSIZE ZGEMM_DEFAULT_UNROLL_M +#define N_BLOCKSIZE ZGEMM_DEFAULT_UNROLL_N + +#if M_BLOCKSIZE == 4 +#define RVV_MUL __riscv_vfmul_vf_f64m1 +#define RVV_LOAD __riscv_vlse64_v_f64m1 +#define RVV_STORE __riscv_vsse64_v_f64m1 +#define VECTOR_T vfloat64m1_t +#elif M_BLOCKSIZE == 8 +#define RVV_MUL __riscv_vfmul_vf_f64m2 +#define RVV_LOAD __riscv_vlse64_v_f64m2 +#define RVV_STORE __riscv_vsse64_v_f64m2 +#define VECTOR_T vfloat64m2_t +#elif M_BLOCKSIZE == 16 +#define RVV_MUL __riscv_vfmul_vf_f64m4 +#define RVV_LOAD __riscv_vlse64_v_f64m4 +#define RVV_STORE __riscv_vsse64_v_f64m4 +#define VECTOR_T vfloat64m4_t +#else +#error "Unsupported M_BLOCKSIZE value" +#endif + +#include "zgemm_kernel_rvv_vlv_common.h" diff --git a/kernel/riscv64/zgemm_tcopy_16_rvv_max.c b/kernel/riscv64/zgemm_tcopy_16_rvv_max.c new file mode 100644 index 0000000000..8545989257 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_16_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#else +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#endif + +#define BLOCKSIZE 16 + +#include "zgemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv_max.c b/kernel/riscv64/zgemm_tcopy_4_rvv_max.c new file mode 100644 index 0000000000..07b86a4862 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_4_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#endif + +#define BLOCKSIZE 4 + +#include "zgemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/zgemm_tcopy_8_rvv_max.c b/kernel/riscv64/zgemm_tcopy_8_rvv_max.c new file mode 100644 index 0000000000..53b28a5652 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_8_rvv_max.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#endif + +#define BLOCKSIZE 8 + +#include "zgemm_tcopy_rvv_max_common.h" diff --git a/kernel/riscv64/zgemm_tcopy_rvv_max_common.h b/kernel/riscv64/zgemm_tcopy_rvv_max_common.h new file mode 100644 index 0000000000..c0417a5e37 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_rvv_max_common.h @@ -0,0 +1,71 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define xstr(s) str(s) +#define str(s) #s + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + IFLOAT *aoffset = a; + IFLOAT *boffset = b; + IFLOAT *boffset2 = b + m * (n & ~(BLOCKSIZE - 1)) * 2; + + //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", xstr(CNAME), m, n, lda); + + for (BLASLONG j = m; j > 0; j--) { + IFLOAT *aoffset1 = aoffset; + IFLOAT *boffset1 = boffset; + + aoffset += lda * 2; + boffset += BLOCKSIZE * 2; + + for (BLASLONG i = n / BLOCKSIZE; i > 0; i--) { + size_t vl = BLOCKSIZE; + + FLOAT_V_T v = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v, vl); + v = VLEV_FLOAT(aoffset1 + vl, vl); + VSEV_FLOAT(boffset1 + vl, v, vl); + + aoffset1 += BLOCKSIZE * 2; + boffset1 += BLOCKSIZE * m * 2; + } + + if (n & (BLOCKSIZE - 1)) { + size_t vl = n & (BLOCKSIZE - 1); + + FLOAT_V_T v = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v, vl); + v = VLEV_FLOAT(aoffset1 + vl, vl); + VSEV_FLOAT(boffset2 + vl, v, vl); + + boffset2 += vl * 2; + } + } + + return 0; +} diff --git a/param.h b/param.h index 8e598d8a01..8caa997c14 100644 --- a/param.h +++ b/param.h @@ -3231,7 +3231,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 8 #define CGEMM_DEFAULT_UNROLL_M 8 @@ -3240,6 +3240,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 4 +#define DTRMM_DEFAULT_UNROLL_M 8 +#define DTRMM_DEFAULT_UNROLL_N 8 + #undef SHGEMM_DEFAULT_P #define SHGEMM_DEFAULT_P 128 #undef SBGEMM_DEFAULT_P
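The packing layout that gemm_tcopy_rvv_max_common.h produces is easiest to see in scalar form: each source row contributes BLOCKSIZE consecutive elements to every full column panel, consecutive panels sit BLOCKSIZE*m apart, and the n % BLOCKSIZE leftover columns are packed contiguously starting at b + m * (n & ~(BLOCKSIZE - 1)), which is what boffset2 tracks. Below is a minimal scalar sketch of that layout for the real-valued case; it is illustration only, not part of the patch, and tcopy_ref plus the 3x5 driver are hypothetical names.

/* Scalar reference for the packing done by gemm_tcopy_rvv_max_common.h. */
#include <stdio.h>

typedef long BLASLONG;

static void tcopy_ref(BLASLONG m, BLASLONG n, const double *a, BLASLONG lda,
                      double *b, BLASLONG bs)
{
    double *tail = b + m * (n & ~(bs - 1));      /* same tail base as boffset2 */

    for (BLASLONG j = 0; j < m; j++) {
        const double *row = a + j * lda;
        /* Full panels: row j contributes bs consecutive elements to each
           panel; panels are bs*m apart, matching boffset1 += BLOCKSIZE*m. */
        for (BLASLONG i = 0; i < n / bs; i++)
            for (BLASLONG k = 0; k < bs; k++)
                b[i * bs * m + j * bs + k] = row[i * bs + k];
        /* Remainder columns are packed contiguously after all full panels. */
        for (BLASLONG k = 0; k < (n & (bs - 1)); k++)
            *tail++ = row[(n & ~(bs - 1)) + k];
    }
}

int main(void)
{
    double a[3 * 5], b[3 * 5];
    for (int i = 0; i < 15; i++) a[i] = i;       /* 3x5 row-major source */
    tcopy_ref(3, 5, a, 5, b, 4);
    for (int i = 0; i < 15; i++) printf("%g ", b[i]);
    printf("\n");   /* prints the full 4-wide panel first, then the n%4 tail */
    return 0;
}

The complex variant in zgemm_tcopy_rvv_max_common.h is the same layout with every stride doubled, since each column holds a real/imaginary pair.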
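The kernel_column ladder added above (the #if N_BLOCKSIZE > 4 / (N & 4) chain) processes full panels of N_BLOCKSIZE columns first and then peels the remaining columns in decreasing power-of-two widths. A standalone sketch of that dispatch shape, with a hypothetical do_cols() standing in for kernel_column:

#include <stdio.h>

#define N_BLOCKSIZE 8   /* stands in for the unroll-N default */

/* hypothetical stand-in for kernel_column(): handle `width` columns
   starting at `start`, return the next start column */
static long do_cols(long start, long width)
{
    printf("columns [%ld, %ld)\n", start, start + width);
    return start + width;
}

static void dispatch(long N)
{
    long n_top = 0;
    for (long j = 0; j < N / N_BLOCKSIZE; j++)   /* full panels */
        n_top = do_cols(n_top, N_BLOCKSIZE);
    if (N & 4) n_top = do_cols(n_top, 4);        /* mirrors #if N_BLOCKSIZE > 4 */
    if (N & 2) n_top = do_cols(n_top, 2);
    if (N & 1) do_cols(n_top, 1);
}

int main(void)
{
    dispatch(15);   /* covers 15 = 8 + 4 + 2 + 1 columns */
    return 0;
}

The preprocessor guards compile the peeling steps out entirely when the configured N_BLOCKSIZE makes them unreachable.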
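The M_BLOCKSIZE ladders in zgemm_kernel_zvl128b.c and zgemm_kernel_zvl256b.c pair each unroll factor with the e64 LMUL whose VLMAX (VLEN/64 * LMUL doubles per register group) just covers it: 2/4/8 at VLEN 128, 4/8/16 at VLEN 256. A quick probe to confirm that on a given target, assuming an rv64gcv toolchain and vector-capable hardware or QEMU; illustration only, not part of the patch:

#include <stddef.h>
#include <stdio.h>
#include <riscv_vector.h>

int main(void)
{
    /* VLMAX for e64 elements is VLEN/64 * LMUL. */
    size_t m1 = __riscv_vsetvlmax_e64m1();
    size_t m2 = __riscv_vsetvlmax_e64m2();
    size_t m4 = __riscv_vsetvlmax_e64m4();

    printf("e64 VLMAX: m1=%zu m2=%zu m4=%zu\n", m1, m2, m4);
    /* zvl128b expects 2/4/8 here; zvl256b expects 4/8/16. */
    return 0;
}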