Skip to content

Conversation

@phorcys
Copy link
Contributor

@phorcys phorcys commented Aug 15, 2025

MASKMOVDQU,VMASKMOVDQU
VPMASKMOVD/Q
MASKMOVQ
VMOVMSK{PD,PS}
VPACKUSWB
VPACKUSDW
PSHUF{LW,HW}
VPSHUF{LW,HW}
VPSHUFD
SHUFPS
VSHUFPS

@rmjskhy
Copy link

rmjskhy commented Aug 21, 2025

Hello, thank you for your contribution to improving translation efficiency.
However, I found a few issues in your submission:

In the translation function of vmovmskps/d, the instruction vpickve2gr cannot move the data from the third 32-bit element into the destination register; the xvpickve2gr instruction should be used instead.

In the translation function of vpackusxx, when the destination operand is memory, mask of 0 does not clear the target location.

I think a more correct way of writing it would be as follows:

bool translate_vpmaskmovx(IR1_INST * pir1) {
    /* VPMASKMOVD / VPMASKMOVQ (AVX2 masked vector move).
     * opnd1 supplies the mask: an element participates iff its sign bit is
     * set.  For a register destination, non-selected elements are zeroed;
     * for a memory destination, non-selected elements are left unmodified
     * (implemented as load / merge via vbitsel / full-width store).
     * NOTE(review): the merge-store path does a full-width load and store,
     * so it does not reproduce hardware fault suppression on masked-out
     * elements -- confirm this is acceptable for the translator. */
    IR1_OPND * opnd0 = ir1_get_opnd(pir1, 0);
    IR1_OPND * opnd1 = ir1_get_opnd(pir1, 1);
    IR1_OPND * opnd2 = ir1_get_opnd(pir1, 2);
    IR2_INST * ( * tr_inst1)(IR2_OPND, IR2_OPND, int);
    tr_inst1 = NULL;
    IR1_OPCODE op = ir1_opcode(pir1);
    if (ir1_opnd_is_xmm(opnd1)) {
        switch (op) {
            case dt_X86_INS_VPMASKMOVD:
                tr_inst1 = la_vslti_w;   /* mask = (elem < 0), 32-bit lanes */
                break;
            case dt_X86_INS_VPMASKMOVQ:
                tr_inst1 = la_vslti_d;   /* mask = (elem < 0), 64-bit lanes */
                break;
            default:
                break;
        }
        /* Calling through a NULL function pointer is UB; fail loudly if an
         * unexpected opcode reaches this translator. */
        lsassert(tr_inst1 != NULL);
        IR2_OPND dest = load_freg128_from_ir1(opnd0);
        IR2_OPND src1 = load_freg128_from_ir1(opnd1);
        IR2_OPND src2 = load_freg128_from_ir1(opnd2);
        IR2_OPND zero = ra_alloc_ftemp();
        IR2_OPND mask = ra_alloc_ftemp();
        la_vxor_v(zero, zero, zero);
        tr_inst1(mask, src1, 0);
        if (ir1_opnd_is_mem(opnd0)) {
            /* Memory destination: merge selected elements over the current
             * memory contents, then store the full width back. */
            la_vbitsel_v(dest, dest, src2, mask);
            store_freg128_to_ir1_mem(dest, opnd0);
        } else {
            /* Register destination: non-selected elements become zero, and
             * the VEX.128 form also zeroes the upper 128 bits. */
            la_vbitsel_v(dest, zero, src2, mask);
            set_high128_xreg_to_zero(dest);
        }
    } else if (ir1_opnd_is_ymm(opnd1)) {
        switch (op) {
            case dt_X86_INS_VPMASKMOVD:
                tr_inst1 = la_xvslti_w;
                break;
            case dt_X86_INS_VPMASKMOVQ:
                tr_inst1 = la_xvslti_d;
                break;
            default:
                break;
        }
        lsassert(tr_inst1 != NULL);
        IR2_OPND dest = load_freg256_from_ir1(opnd0);
        IR2_OPND src1 = load_freg256_from_ir1(opnd1);
        IR2_OPND src2 = load_freg256_from_ir1(opnd2);
        IR2_OPND zero = ra_alloc_ftemp();
        IR2_OPND mask = ra_alloc_ftemp();
        la_xvxor_v(zero, zero, zero);
        tr_inst1(mask, src1, 0);
        if (ir1_opnd_is_mem(opnd0)) {
            la_xvbitsel_v(dest, dest, src2, mask);
            store_freg256_to_ir1_mem(dest, opnd0);
        } else {
            la_xvbitsel_v(dest, zero, src2, mask);
        }
    }
    return true;
}

bool translate_vmovmskps(IR1_INST * pir1) {
    /* VMOVMSKPS: gather the sign bit of each packed single-precision
     * element into the low bits of a general-purpose register. */
    IR1_OPND *dst_opnd = ir1_get_opnd(pir1, 0);
    IR1_OPND *src_opnd = ir1_get_opnd(pir1, 1);

    lsassert(ir1_opnd_is_gpr(dst_opnd));
    lsassert(ir1_opnd_is_xmm(src_opnd) || ir1_opnd_is_ymm(src_opnd));

    IR2_OPND gpr_dst = ra_alloc_gpr(ir1_opnd_base_reg_num(dst_opnd));
    IR2_OPND vec_src = ra_alloc_xmm(ir1_opnd_base_reg_num(src_opnd));

    if (ir1_opnd_is_xmm(src_opnd)) {
        /* 128-bit source: vmskltz_w packs the four sign bits; move the
         * whole doubleword into the GPR. */
        IR2_OPND msk = ra_alloc_ftemp();
        la_vmskltz_w(msk, vec_src);
        la_movfr2gr_d(gpr_dst, msk);
    } else {
        /* 256-bit source: each 128-bit half produces its own 4-bit mask.
         * The high half's result sits in 64-bit element 2, which only
         * xvpickve2gr can reach; splice it into bits 7:4. */
        IR2_OPND msk = ra_alloc_ftemp();
        IR2_OPND hi_bits = ra_alloc_itemp();

        la_xvmskltz_w(msk, vec_src);
        la_vpickve2gr_du(gpr_dst, msk, 0);
        la_xvpickve2gr_du(hi_bits, msk, 2);
        la_bstrins_d(gpr_dst, hi_bits, 7, 4);
    }
    return true;
}

bool translate_vmovmskpd(IR1_INST * pir1) {
    /* VMOVMSKPD: gather the sign bit of each packed double-precision
     * element into the low bits of a general-purpose register. */
    IR1_OPND *dst_opnd = ir1_get_opnd(pir1, 0);
    IR1_OPND *src_opnd = ir1_get_opnd(pir1, 1);

    lsassert(ir1_opnd_is_gpr(dst_opnd));
    lsassert(ir1_opnd_is_xmm(src_opnd) || ir1_opnd_is_ymm(src_opnd));

    IR2_OPND gpr_dst = ra_alloc_gpr(ir1_opnd_base_reg_num(dst_opnd));
    IR2_OPND vec_src = ra_alloc_xmm(ir1_opnd_base_reg_num(src_opnd));

    if (ir1_opnd_is_xmm(src_opnd)) {
        /* 128-bit source: vmskltz_d packs the two sign bits; move the
         * whole doubleword into the GPR. */
        IR2_OPND msk = ra_alloc_ftemp();
        la_vmskltz_d(msk, vec_src);
        la_movfr2gr_d(gpr_dst, msk);
    } else {
        /* 256-bit source: each 128-bit half produces a 2-bit mask.  The
         * high half's result sits in 64-bit element 2, reachable only via
         * xvpickve2gr; splice it into bits 3:2. */
        IR2_OPND msk = ra_alloc_ftemp();
        IR2_OPND hi_bits = ra_alloc_itemp();

        la_xvmskltz_d(msk, vec_src);
        la_vpickve2gr_du(gpr_dst, msk, 0);
        la_xvpickve2gr_du(hi_bits, msk, 2);
        la_bstrins_d(gpr_dst, hi_bits, 3, 2);
    }
    return true;
}

@specialpointcentral
Copy link
Contributor

Hello, thank you for your contribution to improving translation efficiency. However, I found a few issues in your submission:

In the translation function of vmovmskps/d, the instruction vpickve2gr cannot move the data from the third 32-bit element into the destination register; the xvpickve2gr instruction should be used instead.

In the translation function of vpackusxx, when the destination operand is memory, a mask of 0 does not clear the target location.

I think a more correct way of writing it would be as follows:

bool translate_vpmaskmovx(IR1_INST * pir1) {
    /* VPMASKMOVD / VPMASKMOVQ (AVX2 masked vector move).
     * opnd1 supplies the mask: an element participates iff its sign bit is
     * set.  For a register destination, non-selected elements are zeroed;
     * for a memory destination, non-selected elements are left unmodified
     * (implemented as load / merge via vbitsel / full-width store).
     * NOTE(review): the merge-store path does a full-width load and store,
     * so it does not reproduce hardware fault suppression on masked-out
     * elements -- confirm this is acceptable for the translator. */
    IR1_OPND * opnd0 = ir1_get_opnd(pir1, 0);
    IR1_OPND * opnd1 = ir1_get_opnd(pir1, 1);
    IR1_OPND * opnd2 = ir1_get_opnd(pir1, 2);
    IR2_INST * ( * tr_inst1)(IR2_OPND, IR2_OPND, int);
    tr_inst1 = NULL;
    IR1_OPCODE op = ir1_opcode(pir1);
    if (ir1_opnd_is_xmm(opnd1)) {
        switch (op) {
            case dt_X86_INS_VPMASKMOVD:
                tr_inst1 = la_vslti_w;   /* mask = (elem < 0), 32-bit lanes */
                break;
            case dt_X86_INS_VPMASKMOVQ:
                tr_inst1 = la_vslti_d;   /* mask = (elem < 0), 64-bit lanes */
                break;
            default:
                break;
        }
        /* Calling through a NULL function pointer is UB; fail loudly if an
         * unexpected opcode reaches this translator. */
        lsassert(tr_inst1 != NULL);
        IR2_OPND dest = load_freg128_from_ir1(opnd0);
        IR2_OPND src1 = load_freg128_from_ir1(opnd1);
        IR2_OPND src2 = load_freg128_from_ir1(opnd2);
        IR2_OPND zero = ra_alloc_ftemp();
        IR2_OPND mask = ra_alloc_ftemp();
        la_vxor_v(zero, zero, zero);
        tr_inst1(mask, src1, 0);
        if (ir1_opnd_is_mem(opnd0)) {
            /* Memory destination: merge selected elements over the current
             * memory contents, then store the full width back. */
            la_vbitsel_v(dest, dest, src2, mask);
            store_freg128_to_ir1_mem(dest, opnd0);
        } else {
            /* Register destination: non-selected elements become zero, and
             * the VEX.128 form also zeroes the upper 128 bits. */
            la_vbitsel_v(dest, zero, src2, mask);
            set_high128_xreg_to_zero(dest);
        }
    } else if (ir1_opnd_is_ymm(opnd1)) {
        switch (op) {
            case dt_X86_INS_VPMASKMOVD:
                tr_inst1 = la_xvslti_w;
                break;
            case dt_X86_INS_VPMASKMOVQ:
                tr_inst1 = la_xvslti_d;
                break;
            default:
                break;
        }
        lsassert(tr_inst1 != NULL);
        IR2_OPND dest = load_freg256_from_ir1(opnd0);
        IR2_OPND src1 = load_freg256_from_ir1(opnd1);
        IR2_OPND src2 = load_freg256_from_ir1(opnd2);
        IR2_OPND zero = ra_alloc_ftemp();
        IR2_OPND mask = ra_alloc_ftemp();
        la_xvxor_v(zero, zero, zero);
        tr_inst1(mask, src1, 0);
        if (ir1_opnd_is_mem(opnd0)) {
            la_xvbitsel_v(dest, dest, src2, mask);
            store_freg256_to_ir1_mem(dest, opnd0);
        } else {
            la_xvbitsel_v(dest, zero, src2, mask);
        }
    }
    return true;
}

bool translate_vmovmskps(IR1_INST * pir1) {
    /* VMOVMSKPS: gather the sign bit of each packed single-precision
     * element into the low bits of a general-purpose register. */
    IR1_OPND *dst_opnd = ir1_get_opnd(pir1, 0);
    IR1_OPND *src_opnd = ir1_get_opnd(pir1, 1);

    lsassert(ir1_opnd_is_gpr(dst_opnd));
    lsassert(ir1_opnd_is_xmm(src_opnd) || ir1_opnd_is_ymm(src_opnd));

    IR2_OPND gpr_dst = ra_alloc_gpr(ir1_opnd_base_reg_num(dst_opnd));
    IR2_OPND vec_src = ra_alloc_xmm(ir1_opnd_base_reg_num(src_opnd));

    if (ir1_opnd_is_xmm(src_opnd)) {
        /* 128-bit source: vmskltz_w packs the four sign bits; move the
         * whole doubleword into the GPR. */
        IR2_OPND msk = ra_alloc_ftemp();
        la_vmskltz_w(msk, vec_src);
        la_movfr2gr_d(gpr_dst, msk);
    } else {
        /* 256-bit source: each 128-bit half produces its own 4-bit mask.
         * The high half's result sits in 64-bit element 2, which only
         * xvpickve2gr can reach; splice it into bits 7:4. */
        IR2_OPND msk = ra_alloc_ftemp();
        IR2_OPND hi_bits = ra_alloc_itemp();

        la_xvmskltz_w(msk, vec_src);
        la_vpickve2gr_du(gpr_dst, msk, 0);
        la_xvpickve2gr_du(hi_bits, msk, 2);
        la_bstrins_d(gpr_dst, hi_bits, 7, 4);
    }
    return true;
}

bool translate_vmovmskpd(IR1_INST * pir1) {
    /* VMOVMSKPD: gather the sign bit of each packed double-precision
     * element into the low bits of a general-purpose register. */
    IR1_OPND *dst_opnd = ir1_get_opnd(pir1, 0);
    IR1_OPND *src_opnd = ir1_get_opnd(pir1, 1);

    lsassert(ir1_opnd_is_gpr(dst_opnd));
    lsassert(ir1_opnd_is_xmm(src_opnd) || ir1_opnd_is_ymm(src_opnd));

    IR2_OPND gpr_dst = ra_alloc_gpr(ir1_opnd_base_reg_num(dst_opnd));
    IR2_OPND vec_src = ra_alloc_xmm(ir1_opnd_base_reg_num(src_opnd));

    if (ir1_opnd_is_xmm(src_opnd)) {
        /* 128-bit source: vmskltz_d packs the two sign bits; move the
         * whole doubleword into the GPR. */
        IR2_OPND msk = ra_alloc_ftemp();
        la_vmskltz_d(msk, vec_src);
        la_movfr2gr_d(gpr_dst, msk);
    } else {
        /* 256-bit source: each 128-bit half produces a 2-bit mask.  The
         * high half's result sits in 64-bit element 2, reachable only via
         * xvpickve2gr; splice it into bits 3:2. */
        IR2_OPND msk = ra_alloc_ftemp();
        IR2_OPND hi_bits = ra_alloc_itemp();

        la_xvmskltz_d(msk, vec_src);
        la_vpickve2gr_du(gpr_dst, msk, 0);
        la_xvpickve2gr_du(hi_bits, msk, 2);
        la_bstrins_d(gpr_dst, hi_bits, 3, 2);
    }
    return true;
}

cc @rmjskhy

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants