Skip to content

Conversation

@phorcys
Copy link
Contributor

@phorcys phorcys commented Aug 15, 2025

MASKMOVDQU,VMASKMOVDQU
VPMASKMOVD/Q
MASKMOVQ
VMOVMSK{PD,PS}
VPACKUSWB
VPACKUSDW
PSHUF{LW,HW}
VPSHUF{LW,HW}
VPSHUFD
SHUFPS
VSHUFPS

@rmjskhy
Copy link

rmjskhy commented Aug 21, 2025

Hello, thank you for your contribution to improving translation efficiency.
However, I found a few issues in your submission:

In the translation function of vmovmskps/d, the instruction vpickve2gr cannot move the data from the third 32-bit element into the destination register; the xvpickve2gr instruction should be used instead.

In the translation function of vpackusxx, when the destination operand is memory, mask of 0 does not clear the target location.

I think a more correct way of writing it would be as follows:

bool translate_vpmaskmovx(IR1_INST * pir1) {
    /* VPMASKMOVD / VPMASKMOVQ (AVX2 masked vector move).
     * opnd1 supplies the mask: an element participates iff its sign bit is
     * set.  For a register destination, non-selected elements are zeroed;
     * for a memory destination, non-selected elements are left unmodified
     * (implemented as load / merge via vbitsel / full-width store).
     * NOTE(review): the merge-store path does a full-width load and store,
     * so it does not reproduce hardware fault suppression on masked-out
     * elements -- confirm this is acceptable for the translator. */
    IR1_OPND * opnd0 = ir1_get_opnd(pir1, 0);
    IR1_OPND * opnd1 = ir1_get_opnd(pir1, 1);
    IR1_OPND * opnd2 = ir1_get_opnd(pir1, 2);
    IR2_INST * ( * tr_inst1)(IR2_OPND, IR2_OPND, int);
    tr_inst1 = NULL;
    IR1_OPCODE op = ir1_opcode(pir1);
    if (ir1_opnd_is_xmm(opnd1)) {
        switch (op) {
            case dt_X86_INS_VPMASKMOVD:
                tr_inst1 = la_vslti_w;   /* mask = (elem < 0), 32-bit lanes */
                break;
            case dt_X86_INS_VPMASKMOVQ:
                tr_inst1 = la_vslti_d;   /* mask = (elem < 0), 64-bit lanes */
                break;
            default:
                break;
        }
        /* Calling through a NULL function pointer is UB; fail loudly if an
         * unexpected opcode reaches this translator. */
        lsassert(tr_inst1 != NULL);
        IR2_OPND dest = load_freg128_from_ir1(opnd0);
        IR2_OPND src1 = load_freg128_from_ir1(opnd1);
        IR2_OPND src2 = load_freg128_from_ir1(opnd2);
        IR2_OPND zero = ra_alloc_ftemp();
        IR2_OPND mask = ra_alloc_ftemp();
        la_vxor_v(zero, zero, zero);
        tr_inst1(mask, src1, 0);
        if (ir1_opnd_is_mem(opnd0)) {
            /* Memory destination: merge selected elements over the current
             * memory contents, then store the full width back. */
            la_vbitsel_v(dest, dest, src2, mask);
            store_freg128_to_ir1_mem(dest, opnd0);
        } else {
            /* Register destination: non-selected elements become zero, and
             * the VEX.128 form also zeroes the upper 128 bits. */
            la_vbitsel_v(dest, zero, src2, mask);
            set_high128_xreg_to_zero(dest);
        }
    } else if (ir1_opnd_is_ymm(opnd1)) {
        switch (op) {
            case dt_X86_INS_VPMASKMOVD:
                tr_inst1 = la_xvslti_w;
                break;
            case dt_X86_INS_VPMASKMOVQ:
                tr_inst1 = la_xvslti_d;
                break;
            default:
                break;
        }
        lsassert(tr_inst1 != NULL);
        IR2_OPND dest = load_freg256_from_ir1(opnd0);
        IR2_OPND src1 = load_freg256_from_ir1(opnd1);
        IR2_OPND src2 = load_freg256_from_ir1(opnd2);
        IR2_OPND zero = ra_alloc_ftemp();
        IR2_OPND mask = ra_alloc_ftemp();
        la_xvxor_v(zero, zero, zero);
        tr_inst1(mask, src1, 0);
        if (ir1_opnd_is_mem(opnd0)) {
            la_xvbitsel_v(dest, dest, src2, mask);
            store_freg256_to_ir1_mem(dest, opnd0);
        } else {
            la_xvbitsel_v(dest, zero, src2, mask);
        }
    }
    return true;
}

bool translate_vmovmskps(IR1_INST * pir1) {
    /* VMOVMSKPS: gather the sign bit of each packed single-precision
     * element into the low bits of a general-purpose register. */
    IR1_OPND *dst_opnd = ir1_get_opnd(pir1, 0);
    IR1_OPND *src_opnd = ir1_get_opnd(pir1, 1);

    lsassert(ir1_opnd_is_gpr(dst_opnd));
    lsassert(ir1_opnd_is_xmm(src_opnd) || ir1_opnd_is_ymm(src_opnd));

    IR2_OPND gpr_dst = ra_alloc_gpr(ir1_opnd_base_reg_num(dst_opnd));
    IR2_OPND vec_src = ra_alloc_xmm(ir1_opnd_base_reg_num(src_opnd));

    if (ir1_opnd_is_xmm(src_opnd)) {
        /* 128-bit source: vmskltz_w packs the four sign bits; move the
         * whole doubleword into the GPR. */
        IR2_OPND msk = ra_alloc_ftemp();
        la_vmskltz_w(msk, vec_src);
        la_movfr2gr_d(gpr_dst, msk);
    } else {
        /* 256-bit source: each 128-bit half produces its own 4-bit mask.
         * The high half's result sits in 64-bit element 2, which only
         * xvpickve2gr can reach; splice it into bits 7:4. */
        IR2_OPND msk = ra_alloc_ftemp();
        IR2_OPND hi_bits = ra_alloc_itemp();

        la_xvmskltz_w(msk, vec_src);
        la_vpickve2gr_du(gpr_dst, msk, 0);
        la_xvpickve2gr_du(hi_bits, msk, 2);
        la_bstrins_d(gpr_dst, hi_bits, 7, 4);
    }
    return true;
}

bool translate_vmovmskpd(IR1_INST * pir1) {
    /* VMOVMSKPD: gather the sign bit of each packed double-precision
     * element into the low bits of a general-purpose register. */
    IR1_OPND *dst_opnd = ir1_get_opnd(pir1, 0);
    IR1_OPND *src_opnd = ir1_get_opnd(pir1, 1);

    lsassert(ir1_opnd_is_gpr(dst_opnd));
    lsassert(ir1_opnd_is_xmm(src_opnd) || ir1_opnd_is_ymm(src_opnd));

    IR2_OPND gpr_dst = ra_alloc_gpr(ir1_opnd_base_reg_num(dst_opnd));
    IR2_OPND vec_src = ra_alloc_xmm(ir1_opnd_base_reg_num(src_opnd));

    if (ir1_opnd_is_xmm(src_opnd)) {
        /* 128-bit source: vmskltz_d packs the two sign bits; move the
         * whole doubleword into the GPR. */
        IR2_OPND msk = ra_alloc_ftemp();
        la_vmskltz_d(msk, vec_src);
        la_movfr2gr_d(gpr_dst, msk);
    } else {
        /* 256-bit source: each 128-bit half produces a 2-bit mask.  The
         * high half's result sits in 64-bit element 2, reachable only via
         * xvpickve2gr; splice it into bits 3:2. */
        IR2_OPND msk = ra_alloc_ftemp();
        IR2_OPND hi_bits = ra_alloc_itemp();

        la_xvmskltz_d(msk, vec_src);
        la_vpickve2gr_du(gpr_dst, msk, 0);
        la_xvpickve2gr_du(hi_bits, msk, 2);
        la_bstrins_d(gpr_dst, hi_bits, 3, 2);
    }
    return true;
}

@specialpointcentral
Copy link
Contributor

Hello, thank you for your contribution to improving translation efficiency. However, I found a few issues in your submission:

In the translation function of vmovmskps/d, the instruction vpickve2gr cannot move the data from the third 32-bit element into the destination register; the xvpickve2gr instruction should be used instead.

In the translation function of vpackusxx, when the destination operand is memory, a mask of 0 does not clear the target location.

I think a more correct way of writing it would be as follows:

bool translate_vpmaskmovx(IR1_INST * pir1) {
    /* VPMASKMOVD / VPMASKMOVQ (AVX2 masked vector move).
     * opnd1 supplies the mask: an element participates iff its sign bit is
     * set.  For a register destination, non-selected elements are zeroed;
     * for a memory destination, non-selected elements are left unmodified
     * (implemented as load / merge via vbitsel / full-width store).
     * NOTE(review): the merge-store path does a full-width load and store,
     * so it does not reproduce hardware fault suppression on masked-out
     * elements -- confirm this is acceptable for the translator. */
    IR1_OPND * opnd0 = ir1_get_opnd(pir1, 0);
    IR1_OPND * opnd1 = ir1_get_opnd(pir1, 1);
    IR1_OPND * opnd2 = ir1_get_opnd(pir1, 2);
    IR2_INST * ( * tr_inst1)(IR2_OPND, IR2_OPND, int);
    tr_inst1 = NULL;
    IR1_OPCODE op = ir1_opcode(pir1);
    if (ir1_opnd_is_xmm(opnd1)) {
        switch (op) {
            case dt_X86_INS_VPMASKMOVD:
                tr_inst1 = la_vslti_w;   /* mask = (elem < 0), 32-bit lanes */
                break;
            case dt_X86_INS_VPMASKMOVQ:
                tr_inst1 = la_vslti_d;   /* mask = (elem < 0), 64-bit lanes */
                break;
            default:
                break;
        }
        /* Calling through a NULL function pointer is UB; fail loudly if an
         * unexpected opcode reaches this translator. */
        lsassert(tr_inst1 != NULL);
        IR2_OPND dest = load_freg128_from_ir1(opnd0);
        IR2_OPND src1 = load_freg128_from_ir1(opnd1);
        IR2_OPND src2 = load_freg128_from_ir1(opnd2);
        IR2_OPND zero = ra_alloc_ftemp();
        IR2_OPND mask = ra_alloc_ftemp();
        la_vxor_v(zero, zero, zero);
        tr_inst1(mask, src1, 0);
        if (ir1_opnd_is_mem(opnd0)) {
            /* Memory destination: merge selected elements over the current
             * memory contents, then store the full width back. */
            la_vbitsel_v(dest, dest, src2, mask);
            store_freg128_to_ir1_mem(dest, opnd0);
        } else {
            /* Register destination: non-selected elements become zero, and
             * the VEX.128 form also zeroes the upper 128 bits. */
            la_vbitsel_v(dest, zero, src2, mask);
            set_high128_xreg_to_zero(dest);
        }
    } else if (ir1_opnd_is_ymm(opnd1)) {
        switch (op) {
            case dt_X86_INS_VPMASKMOVD:
                tr_inst1 = la_xvslti_w;
                break;
            case dt_X86_INS_VPMASKMOVQ:
                tr_inst1 = la_xvslti_d;
                break;
            default:
                break;
        }
        lsassert(tr_inst1 != NULL);
        IR2_OPND dest = load_freg256_from_ir1(opnd0);
        IR2_OPND src1 = load_freg256_from_ir1(opnd1);
        IR2_OPND src2 = load_freg256_from_ir1(opnd2);
        IR2_OPND zero = ra_alloc_ftemp();
        IR2_OPND mask = ra_alloc_ftemp();
        la_xvxor_v(zero, zero, zero);
        tr_inst1(mask, src1, 0);
        if (ir1_opnd_is_mem(opnd0)) {
            la_xvbitsel_v(dest, dest, src2, mask);
            store_freg256_to_ir1_mem(dest, opnd0);
        } else {
            la_xvbitsel_v(dest, zero, src2, mask);
        }
    }
    return true;
}

bool translate_vmovmskps(IR1_INST * pir1) {
    /* VMOVMSKPS: gather the sign bit of each packed single-precision
     * element into the low bits of a general-purpose register. */
    IR1_OPND *dst_opnd = ir1_get_opnd(pir1, 0);
    IR1_OPND *src_opnd = ir1_get_opnd(pir1, 1);

    lsassert(ir1_opnd_is_gpr(dst_opnd));
    lsassert(ir1_opnd_is_xmm(src_opnd) || ir1_opnd_is_ymm(src_opnd));

    IR2_OPND gpr_dst = ra_alloc_gpr(ir1_opnd_base_reg_num(dst_opnd));
    IR2_OPND vec_src = ra_alloc_xmm(ir1_opnd_base_reg_num(src_opnd));

    if (ir1_opnd_is_xmm(src_opnd)) {
        /* 128-bit source: vmskltz_w packs the four sign bits; move the
         * whole doubleword into the GPR. */
        IR2_OPND msk = ra_alloc_ftemp();
        la_vmskltz_w(msk, vec_src);
        la_movfr2gr_d(gpr_dst, msk);
    } else {
        /* 256-bit source: each 128-bit half produces its own 4-bit mask.
         * The high half's result sits in 64-bit element 2, which only
         * xvpickve2gr can reach; splice it into bits 7:4. */
        IR2_OPND msk = ra_alloc_ftemp();
        IR2_OPND hi_bits = ra_alloc_itemp();

        la_xvmskltz_w(msk, vec_src);
        la_vpickve2gr_du(gpr_dst, msk, 0);
        la_xvpickve2gr_du(hi_bits, msk, 2);
        la_bstrins_d(gpr_dst, hi_bits, 7, 4);
    }
    return true;
}

bool translate_vmovmskpd(IR1_INST * pir1) {
    /* VMOVMSKPD: gather the sign bit of each packed double-precision
     * element into the low bits of a general-purpose register. */
    IR1_OPND *dst_opnd = ir1_get_opnd(pir1, 0);
    IR1_OPND *src_opnd = ir1_get_opnd(pir1, 1);

    lsassert(ir1_opnd_is_gpr(dst_opnd));
    lsassert(ir1_opnd_is_xmm(src_opnd) || ir1_opnd_is_ymm(src_opnd));

    IR2_OPND gpr_dst = ra_alloc_gpr(ir1_opnd_base_reg_num(dst_opnd));
    IR2_OPND vec_src = ra_alloc_xmm(ir1_opnd_base_reg_num(src_opnd));

    if (ir1_opnd_is_xmm(src_opnd)) {
        /* 128-bit source: vmskltz_d packs the two sign bits; move the
         * whole doubleword into the GPR. */
        IR2_OPND msk = ra_alloc_ftemp();
        la_vmskltz_d(msk, vec_src);
        la_movfr2gr_d(gpr_dst, msk);
    } else {
        /* 256-bit source: each 128-bit half produces a 2-bit mask.  The
         * high half's result sits in 64-bit element 2, reachable only via
         * xvpickve2gr; splice it into bits 3:2. */
        IR2_OPND msk = ra_alloc_ftemp();
        IR2_OPND hi_bits = ra_alloc_itemp();

        la_xvmskltz_d(msk, vec_src);
        la_vpickve2gr_du(gpr_dst, msk, 0);
        la_xvpickve2gr_du(hi_bits, msk, 2);
        la_bstrins_d(gpr_dst, hi_bits, 3, 2);
    }
    return true;
}

cc @rmjskhy

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants