diff --git a/API_CHANGES b/API_CHANGES index 77c12f76..51cca049 100644 --- a/API_CHANGES +++ b/API_CHANGES @@ -1,5 +1,9 @@ This file is the short summary of the API changes: +10.06.2024 - Non-backward compatible + The sljit_emit_simd_op2() has a generic + second operand. + 20.03.2024 - Non-backward compatible The sljit_p type is renamed to sljit_up. diff --git a/sljit_src/sljitLir.c b/sljit_src/sljitLir.c index 2dca17cd..9b4e9bc2 100644 --- a/sljit_src/sljitLir.c +++ b/sljit_src/sljitLir.c @@ -1184,7 +1184,7 @@ static const char* fop2r_names[] = { }; static const char* simd_op2_names[] = { - "and", "or", "xor" + "and", "or", "xor", "shuffle" }; static const char* jump_names[] = { @@ -2953,37 +2953,43 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_sign(struct sljit_co } static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) + sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2, sljit_sw src2w) { #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD)); - CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK(0)) >= SLJIT_SIMD_OP2_AND && (type & SLJIT_SIMD_TYPE_MASK(0)) <= SLJIT_SIMD_OP2_XOR); + CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK2(0)) >= SLJIT_SIMD_OP2_AND && (type & SLJIT_SIMD_TYPE_MASK2(0)) <= SLJIT_SIMD_OP2_SHUFFLE); CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type)); CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) <= SLJIT_SIMD_GET_REG_SIZE(type)); + CHECK_ARGUMENT(SLJIT_SIMD_GET_OPCODE(type) != SLJIT_SIMD_OP2_SHUFFLE || (SLJIT_SIMD_GET_ELEM_SIZE(type) == 0 && !(type & SLJIT_SIMD_FLOAT))); + CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM2_SIZE(type) <= ((src2 & SLJIT_MEM) ? 
SLJIT_SIMD_GET_REG_SIZE(type) : 0)); CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(dst_freg, 0)); CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src1_freg, 0)); - CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src2_freg, 0)); + FUNCTION_FCHECK(src2, src2w, 0); #endif #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) if (SLJIT_UNLIKELY(!!compiler->verbose)) { if (type & SLJIT_SIMD_TEST) CHECK_RETURN_OK; - if (sljit_emit_simd_op2(compiler, type | SLJIT_SIMD_TEST, dst_freg, src1_freg, src2_freg) == SLJIT_ERR_UNSUPPORTED) { + if (sljit_emit_simd_op2(compiler, type | SLJIT_SIMD_TEST, dst_freg, src1_freg, src2, src2w) == SLJIT_ERR_UNSUPPORTED) { fprintf(compiler->verbose, " # simd_op2: unsupported form, no instructions are emitted\n"); CHECK_RETURN_OK; } - fprintf(compiler->verbose, " simd_%s.%d.%s%d ", + fprintf(compiler->verbose, " simd_%s.%d.%s%d", simd_op2_names[SLJIT_SIMD_GET_OPCODE(type) - 1], (8 << SLJIT_SIMD_GET_REG_SIZE(type)), (type & SLJIT_SIMD_FLOAT) ? "f" : "", (8 << SLJIT_SIMD_GET_ELEM_SIZE(type))); + if ((type & 0x3f000000) != SLJIT_SIMD_MEM_UNALIGNED) + fprintf(compiler->verbose, ".al%d", (8 << SLJIT_SIMD_GET_ELEM2_SIZE(type))); + + fprintf(compiler->verbose, " "); sljit_verbose_freg(compiler, dst_freg); fprintf(compiler->verbose, ", "); sljit_verbose_freg(compiler, src1_freg); fprintf(compiler->verbose, ", "); - sljit_verbose_freg(compiler, src2_freg); + sljit_verbose_fparam(compiler, src2, src2w); fprintf(compiler->verbose, "\n"); } #endif @@ -3483,15 +3489,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) + sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2, sljit_sw src2w) { CHECK_ERROR(); - CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2, src2w)); 
SLJIT_UNUSED_ARG(compiler); SLJIT_UNUSED_ARG(type); SLJIT_UNUSED_ARG(dst_freg); SLJIT_UNUSED_ARG(src1_freg); - SLJIT_UNUSED_ARG(src2_freg); + SLJIT_UNUSED_ARG(src2); + SLJIT_UNUSED_ARG(src2w); return SLJIT_ERR_UNSUPPORTED; } diff --git a/sljit_src/sljitLir.h b/sljit_src/sljitLir.h index 8b6fa69a..5991e374 100644 --- a/sljit_src/sljitLir.h +++ b/sljit_src/sljitLir.h @@ -1919,7 +1919,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler /* Element size is 256 bit long */ #define SLJIT_SIMD_ELEM_256 (5 << 18) -/* The following options are used by sljit_emit_simd_mov(). */ +/* The following options are used by sljit_emit_simd_mov() + and sljit_emit_simd_op2(). */ /* Memory address is unaligned (this is the default) */ #define SLJIT_SIMD_MEM_UNALIGNED (0 << 24) @@ -2096,6 +2097,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c #define SLJIT_SIMD_OP2_OR 0x000002 /* Binary 'xor' operation */ #define SLJIT_SIMD_OP2_XOR 0x000003 +/* Shuffle bytes of src1 using the indices in src2 */ +#define SLJIT_SIMD_OP2_SHUFFLE 0x000004 /* Perform simd operations using simd registers. @@ -2103,16 +2106,17 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c SLJIT_ERR_UNSUPPORTED. If SLJIT_SIMD_TEST is passed, it does not emit any instructions. 
- type must be a combination of SLJIT_SIMD_* and SLJIT_SIMD_OP2_ - options except SLJIT_SIMD_LOAD and SLJIT_SIMD_STORE + type must be a combination of SLJIT_SIMD_*, SLJIT_SIMD_MEM_* + and SLJIT_SIMD_OP2_* options except SLJIT_SIMD_LOAD + and SLJIT_SIMD_STORE dst_freg is the destination register of the operation src1_freg is the first source register of the operation - src1_freg is the second source register of the operation + src2 is the second source operand of the operation Flags: - (does not modify flags) */ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg); + sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2, sljit_sw src2w); /* The sljit_emit_atomic_load and sljit_emit_atomic_store operation pair can perform an atomic read-modify-write operation. First, an unsigned diff --git a/sljit_src/sljitNativeARM_32.c b/sljit_src/sljitNativeARM_32.c index 0b5928a5..642b1215 100644 --- a/sljit_src/sljitNativeARM_32.c +++ b/sljit_src/sljitNativeARM_32.c @@ -180,6 +180,7 @@ static const sljit_u8 freg_ebit_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) << 1) #define VST1_s 0xf4800000 #define VSTR_F32 0xed000a00 #define VSUB_F32 0xee300a40 +#define VTBL 0xf3b00800 #if (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7) /* Arm v7 specific instructions. 
*/ @@ -4468,14 +4469,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) + sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2, sljit_sw src2w) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); - sljit_ins ins = 0; + sljit_s32 alignment; + sljit_ins ins = 0, load_ins; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2, src2w)); + ADJUST_LOCAL_OFFSET(src2, src2w); if (reg_size != 3 && reg_size != 4) return SLJIT_ERR_UNSUPPORTED; @@ -4493,19 +4496,54 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *co case SLJIT_SIMD_OP2_XOR: ins = VEOR; break; + case SLJIT_SIMD_OP2_SHUFFLE: + ins = VTBL; + break; } if (type & SLJIT_SIMD_TEST) return SLJIT_SUCCESS; + if (src2 & SLJIT_MEM) { + if (elem_size > 3) + elem_size = 3; + + load_ins = VLD1 | (sljit_ins)((reg_size == 3) ? (0x7 << 8) : (0xa << 8)); + alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type); + + SLJIT_ASSERT(reg_size >= alignment); + + if (alignment == 3) + load_ins |= 0x10; + else if (alignment >= 4) + load_ins |= 0x20; + + FAIL_IF(sljit_emit_simd_mem_offset(compiler, &src2, src2w)); + FAIL_IF(push_inst(compiler, load_ins | VD(TMP_FREG2) | RN(src2) | ((sljit_ins)elem_size) << 6 | 0xf)); + src2 = TMP_FREG2; + } + if (reg_size == 4) { dst_freg = simd_get_quad_reg_index(dst_freg); src1_freg = simd_get_quad_reg_index(src1_freg); - src2_freg = simd_get_quad_reg_index(src2_freg); + src2 = simd_get_quad_reg_index(src2); + + if (SLJIT_SIMD_GET_OPCODE(type) == SLJIT_SIMD_OP2_SHUFFLE) { + ins |= (sljit_ins)1 << 8; + + FAIL_IF(push_inst(compiler, ins | VD(dst_freg != src1_freg ? 
dst_freg : TMP_FREG2) | VN(src1_freg) | VM(src2))); + src2 += SLJIT_QUAD_OTHER_HALF(src2); + FAIL_IF(push_inst(compiler, ins | VD(dst_freg + SLJIT_QUAD_OTHER_HALF(dst_freg)) | VN(src1_freg) | VM(src2))); + + if (dst_freg == src1_freg) + return push_inst(compiler, VORR | VD(dst_freg) | VN(TMP_FREG2) | VM(TMP_FREG2)); + return SLJIT_SUCCESS; + } + ins |= (sljit_ins)1 << 6; } - return push_inst(compiler, ins | VD(dst_freg) | VN(src1_freg) | VM(src2_freg)); + return push_inst(compiler, ins | VD(dst_freg) | VN(src1_freg) | VM(src2)); } #undef FPU_LOAD diff --git a/sljit_src/sljitNativeARM_64.c b/sljit_src/sljitNativeARM_64.c index 5331ebdf..c79cc742 100644 --- a/sljit_src/sljitNativeARM_64.c +++ b/sljit_src/sljitNativeARM_64.c @@ -171,6 +171,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = { #define SUBI 0xd1000000 #define SUBS 0xeb000000 #define TBZ 0x36000000 +#define TBL_v 0x0e000000 #define UBFM 0xd3400000 #define UCVTF 0x9e630000 #define UDIV 0x9ac00800 @@ -3224,14 +3225,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) + sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2, sljit_sw src2w) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); sljit_ins ins = 0; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2, src2w)); + ADJUST_LOCAL_OFFSET(src2, src2w); if (reg_size != 3 && reg_size != 4) return SLJIT_ERR_UNSUPPORTED; @@ -3249,15 +3251,27 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *co case SLJIT_SIMD_OP2_XOR: ins = EOR_v; break; + case SLJIT_SIMD_OP2_SHUFFLE: + ins = TBL_v; + break; } if (type & SLJIT_SIMD_TEST) return 
SLJIT_SUCCESS; + if (src2 & SLJIT_MEM) { + if (elem_size > 3) + elem_size = 3; + + FAIL_IF(sljit_emit_simd_mem_offset(compiler, &src2, src2w)); + FAIL_IF(push_inst(compiler, LD1 | (reg_size == 4 ? (1 << 30) : 0) | ((sljit_ins)elem_size << 10) | RN(src2) | VT(TMP_FREG1))); + src2 = TMP_FREG1; + } + if (reg_size == 4) ins |= (sljit_ins)1 << 30; - return push_inst(compiler, ins | VD(dst_freg) | VN(src1_freg) | VM(src2_freg)); + return push_inst(compiler, ins | VD(dst_freg) | VN(src1_freg) | VM(src2)); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op, diff --git a/sljit_src/sljitNativeARM_T2_32.c b/sljit_src/sljitNativeARM_T2_32.c index 799954a8..92279ec8 100644 --- a/sljit_src/sljitNativeARM_T2_32.c +++ b/sljit_src/sljitNativeARM_T2_32.c @@ -253,6 +253,7 @@ static const sljit_u8 freg_ebit_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) << 1) #define VST1_s 0xf9800000 #define VSTR_F32 0xed000a00 #define VSUB_F32 0xee300a40 +#define VTBL 0xffb00800 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) @@ -4143,14 +4144,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) + sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2, sljit_sw src2w) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); - sljit_ins ins = 0; + sljit_s32 alignment; + sljit_ins ins = 0, load_ins; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2, src2w)); + ADJUST_LOCAL_OFFSET(src2, src2w); if (reg_size != 3 && reg_size != 4) return SLJIT_ERR_UNSUPPORTED; @@ -4168,19 +4171,54 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *co case 
SLJIT_SIMD_OP2_XOR: ins = VEOR; break; + case SLJIT_SIMD_OP2_SHUFFLE: + ins = VTBL; + break; } if (type & SLJIT_SIMD_TEST) return SLJIT_SUCCESS; + if (src2 & SLJIT_MEM) { + if (elem_size > 3) + elem_size = 3; + + load_ins = VLD1 | (sljit_ins)((reg_size == 3) ? (0x7 << 8) : (0xa << 8)); + alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type); + + SLJIT_ASSERT(reg_size >= alignment); + + if (alignment == 3) + load_ins |= 0x10; + else if (alignment >= 4) + load_ins |= 0x20; + + FAIL_IF(sljit_emit_simd_mem_offset(compiler, &src2, src2w)); + FAIL_IF(push_inst32(compiler, load_ins | VD4(TMP_FREG2) | RN4(src2) | ((sljit_ins)elem_size) << 6 | 0xf)); + src2 = TMP_FREG2; + } + if (reg_size == 4) { dst_freg = simd_get_quad_reg_index(dst_freg); src1_freg = simd_get_quad_reg_index(src1_freg); - src2_freg = simd_get_quad_reg_index(src2_freg); + src2 = simd_get_quad_reg_index(src2); + + if (SLJIT_SIMD_GET_OPCODE(type) == SLJIT_SIMD_OP2_SHUFFLE) { + ins |= (sljit_ins)1 << 8; + + FAIL_IF(push_inst32(compiler, ins | VD4(dst_freg != src1_freg ? 
dst_freg : TMP_FREG2) | VN4(src1_freg) | VM4(src2))); + src2 += SLJIT_QUAD_OTHER_HALF(src2); + FAIL_IF(push_inst32(compiler, ins | VD4(dst_freg + SLJIT_QUAD_OTHER_HALF(dst_freg)) | VN4(src1_freg) | VM4(src2))); + + if (dst_freg == src1_freg) + return push_inst32(compiler, VORR | VD4(dst_freg) | VN4(TMP_FREG2) | VM4(TMP_FREG2)); + return SLJIT_SUCCESS; + } + ins |= (sljit_ins)1 << 6; } - return push_inst32(compiler, ins | VD4(dst_freg) | VN4(src1_freg) | VM4(src2_freg)); + return push_inst32(compiler, ins | VD4(dst_freg) | VN4(src1_freg) | VM4(src2)); } #undef FPU_LOAD diff --git a/sljit_src/sljitNativeX86_common.c b/sljit_src/sljitNativeX86_common.c index ecb7e9be..ebc49137 100644 --- a/sljit_src/sljitNativeX86_common.c +++ b/sljit_src/sljitNativeX86_common.c @@ -239,6 +239,7 @@ static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = { #define MOVDDUP_x_xm 0x12 #define MOVDQA_x_xm 0x6f #define MOVDQA_xm_x 0x7f +#define MOVDQU_x_xm 0x6f #define MOVHLPS_x_x 0x12 #define MOVHPD_m_x 0x17 #define MOVHPD_x_m 0x16 @@ -4689,14 +4690,17 @@ static sljit_s32 emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, - sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) + sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2, sljit_sw src2w) { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_s32 use_vex = (cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX); sljit_uw op = 0; + sljit_uw mov_op = 0; CHECK_ERROR(); - CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2, src2w)); + ADJUST_LOCAL_OFFSET(src2, src2w); #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) compiler->mode32 = 1; @@ -4730,27 +4734,52 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 
sljit_emit_simd_op2(struct sljit_compiler *co if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3) op |= EX86_PREF_66; break; + + case SLJIT_SIMD_OP2_SHUFFLE: + if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + op = PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38; + break; } if (type & SLJIT_SIMD_TEST) return SLJIT_SUCCESS; - if (reg_size == 5 || ((cpu_feature_list & CPU_FEATURE_AVX) && (compiler->options & SLJIT_ENTER_USE_VEX))) { + if ((src2 & SLJIT_MEM) && SLJIT_SIMD_GET_ELEM2_SIZE(type) < reg_size) { + mov_op = ((type & SLJIT_SIMD_FLOAT) ? (MOVUPS_x_xm | (elem_size == 3 ? EX86_PREF_66 : 0)) : (MOVDQU_x_xm | EX86_PREF_F3)) | EX86_SSE2; + if (use_vex) + FAIL_IF(emit_vex_instruction(compiler, mov_op, TMP_FREG, 0, src2, src2w)); + else + FAIL_IF(emit_groupf(compiler, mov_op, TMP_FREG, src2, src2w)); + + src2 = TMP_FREG; + src2w = 0; + } + + if (reg_size == 5 || use_vex) { if (reg_size == 5) op |= VEX_256; - return emit_vex_instruction(compiler, op | EX86_SSE2 | VEX_SSE2_OPV, dst_freg, src1_freg, src2_freg, 0); + return emit_vex_instruction(compiler, op | EX86_SSE2 | VEX_SSE2_OPV, dst_freg, src1_freg, src2, src2w); } if (dst_freg != src1_freg) { - if (dst_freg == src2_freg) - src2_freg = src1_freg; - else + if (dst_freg == src2) { + if (SLJIT_SIMD_GET_OPCODE(type) == SLJIT_SIMD_OP2_SHUFFLE) { + FAIL_IF(emit_simd_mov(compiler, type, TMP_FREG, src2)); + FAIL_IF(emit_simd_mov(compiler, type, dst_freg, src1_freg)); + src2 = TMP_FREG; + src2w = 0; + } else + src2 = src1_freg; + } else FAIL_IF(emit_simd_mov(compiler, type, dst_freg, src1_freg)); } - FAIL_IF(emit_groupf(compiler, op | EX86_SSE2, dst_freg, src2_freg, 0)); - return SLJIT_SUCCESS; + if (op & (VEX_OP_0F38 | VEX_OP_0F3A)) + return emit_groupf_ext(compiler, op | EX86_SSE2, dst_freg, src2, src2w); + return emit_groupf(compiler, op | EX86_SSE2, dst_freg, src2, src2w); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op, diff --git a/test_src/sljitTest.c 
b/test_src/sljitTest.c index b7c6f527..e250112f 100644 --- a/test_src/sljitTest.c +++ b/test_src/sljitTest.c @@ -8822,10 +8822,11 @@ int sljit_test(int argc, char* argv[]) test_simd7(); test_simd8(); test_simd9(); + test_simd10(); } else { if (verbose) printf("no simd available, simd tests are skipped\n"); - successful_tests += 9; + successful_tests += 10; } if (verbose) @@ -8839,7 +8840,7 @@ int sljit_test(int argc, char* argv[]) sljit_free_unused_memory_exec(); #endif -# define TEST_COUNT 121 +# define TEST_COUNT 122 printf("SLJIT tests: "); if (successful_tests == TEST_COUNT) diff --git a/test_src/sljitTestSimd.h b/test_src/sljitTestSimd.h index 12834cbd..4228b81f 100644 --- a/test_src/sljitTestSimd.h +++ b/test_src/sljitTestSimd.h @@ -2273,85 +2273,85 @@ static void test_simd8(void) type = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_8; sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 0); sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR2, SLJIT_MEM1(SLJIT_S0), 32); - sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR2); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR2, 0); /* buf[64] */ sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 64); type = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_FLOAT; sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 0); sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR2, SLJIT_MEM1(SLJIT_S0), 32); - sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_OR | type, SLJIT_FR2, SLJIT_FR0, SLJIT_FR2); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_OR | type, SLJIT_FR2, SLJIT_FR0, SLJIT_FR2, 0); /* buf[80] */ sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, SLJIT_FR2, SLJIT_MEM1(SLJIT_S0), 80); type = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_16; sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, fs0, SLJIT_MEM1(SLJIT_S0), 
0); sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR2, SLJIT_MEM1(SLJIT_S0), 32); - sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_XOR | type, SLJIT_FR4, fs0, SLJIT_FR2); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_XOR | type, SLJIT_FR4, fs0, SLJIT_FR2, 0); /* buf[96] */ sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, SLJIT_FR4, SLJIT_MEM1(SLJIT_S0), 96); type = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_64 | SLJIT_SIMD_FLOAT; sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 32); sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR2, SLJIT_MEM1(SLJIT_S0), 0); - sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | type, SLJIT_FR1, SLJIT_FR2, SLJIT_FR0); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | type, SLJIT_FR1, SLJIT_FR2, SLJIT_FR0, 0); /* buf[112] */ sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, SLJIT_FR1, SLJIT_MEM1(SLJIT_S0), 112); type = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_128; sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 0); sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, fs0, SLJIT_MEM1(SLJIT_S0), 32); - sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_OR | type, fs0, SLJIT_FR0, fs0); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_OR | type, fs0, SLJIT_FR0, fs0, 0); /* buf[128] */ sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, fs0, SLJIT_MEM1(SLJIT_S0), 128); type = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_FLOAT; sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR4, SLJIT_MEM1(SLJIT_S0), 0); sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 32); - sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_XOR | type, SLJIT_FR2, SLJIT_FR4, SLJIT_FR0); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_XOR | type, SLJIT_FR2, SLJIT_FR4, SLJIT_FR0, 0); /* buf[144] */ sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, SLJIT_FR2, SLJIT_MEM1(SLJIT_S0), 144); type = SLJIT_SIMD_REG_64 | 
SLJIT_SIMD_ELEM_32; sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 0); sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR4, SLJIT_MEM1(SLJIT_S0), 32); - sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | type, SLJIT_FR4, SLJIT_FR0, SLJIT_FR4); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | type, SLJIT_FR4, SLJIT_FR0, SLJIT_FR4, 0); /* buf[160] */ sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, SLJIT_FR4, SLJIT_MEM1(SLJIT_S0), 160); type = SLJIT_SIMD_REG_64 | SLJIT_SIMD_ELEM_64 | SLJIT_SIMD_FLOAT; sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR2, SLJIT_MEM1(SLJIT_S0), 0); sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, fs0, SLJIT_MEM1(SLJIT_S0), 32); - sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_OR | type, SLJIT_FR0, SLJIT_FR2, fs0); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_OR | type, SLJIT_FR0, SLJIT_FR2, fs0, 0); /* buf[168] */ sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 168); type = SLJIT_SIMD_REG_64 | SLJIT_SIMD_ELEM_64; sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR2, SLJIT_MEM1(SLJIT_S0), 0); sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 32); - sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_XOR | type, fs0, SLJIT_FR0, SLJIT_FR2); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_XOR | type, fs0, SLJIT_FR0, SLJIT_FR2, 0); /* buf[176] */ sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, fs0, SLJIT_MEM1(SLJIT_S0), 176); type = SLJIT_SIMD_REG_256 | SLJIT_SIMD_ELEM_8; - supported[0] = sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | type | SLJIT_SIMD_TEST, SLJIT_FR0, SLJIT_FR0, SLJIT_FR2) != SLJIT_ERR_UNSUPPORTED; + supported[0] = sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | type | SLJIT_SIMD_TEST, SLJIT_FR0, SLJIT_FR0, SLJIT_FR2, 0) != SLJIT_ERR_UNSUPPORTED; sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 0); 
sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR2, SLJIT_MEM1(SLJIT_S0), 32); - sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR2); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR2, 0); /* buf[192] */ sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 192); type = SLJIT_SIMD_REG_256 | SLJIT_SIMD_ELEM_256; sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 0); sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR2, SLJIT_MEM1(SLJIT_S0), 32); - sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_OR | type, fs0, SLJIT_FR0, SLJIT_FR2); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_OR | type, fs0, SLJIT_FR0, SLJIT_FR2, 0); /* buf[224] */ sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, fs0, SLJIT_MEM1(SLJIT_S0), 224); type = SLJIT_SIMD_REG_256 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_FLOAT; sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR1, SLJIT_MEM1(SLJIT_S0), 0); sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR3, SLJIT_MEM1(SLJIT_S0), 32); - sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_XOR | type, SLJIT_FR3, SLJIT_FR1, SLJIT_FR3); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_XOR | type, SLJIT_FR3, SLJIT_FR1, SLJIT_FR3, 0); /* buf[256] */ sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, SLJIT_FR3, SLJIT_MEM1(SLJIT_S0), 256); @@ -2512,5 +2512,114 @@ static void test_simd9(void) successful_tests++; } +static void test_simd10(void) +{ + /* Test simd binary logical operation with memory operands. */ + executable_code code; + struct sljit_compiler* compiler; + sljit_s32 options = 0; + sljit_s32 i, type; + sljit_u8* buf; + sljit_u8 data[63 + 288]; + sljit_s32 fs0 = SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS > 0 ? SLJIT_FS1 : SLJIT_FR5; + + if (verbose) + printf("Run test_simd10\n"); + + SIMD_RUN_START + + /* Buffer is 64 byte aligned. 
*/ + buf = (sljit_u8*)(((sljit_sw)data + (sljit_sw)63) & ~(sljit_sw)63); + + for (i = 0; i < 288; i++) + buf[i] = 0xaa; + + init_simd_u32(buf, 32, 0x00ff00ff); + init_simd_u32(buf + 32, 32, 0x0000ffff); + init_simd_u32(buf + 64, 32, LITTLE_BIG(0x04050607, 0x07060504)); + *(sljit_u32*)(buf + 96 + 4) = LITTLE_BIG(0x12345678, 0x78563412); + init_simd_u32(buf + 128, 32, LITTLE_BIG(0x080b090a, 0x0a090b08)); + *(sljit_u32*)(buf + 160 + 8) = LITTLE_BIG(0x11223344, 0x44332211); + + compiler = sljit_create_compiler(NULL); + FAILED(!compiler, "cannot create compiler\n"); + + sljit_emit_enter(compiler, options, SLJIT_ARGS3V(P, P, P), 4, 4, 6, SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS > 0 ? 2 : 0, 32); + + type = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_8 | SLJIT_SIMD_MEM_ALIGNED_64; + sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 0); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | type, SLJIT_FR0, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 32); + /* buf[192] */ + sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 192); + + type = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_32 | SLJIT_SIMD_MEM_ALIGNED_128; + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, SLJIT_IMM, 32 >> 2); + sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR2, SLJIT_MEM1(SLJIT_S0), 0); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_XOR | type, SLJIT_FR4, SLJIT_FR2, SLJIT_MEM2(SLJIT_S0, SLJIT_R1), 2); + /* buf[208] */ + sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, SLJIT_FR4, SLJIT_MEM1(SLJIT_S0), 208); + + type = SLJIT_SIMD_REG_128 | SLJIT_SIMD_FLOAT | SLJIT_SIMD_ELEM_64; + sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 0); + sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, SLJIT_FR0, SLJIT_MEM1(SLJIT_SP), 16); + sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR3, SLJIT_MEM1(SLJIT_S0), 32); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_OR | type, fs0, SLJIT_FR3, 
SLJIT_MEM1(SLJIT_SP), 16); + /* buf[224] */ + sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, fs0, SLJIT_MEM1(SLJIT_S0), 224); + + type = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_8; + sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR1, SLJIT_MEM1(SLJIT_S0), 64); + sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR2, SLJIT_MEM1(SLJIT_S0), 96); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_SHUFFLE | type, SLJIT_FR1, SLJIT_FR2, SLJIT_FR1, 0); + /* buf[240] */ + sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, SLJIT_FR1, SLJIT_MEM1(SLJIT_S0), 240); + + type = SLJIT_SIMD_REG_128 | SLJIT_SIMD_ELEM_8 | SLJIT_SIMD_MEM_ALIGNED_128; + sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR4, SLJIT_MEM1(SLJIT_S0), 160); + sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R2, 0, SLJIT_S0, 0, SLJIT_IMM, WCONST(0xcba2ba2ba2ba2ba2, 0xcba2ba2b) + 128); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_SHUFFLE | type, SLJIT_FR4, SLJIT_FR4, SLJIT_MEM1(SLJIT_R2), -WCONST(0xcba2ba2ba2ba2ba2, 0xcba2ba2b)); + /* buf[256] */ + sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, SLJIT_FR4, SLJIT_MEM1(SLJIT_S0), 256); + +#if IS_ARM + type = SLJIT_SIMD_REG_64 | SLJIT_SIMD_ELEM_8; + sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, SLJIT_FR2, SLJIT_MEM1(SLJIT_S0), 96); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_SHUFFLE | type, SLJIT_FR2, SLJIT_FR2, SLJIT_MEM1(SLJIT_S0), 64); + /* buf[272] */ + sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, SLJIT_FR2, SLJIT_MEM1(SLJIT_S0), 272); + + type = SLJIT_SIMD_REG_64 | SLJIT_SIMD_ELEM_8 | SLJIT_SIMD_MEM_ALIGNED_64; + sljit_emit_simd_mov(compiler, SLJIT_SIMD_LOAD | type, fs0, SLJIT_MEM1(SLJIT_S0), 96); + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, 64); + sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_SHUFFLE | type, SLJIT_FR0, fs0, SLJIT_MEM2(SLJIT_S0, SLJIT_R0), 0); + /* buf[280] */ + sljit_emit_simd_mov(compiler, SLJIT_SIMD_STORE | type, SLJIT_FR0, SLJIT_MEM1(SLJIT_S0), 280); +#endif /* 
IS_ARM */ + + sljit_emit_return_void(compiler); + + code.code = sljit_generate_code(compiler, 0, NULL); + CHECK(compiler); + sljit_free_compiler(compiler); + + code.func1((sljit_sw)buf); + sljit_free_code(code.code, NULL); + + FAILED(!check_simd_u32(buf + 192, 16, 0x000000ff), "test_simd10 case 1 failed\n"); + FAILED(!check_simd_u32(buf + 208, 16, 0x00ffff00), "test_simd10 case 2 failed\n"); + FAILED(!check_simd_u32(buf + 224, 16, 0x00ffffff), "test_simd10 case 3 failed\n"); + FAILED(!check_simd_u32(buf + 240, 16, LITTLE_BIG(0x78563412, 0x12345678)), "test_simd10 case 4 failed\n"); + FAILED(!check_simd_u32(buf + 256, 16, LITTLE_BIG(0x44113322, 0x22331144)), "test_simd10 case 5 failed\n"); + +#if IS_ARM + FAILED(!check_simd_u32(buf + 272, 8, LITTLE_BIG(0x78563412, 0x12345678)), "test_simd10 case 6 failed\n"); + FAILED(!check_simd_u32(buf + 280, 8, LITTLE_BIG(0x78563412, 0x12345678)), "test_simd10 case 7 failed\n"); +#endif /* IS_ARM */ + + SIMD_RUN_END + + successful_tests++; +} + #undef SIMD_RUN_START #undef SIMD_RUN_END