Skip to content

Commit

Permalink
Support simd shuffle operation
Browse files Browse the repository at this point in the history
  • Loading branch information
Zoltan Herczeg committed Jun 11, 2024
1 parent 1e05797 commit b3f4929
Show file tree
Hide file tree
Showing 9 changed files with 296 additions and 52 deletions.
4 changes: 4 additions & 0 deletions API_CHANGES
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
This file is the short summary of the API changes:

10.06.2024 - Non-backward compatible
The second source operand of sljit_emit_simd_op2()
is now a generic operand (src2, src2w).

20.03.2024 - Non-backward compatible
The sljit_p type is renamed to sljit_up.

Expand Down
27 changes: 17 additions & 10 deletions sljit_src/sljitLir.c
Original file line number Diff line number Diff line change
Expand Up @@ -1184,7 +1184,7 @@ static const char* fop2r_names[] = {
};

/* Printable names of the simd_op2 operations. The verbose output of
   check_sljit_emit_simd_op2() indexes this table with
   SLJIT_SIMD_GET_OPCODE(type) - 1. */
static const char* simd_op2_names[] = {
"and", "or", "xor"
"and", "or", "xor", "shuffle"
};

static const char* jump_names[] = {
Expand Down Expand Up @@ -2953,37 +2953,43 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_sign(struct sljit_co
}

/* Argument checker and verbose printer for sljit_emit_simd_op2().
   When SLJIT_ARGUMENT_CHECKS is enabled it validates the operation type,
   the register / element sizes and the three operands. When SLJIT_VERBOSE
   is enabled and compiler->verbose is set, it prints the operation. */
static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg)
sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2, sljit_sw src2w)
{
#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD));
CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK(0)) >= SLJIT_SIMD_OP2_AND && (type & SLJIT_SIMD_TYPE_MASK(0)) <= SLJIT_SIMD_OP2_XOR);
CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK2(0)) >= SLJIT_SIMD_OP2_AND && (type & SLJIT_SIMD_TYPE_MASK2(0)) <= SLJIT_SIMD_OP2_SHUFFLE);
CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type));
CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) <= SLJIT_SIMD_GET_REG_SIZE(type));
/* Shuffle is only accepted with element size 0 and without SLJIT_SIMD_FLOAT. */
CHECK_ARGUMENT(SLJIT_SIMD_GET_OPCODE(type) != SLJIT_SIMD_OP2_SHUFFLE || (SLJIT_SIMD_GET_ELEM_SIZE(type) == 0 && !(type & SLJIT_SIMD_FLOAT)));
/* The conditional must be parenthesized: '<=' binds tighter than '?:',
   so without the parentheses the comparison becomes the condition of
   the ternary instead of being compared against its result.
   The ELEM2 size (the memory alignment) may be at most the register
   size for a memory operand and must be zero for a register operand. */
CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM2_SIZE(type) <= ((src2 & SLJIT_MEM) ? SLJIT_SIMD_GET_REG_SIZE(type) : 0));
CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(dst_freg, 0));
CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src1_freg, 0));
CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src2_freg, 0));
FUNCTION_FCHECK(src2, src2w, 0);
#endif
#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
if (SLJIT_UNLIKELY(!!compiler->verbose)) {
/* Nothing is printed for a pure capability test. */
if (type & SLJIT_SIMD_TEST)
CHECK_RETURN_OK;
/* Probe the back end with SLJIT_SIMD_TEST so an unsupported form
   is reported instead of being printed as an emitted operation. */
if (sljit_emit_simd_op2(compiler, type | SLJIT_SIMD_TEST, dst_freg, src1_freg, src2_freg) == SLJIT_ERR_UNSUPPORTED) {
if (sljit_emit_simd_op2(compiler, type | SLJIT_SIMD_TEST, dst_freg, src1_freg, src2, src2w) == SLJIT_ERR_UNSUPPORTED) {
fprintf(compiler->verbose, " # simd_op2: unsupported form, no instructions are emitted\n");
CHECK_RETURN_OK;
}

/* Printed form: simd_<op>.<register bits>.[f]<element bits> */
fprintf(compiler->verbose, " simd_%s.%d.%s%d ",
fprintf(compiler->verbose, " simd_%s.%d.%s%d",
simd_op2_names[SLJIT_SIMD_GET_OPCODE(type) - 1],
(8 << SLJIT_SIMD_GET_REG_SIZE(type)),
(type & SLJIT_SIMD_FLOAT) ? "f" : "",
(8 << SLJIT_SIMD_GET_ELEM_SIZE(type)));

/* The ".al<bits>" suffix is appended only when the bits selected by
   0x3f000000 differ from SLJIT_SIMD_MEM_UNALIGNED. */
if ((type & 0x3f000000) != SLJIT_SIMD_MEM_UNALIGNED)
fprintf(compiler->verbose, ".al%d", (8 << SLJIT_SIMD_GET_ELEM2_SIZE(type)));

fprintf(compiler->verbose, " ");
sljit_verbose_freg(compiler, dst_freg);
fprintf(compiler->verbose, ", ");
sljit_verbose_freg(compiler, src1_freg);
fprintf(compiler->verbose, ", ");
sljit_verbose_freg(compiler, src2_freg);
sljit_verbose_fparam(compiler, src2, src2w);
fprintf(compiler->verbose, "\n");
}
#endif
Expand Down Expand Up @@ -3483,15 +3489,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c
}

/* Fallback implementation of sljit_emit_simd_op2(): it runs the error
   and argument checks, emits nothing and always returns
   SLJIT_ERR_UNSUPPORTED.
   NOTE(review): presumably compiled only for targets without a native
   simd back end - confirm against the surrounding #if. */
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg)
sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2, sljit_sw src2w)
{
CHECK_ERROR();
CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg));
CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2, src2w));
/* Silence unused-parameter warnings. */
SLJIT_UNUSED_ARG(compiler);
SLJIT_UNUSED_ARG(type);
SLJIT_UNUSED_ARG(dst_freg);
SLJIT_UNUSED_ARG(src1_freg);
SLJIT_UNUSED_ARG(src2_freg);
SLJIT_UNUSED_ARG(src2);
SLJIT_UNUSED_ARG(src2w);

return SLJIT_ERR_UNSUPPORTED;
}
Expand Down
14 changes: 9 additions & 5 deletions sljit_src/sljitLir.h
Original file line number Diff line number Diff line change
Expand Up @@ -1919,7 +1919,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler
/* Element size is 256 bit long */
#define SLJIT_SIMD_ELEM_256 (5 << 18)

/* The following options are used by sljit_emit_simd_mov(). */
/* The following options are used by sljit_emit_simd_mov()
and sljit_emit_simd_op2(). */

/* Memory address is unaligned (this is the default) */
#define SLJIT_SIMD_MEM_UNALIGNED (0 << 24)
Expand Down Expand Up @@ -2096,23 +2097,26 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c
#define SLJIT_SIMD_OP2_OR 0x000002
/* Binary 'xor' operation */
#define SLJIT_SIMD_OP2_XOR 0x000003
/* Shuffle bytes of src1 using the indices in src2 */
#define SLJIT_SIMD_OP2_SHUFFLE 0x000004

/* Perform simd operations using simd registers.
If the operation is not supported, it returns with
SLJIT_ERR_UNSUPPORTED. If SLJIT_SIMD_TEST is passed,
it does not emit any instructions.
type must be a combination of SLJIT_SIMD_* and SLJIT_SIMD_OP2_
options except SLJIT_SIMD_LOAD and SLJIT_SIMD_STORE
type must be a combination of SLJIT_SIMD_*, SLJIT_SIMD_MEM_*
and SLJIT_SIMD_OP2_* options except SLJIT_SIMD_LOAD
and SLJIT_SIMD_STORE
dst_freg is the destination register of the operation
src1_freg is the first source register of the operation
src1_freg is the second source register of the operation
src2 is the second source operand of the operation
Flags: - (does not modify flags) */

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg);
sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2, sljit_sw src2w);

/* The sljit_emit_atomic_load and sljit_emit_atomic_store operation pair
can perform an atomic read-modify-write operation. First, an unsigned
Expand Down
48 changes: 43 additions & 5 deletions sljit_src/sljitNativeARM_32.c
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ static const sljit_u8 freg_ebit_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) << 1)
#define VST1_s 0xf4800000
#define VSTR_F32 0xed000a00
#define VSUB_F32 0xee300a40
#define VTBL 0xf3b00800

#if (defined SLJIT_CONFIG_ARM_V7 && SLJIT_CONFIG_ARM_V7)
/* Arm v7 specific instructions. */
Expand Down Expand Up @@ -4468,14 +4469,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c
}

/* ARM (32 bit) implementation of sljit_emit_simd_op2().
   Emits the and / or / xor / shuffle operation selected by type on
   dst_freg, src1_freg and src2. Only register sizes 3 and 4 are
   accepted; other sizes return SLJIT_ERR_UNSUPPORTED. A memory src2
   is first loaded into TMP_FREG2. */
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg)
sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2, sljit_sw src2w)
{
sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
sljit_ins ins = 0;
sljit_s32 alignment;
sljit_ins ins = 0, load_ins;

CHECK_ERROR();
CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg));
CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2, src2w));
ADJUST_LOCAL_OFFSET(src2, src2w);

if (reg_size != 3 && reg_size != 4)
return SLJIT_ERR_UNSUPPORTED;
Expand All @@ -4493,19 +4496,54 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *co
case SLJIT_SIMD_OP2_XOR:
ins = VEOR;
break;
case SLJIT_SIMD_OP2_SHUFFLE:
ins = VTBL;
break;
}

/* A capability test succeeds here without emitting anything. */
if (type & SLJIT_SIMD_TEST)
return SLJIT_SUCCESS;

/* Memory operand: load it into TMP_FREG2 with VLD1 and continue
   with the register form. */
if (src2 & SLJIT_MEM) {
if (elem_size > 3)
elem_size = 3;

/* The VLD1 variant depends on the register size. */
load_ins = VLD1 | (sljit_ins)((reg_size == 3) ? (0x7 << 8) : (0xa << 8));
alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type);

SLJIT_ASSERT(reg_size >= alignment);

/* NOTE(review): presumably the alignment hint bits of VLD1 - confirm. */
if (alignment == 3)
load_ins |= 0x10;
else if (alignment >= 4)
load_ins |= 0x20;

FAIL_IF(sljit_emit_simd_mem_offset(compiler, &src2, src2w));
FAIL_IF(push_inst(compiler, load_ins | VD(TMP_FREG2) | RN(src2) | ((sljit_ins)elem_size) << 6 | 0xf));
src2 = TMP_FREG2;
}

if (reg_size == 4) {
dst_freg = simd_get_quad_reg_index(dst_freg);
src1_freg = simd_get_quad_reg_index(src1_freg);
src2_freg = simd_get_quad_reg_index(src2_freg);
src2 = simd_get_quad_reg_index(src2);

/* Shuffle on a quad register is emitted as two VTBL instructions,
   one per half of the destination. When dst_freg equals src1_freg
   the first result is written to TMP_FREG2 and copied to dst_freg
   with VORR at the end, so src1_freg is still unmodified when the
   second instruction reads it. */
if (SLJIT_SIMD_GET_OPCODE(type) == SLJIT_SIMD_OP2_SHUFFLE) {
ins |= (sljit_ins)1 << 8;

FAIL_IF(push_inst(compiler, ins | VD(dst_freg != src1_freg ? dst_freg : TMP_FREG2) | VN(src1_freg) | VM(src2)));
src2 += SLJIT_QUAD_OTHER_HALF(src2);
FAIL_IF(push_inst(compiler, ins | VD(dst_freg + SLJIT_QUAD_OTHER_HALF(dst_freg)) | VN(src1_freg) | VM(src2)));

if (dst_freg == src1_freg)
return push_inst(compiler, VORR | VD(dst_freg) | VN(TMP_FREG2) | VM(TMP_FREG2));
return SLJIT_SUCCESS;
}

/* NOTE(review): presumably the Q (quad register) bit - confirm. */
ins |= (sljit_ins)1 << 6;
}

return push_inst(compiler, ins | VD(dst_freg) | VN(src1_freg) | VM(src2_freg));
return push_inst(compiler, ins | VD(dst_freg) | VN(src1_freg) | VM(src2));
}

#undef FPU_LOAD
Expand Down
20 changes: 17 additions & 3 deletions sljit_src/sljitNativeARM_64.c
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
#define SUBI 0xd1000000
#define SUBS 0xeb000000
#define TBZ 0x36000000
#define TBL_v 0x0e000000
#define UBFM 0xd3400000
#define UCVTF 0x9e630000
#define UDIV 0x9ac00800
Expand Down Expand Up @@ -3224,14 +3225,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c
}

/* ARM-64 implementation of sljit_emit_simd_op2().
   Emits the and / or / xor / shuffle operation selected by type on
   dst_freg, src1_freg and src2. Only register sizes 3 and 4 are
   accepted; other sizes return SLJIT_ERR_UNSUPPORTED. A memory src2
   is first loaded into TMP_FREG1. */
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg)
sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2, sljit_sw src2w)
{
sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
sljit_ins ins = 0;

CHECK_ERROR();
CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg));
CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2, src2w));
ADJUST_LOCAL_OFFSET(src2, src2w);

if (reg_size != 3 && reg_size != 4)
return SLJIT_ERR_UNSUPPORTED;
Expand All @@ -3249,15 +3251,27 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *co
case SLJIT_SIMD_OP2_XOR:
ins = EOR_v;
break;
case SLJIT_SIMD_OP2_SHUFFLE:
ins = TBL_v;
break;
}

/* A capability test succeeds here without emitting anything. */
if (type & SLJIT_SIMD_TEST)
return SLJIT_SUCCESS;

/* Memory operand: load it into TMP_FREG1 with LD1 and continue
   with the register form. */
if (src2 & SLJIT_MEM) {
if (elem_size > 3)
elem_size = 3;

FAIL_IF(sljit_emit_simd_mem_offset(compiler, &src2, src2w));
/* The result of push_inst() must be checked like every other
   emission, otherwise a failed load is silently ignored. */
FAIL_IF(push_inst(compiler, LD1 | (reg_size == 4 ? (1 << 30) : 0) | ((sljit_ins)elem_size << 10) | RN(src2) | VT(TMP_FREG1)));
src2 = TMP_FREG1;
}

/* Bit 30 is set for the larger register size, the same way as in the
   LD1 above. */
if (reg_size == 4)
ins |= (sljit_ins)1 << 30;

return push_inst(compiler, ins | VD(dst_freg) | VN(src1_freg) | VM(src2_freg));
return push_inst(compiler, ins | VD(dst_freg) | VN(src1_freg) | VM(src2));
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op,
Expand Down
48 changes: 43 additions & 5 deletions sljit_src/sljitNativeARM_T2_32.c
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,7 @@ static const sljit_u8 freg_ebit_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) << 1)
#define VST1_s 0xf9800000
#define VSTR_F32 0xed000a00
#define VSUB_F32 0xee300a40
#define VTBL 0xffb00800

#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)

Expand Down Expand Up @@ -4143,14 +4144,16 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c
}

/* ARM Thumb2 implementation of sljit_emit_simd_op2().
   Emits the and / or / xor / shuffle operation selected by type on
   dst_freg, src1_freg and src2. Only register sizes 3 and 4 are
   accepted; other sizes return SLJIT_ERR_UNSUPPORTED. A memory src2
   is first loaded into TMP_FREG2. */
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg)
sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2, sljit_sw src2w)
{
sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
sljit_ins ins = 0;
sljit_s32 alignment;
sljit_ins ins = 0, load_ins;

CHECK_ERROR();
CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg));
CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2, src2w));
ADJUST_LOCAL_OFFSET(src2, src2w);

if (reg_size != 3 && reg_size != 4)
return SLJIT_ERR_UNSUPPORTED;
Expand All @@ -4168,19 +4171,54 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *co
case SLJIT_SIMD_OP2_XOR:
ins = VEOR;
break;
case SLJIT_SIMD_OP2_SHUFFLE:
ins = VTBL;
break;
}

/* A capability test succeeds here without emitting anything. */
if (type & SLJIT_SIMD_TEST)
return SLJIT_SUCCESS;

/* Memory operand: load it into TMP_FREG2 with VLD1 and continue
   with the register form. */
if (src2 & SLJIT_MEM) {
if (elem_size > 3)
elem_size = 3;

/* The VLD1 variant depends on the register size. */
load_ins = VLD1 | (sljit_ins)((reg_size == 3) ? (0x7 << 8) : (0xa << 8));
alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type);

SLJIT_ASSERT(reg_size >= alignment);

/* NOTE(review): presumably the alignment hint bits of VLD1 - confirm. */
if (alignment == 3)
load_ins |= 0x10;
else if (alignment >= 4)
load_ins |= 0x20;

FAIL_IF(sljit_emit_simd_mem_offset(compiler, &src2, src2w));
FAIL_IF(push_inst32(compiler, load_ins | VD4(TMP_FREG2) | RN4(src2) | ((sljit_ins)elem_size) << 6 | 0xf));
src2 = TMP_FREG2;
}

if (reg_size == 4) {
dst_freg = simd_get_quad_reg_index(dst_freg);
src1_freg = simd_get_quad_reg_index(src1_freg);
src2_freg = simd_get_quad_reg_index(src2_freg);
src2 = simd_get_quad_reg_index(src2);

/* Shuffle on a quad register is emitted as two VTBL instructions,
   one per half of the destination. When dst_freg equals src1_freg
   the first result is written to TMP_FREG2 and copied to dst_freg
   with VORR at the end, so src1_freg is still unmodified when the
   second instruction reads it. */
if (SLJIT_SIMD_GET_OPCODE(type) == SLJIT_SIMD_OP2_SHUFFLE) {
ins |= (sljit_ins)1 << 8;

FAIL_IF(push_inst32(compiler, ins | VD4(dst_freg != src1_freg ? dst_freg : TMP_FREG2) | VN4(src1_freg) | VM4(src2)));
src2 += SLJIT_QUAD_OTHER_HALF(src2);
FAIL_IF(push_inst32(compiler, ins | VD4(dst_freg + SLJIT_QUAD_OTHER_HALF(dst_freg)) | VN4(src1_freg) | VM4(src2)));

if (dst_freg == src1_freg)
return push_inst32(compiler, VORR | VD4(dst_freg) | VN4(TMP_FREG2) | VM4(TMP_FREG2));
return SLJIT_SUCCESS;
}

/* NOTE(review): presumably the Q (quad register) bit - confirm. */
ins |= (sljit_ins)1 << 6;
}

return push_inst32(compiler, ins | VD4(dst_freg) | VN4(src1_freg) | VM4(src2_freg));
return push_inst32(compiler, ins | VD4(dst_freg) | VN4(src1_freg) | VM4(src2));
}

#undef FPU_LOAD
Expand Down
Loading

0 comments on commit b3f4929

Please sign in to comment.