Skip to content

Commit

Permalink
Merge pull request #7054 from Akira1Saitoh/aarch64VectorMmAllTrue
Browse files (browse the repository at this point in the history)
AArch64: Implement mmAnyTrue and mmAllTrue evaluators
  • Loading branch information
knn-k committed Jul 5, 2023
2 parents d94b972 + 3b0ddce commit a213f3f
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 2 deletions.
2 changes: 2 additions & 0 deletions compiler/aarch64/codegen/OMRCodeGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -710,6 +710,8 @@ bool OMR::ARM64::CodeGenerator::getSupportsOpCodeForAutoSIMD(TR::CPU *cpu, TR::I
case TR::vmbitswap:
case TR::vbyteswap:
case TR::vmbyteswap:
case TR::mmAllTrue:
case TR::mmAnyTrue:
// Float/ Double are not supported
return (et == TR::Int8 || et == TR::Int16 || et == TR::Int32 || et == TR::Int64);
case TR::vload:
Expand Down
64 changes: 62 additions & 2 deletions compiler/aarch64/codegen/OMRTreeEvaluator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1232,13 +1232,73 @@ OMR::ARM64::TreeEvaluator::mAllTrueEvaluator(TR::Node *node, TR::CodeGenerator *
/**
 * @brief Evaluates an mmAnyTrue node: produces 1 in a general purpose register
 *        when any bit of (firstChild AND secondChild) is set, 0 otherwise.
 *
 * Both children are evaluated into vector registers and combined with a
 * bitwise AND before the reduction.
 * NOTE(review): presumably each mask lane is either all ones or all zeros,
 * so "any bit set" is equivalent to "any lane true" -- confirm against the
 * IL definition of mask values.
 *
 * @param[in] node : the mmAnyTrue node; its two children are the mask operands
 * @param[in] cg   : the code generator
 * @return the general purpose register holding the 0/1 result
 */
TR::Register*
OMR::ARM64::TreeEvaluator::mmAnyTrueEvaluator(TR::Node *node, TR::CodeGenerator *cg)
   {
   TR::Node *firstChild = node->getFirstChild();
   TR::Node *secondChild = node->getSecondChild();

   // The instruction sequence below reduces exactly one 128-bit register.
   TR_ASSERT_FATAL_WITH_NODE(node, firstChild->getDataType().getVectorLength() == TR::VectorLength128,
                             "Only 128-bit vectors are supported %s", firstChild->getDataType().toString());

   TR::Register *maskReg = cg->evaluate(firstChild);
   TR::Register *mask2Reg = cg->evaluate(secondChild);
   TR::Register *resultReg = cg->allocateRegister(TR_GPR);
   TR::Register *tempReg = cg->allocateRegister(TR_VRF);

   /*
    * and   v2.16b, v0.16b, v1.16b
    * ; umaxp is fast if arrangement specifier is 4s.
    * umaxp v2.4s, v2.4s, v2.4s
    * ; now relevant data is in lower 64bit of v2.
    * umov  x0, v2.2d[0]
    * cmp   x0, #0
    * cset  x0, ne
    */
   // Combine the two masks; a bit survives only if it is set in both operands.
   generateTrg1Src2Instruction(cg, TR::InstOpCode::vand16b, node, tempReg, maskReg, mask2Reg);
   // Pairwise unsigned max folds the four 32-bit lanes into the low two lanes,
   // so the low 64 bits are non-zero iff any lane of the AND result is non-zero.
   generateTrg1Src2Instruction(cg, TR::InstOpCode::vumaxp4s, node, tempReg, tempReg, tempReg);
   generateMovVectorElementToGPRInstruction(cg, TR::InstOpCode::umovxd, node, resultReg, tempReg, 0);
   // 64-bit compare against zero; NE means at least one bit was set.
   generateCompareImmInstruction(cg, node, resultReg, 0, true);
   generateCSetInstruction(cg, node, resultReg, TR::CC_NE);

   cg->stopUsingRegister(tempReg);
   node->setRegister(resultReg);
   cg->decReferenceCount(firstChild);
   cg->decReferenceCount(secondChild);

   return resultReg;
   }

/**
 * @brief Evaluates an mmAllTrue node: produces 1 in a general purpose register
 *        when every bit of (firstChild AND secondChild) is set, 0 otherwise.
 *
 * Both children are evaluated into vector registers and combined with a
 * bitwise AND before the reduction.
 * NOTE(review): presumably each mask lane is either all ones or all zeros,
 * so "all bits set" is equivalent to "all lanes true" -- confirm against the
 * IL definition of mask values.
 *
 * @param[in] node : the mmAllTrue node; its two children are the mask operands
 * @param[in] cg   : the code generator
 * @return the general purpose register holding the 0/1 result
 */
TR::Register*
OMR::ARM64::TreeEvaluator::mmAllTrueEvaluator(TR::Node *node, TR::CodeGenerator *cg)
   {
   TR::Node *firstChild = node->getFirstChild();
   TR::Node *secondChild = node->getSecondChild();

   // The instruction sequence below reduces exactly one 128-bit register.
   TR_ASSERT_FATAL_WITH_NODE(node, firstChild->getDataType().getVectorLength() == TR::VectorLength128,
                             "Only 128-bit vectors are supported %s", firstChild->getDataType().toString());

   TR::Register *maskReg = cg->evaluate(firstChild);
   TR::Register *mask2Reg = cg->evaluate(secondChild);
   TR::Register *resultReg = cg->allocateRegister(TR_GPR);
   TR::Register *tempReg = cg->allocateRegister(TR_VRF);

   /*
    * and   v2.16b, v0.16b, v1.16b
    * ; uminp is fast if arrangement specifier is 4s.
    * uminp v2.4s, v2.4s, v2.4s
    * ; now relevant data is in lower 64bit of v2.
    * umov  x0, v2.2d[0]
    * cmn   x0, #1
    * cset  x0, eq
    */
   // Combine the two masks; a bit survives only if it is set in both operands.
   generateTrg1Src2Instruction(cg, TR::InstOpCode::vand16b, node, tempReg, maskReg, mask2Reg);
   // Pairwise unsigned min folds the four 32-bit lanes into the low two lanes,
   // so the low 64 bits are all ones iff every lane of the AND result is all ones.
   generateTrg1Src2Instruction(cg, TR::InstOpCode::vuminp4s, node, tempReg, tempReg, tempReg);
   generateMovVectorElementToGPRInstruction(cg, TR::InstOpCode::umovxd, node, resultReg, tempReg, 0);
   // 64-bit compare against -1 (the "cmn x0, #1" step of the sequence above);
   // EQ means all 64 bits are ones.
   generateCompareImmInstruction(cg, node, resultReg, -1, true);
   generateCSetInstruction(cg, node, resultReg, TR::CC_EQ);

   cg->stopUsingRegister(tempReg);
   node->setRegister(resultReg);
   cg->decReferenceCount(firstChild);
   cg->decReferenceCount(secondChild);

   return resultReg;
   }

TR::Register*
Expand Down

0 comments on commit a213f3f

Please sign in to comment.