mirror of
https://github.com/wheremyfoodat/Panda3DS.git
synced 2025-04-07 14:45:41 +12:00
Implement arm64 LG2/EX2
This commit is contained in:
parent
dcd64802a3
commit
40e2774b7f
1 changed files with 350 additions and 112 deletions
|
@ -12,16 +12,19 @@ constexpr bool useSafeMUL = true;
|
||||||
|
|
||||||
// Similar to the x64 recompiler, we use an odd internal ABI, which abuses the fact that we'll very rarely be calling C++ functions
|
// Similar to the x64 recompiler, we use an odd internal ABI, which abuses the fact that we'll very rarely be calling C++ functions
|
||||||
// So to avoid pushing and popping, we'll be making use of volatile registers as much as possible
|
// So to avoid pushing and popping, we'll be making use of volatile registers as much as possible
|
||||||
static constexpr QReg scratch1 = Q0;
|
static constexpr QReg src1Vec = Q1;
|
||||||
static constexpr QReg scratch2 = Q1;
|
static constexpr QReg src2Vec = Q2;
|
||||||
static constexpr QReg src1_vec = Q2;
|
static constexpr QReg src3Vec = Q3;
|
||||||
static constexpr QReg src2_vec = Q3;
|
static constexpr QReg scratch1Vec = Q16;
|
||||||
static constexpr QReg src3_vec = Q4;
|
static constexpr QReg scratch2Vec = Q17;
|
||||||
static constexpr QReg onesVector = Q5;
|
static constexpr QReg scratch3Vec = Q18;
|
||||||
|
static constexpr QReg onesVector = Q31;
|
||||||
|
|
||||||
static constexpr XReg arg1 = X0;
|
static constexpr XReg arg1 = X0;
|
||||||
static constexpr XReg arg2 = X1;
|
static constexpr XReg arg2 = X1;
|
||||||
static constexpr XReg statePointer = X9;
|
static constexpr XReg scratch1 = X9;
|
||||||
|
static constexpr XReg scratch2 = X10;
|
||||||
|
static constexpr XReg statePointer = X15;
|
||||||
|
|
||||||
void ShaderEmitter::compile(const PICAShader& shaderUnit) {
|
void ShaderEmitter::compile(const PICAShader& shaderUnit) {
|
||||||
oaknut::CodeBlock::unprotect(); // Unprotect the memory before writing to it
|
oaknut::CodeBlock::unprotect(); // Unprotect the memory before writing to it
|
||||||
|
@ -62,8 +65,12 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) {
|
||||||
// Scan the code for call, exp2, log2, etc instructions which need some special care
|
// Scan the code for call, exp2, log2, etc instructions which need some special care
|
||||||
// After that, emit exp2 and log2 functions if the corresponding instructions are present
|
// After that, emit exp2 and log2 functions if the corresponding instructions are present
|
||||||
scanCode(shaderUnit);
|
scanCode(shaderUnit);
|
||||||
if (codeHasExp2) Helpers::panic("arm64 shader JIT: Code has exp2");
|
if (codeHasExp2) {
|
||||||
if (codeHasLog2) Helpers::panic("arm64 shader JIT: Code has log2");
|
exp2Func = emitExp2Func();
|
||||||
|
}
|
||||||
|
if (codeHasLog2) {
|
||||||
|
log2Func = emitLog2Func();
|
||||||
|
}
|
||||||
|
|
||||||
align(16);
|
align(16);
|
||||||
// Compile every instruction in the shader
|
// Compile every instruction in the shader
|
||||||
|
@ -140,13 +147,13 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
|
||||||
// case ShaderOpcodes::DPH:
|
// case ShaderOpcodes::DPH:
|
||||||
// case ShaderOpcodes::DPHI: recDPH(shaderUnit, instruction); break;
|
// case ShaderOpcodes::DPHI: recDPH(shaderUnit, instruction); break;
|
||||||
case ShaderOpcodes::END: recEND(shaderUnit, instruction); break;
|
case ShaderOpcodes::END: recEND(shaderUnit, instruction); break;
|
||||||
// case ShaderOpcodes::EX2: recEX2(shaderUnit, instruction); break;
|
case ShaderOpcodes::EX2: recEX2(shaderUnit, instruction); break;
|
||||||
case ShaderOpcodes::FLR: recFLR(shaderUnit, instruction); break;
|
case ShaderOpcodes::FLR: recFLR(shaderUnit, instruction); break;
|
||||||
case ShaderOpcodes::IFC: recIFC(shaderUnit, instruction); break;
|
case ShaderOpcodes::IFC: recIFC(shaderUnit, instruction); break;
|
||||||
case ShaderOpcodes::IFU: recIFU(shaderUnit, instruction); break;
|
case ShaderOpcodes::IFU: recIFU(shaderUnit, instruction); break;
|
||||||
case ShaderOpcodes::JMPC: recJMPC(shaderUnit, instruction); break;
|
case ShaderOpcodes::JMPC: recJMPC(shaderUnit, instruction); break;
|
||||||
case ShaderOpcodes::JMPU: recJMPU(shaderUnit, instruction); break;
|
case ShaderOpcodes::JMPU: recJMPU(shaderUnit, instruction); break;
|
||||||
// case ShaderOpcodes::LG2: recLG2(shaderUnit, instruction); break;
|
case ShaderOpcodes::LG2: recLG2(shaderUnit, instruction); break;
|
||||||
case ShaderOpcodes::LOOP: recLOOP(shaderUnit, instruction); break;
|
case ShaderOpcodes::LOOP: recLOOP(shaderUnit, instruction); break;
|
||||||
case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break;
|
case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break;
|
||||||
case ShaderOpcodes::MOVA: recMOVA(shaderUnit, instruction); break;
|
case ShaderOpcodes::MOVA: recMOVA(shaderUnit, instruction); break;
|
||||||
|
@ -221,7 +228,7 @@ void ShaderEmitter::loadRegister(QReg dest, const PICAShader& shader, u32 src, u
|
||||||
u32 compSwizzle; // Component swizzle pattern for the register
|
u32 compSwizzle; // Component swizzle pattern for the register
|
||||||
bool negate; // If true, negate all lanes of the register
|
bool negate; // If true, negate all lanes of the register
|
||||||
|
|
||||||
if constexpr (sourceIndex == 1) { // SRC1
|
if constexpr (sourceIndex == 1) { // src1Vec
|
||||||
negate = (getBit<4>(operandDescriptor)) != 0;
|
negate = (getBit<4>(operandDescriptor)) != 0;
|
||||||
compSwizzle = getBits<5, 8>(operandDescriptor);
|
compSwizzle = getBits<5, 8>(operandDescriptor);
|
||||||
} else if constexpr (sourceIndex == 2) { // SRC2
|
} else if constexpr (sourceIndex == 2) { // SRC2
|
||||||
|
@ -252,7 +259,7 @@ void ShaderEmitter::loadRegister(QReg dest, const PICAShader& shader, u32 src, u
|
||||||
|
|
||||||
// Some of these cases may still be optimizable
|
// Some of these cases may still be optimizable
|
||||||
default: {
|
default: {
|
||||||
MOV(scratch1.B16(), dest.B16()); // Make a copy of the register
|
MOV(scratch1Vec.B16(), dest.B16()); // Make a copy of the register
|
||||||
|
|
||||||
const auto newX = getBits<6, 2>(compSwizzle);
|
const auto newX = getBits<6, 2>(compSwizzle);
|
||||||
const auto newY = getBits<4, 2>(compSwizzle);
|
const auto newY = getBits<4, 2>(compSwizzle);
|
||||||
|
@ -262,19 +269,19 @@ void ShaderEmitter::loadRegister(QReg dest, const PICAShader& shader, u32 src, u
|
||||||
// If the lane swizzled into the new x component is NOT the current x component, swizzle the correct lane with a mov
|
// If the lane swizzled into the new x component is NOT the current x component, swizzle the correct lane with a mov
|
||||||
// Repeat for each component of the vector
|
// Repeat for each component of the vector
|
||||||
if (newX != 0) {
|
if (newX != 0) {
|
||||||
MOV(dest.Selem()[0], scratch1.Selem()[newX]);
|
MOV(dest.Selem()[0], scratch1Vec.Selem()[newX]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (newY != 1) {
|
if (newY != 1) {
|
||||||
MOV(dest.Selem()[1], scratch1.Selem()[newY]);
|
MOV(dest.Selem()[1], scratch1Vec.Selem()[newY]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (newZ != 2) {
|
if (newZ != 2) {
|
||||||
MOV(dest.Selem()[2], scratch1.Selem()[newZ]);
|
MOV(dest.Selem()[2], scratch1Vec.Selem()[newZ]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (newW != 3) {
|
if (newW != 3) {
|
||||||
MOV(dest.Selem()[3], scratch1.Selem()[newW]);
|
MOV(dest.Selem()[3], scratch1Vec.Selem()[newW]);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
@ -326,7 +333,7 @@ void ShaderEmitter::loadRegister(QReg dest, const PICAShader& shader, u32 src, u
|
||||||
|
|
||||||
// Some of these cases may still be optimizable
|
// Some of these cases may still be optimizable
|
||||||
default: {
|
default: {
|
||||||
MOV(scratch1.B16(), dest.B16()); // Make a copy of the register
|
MOV(scratch1Vec.B16(), dest.B16()); // Make a copy of the register
|
||||||
|
|
||||||
const auto newX = getBits<6, 2>(compSwizzle);
|
const auto newX = getBits<6, 2>(compSwizzle);
|
||||||
const auto newY = getBits<4, 2>(compSwizzle);
|
const auto newY = getBits<4, 2>(compSwizzle);
|
||||||
|
@ -336,19 +343,19 @@ void ShaderEmitter::loadRegister(QReg dest, const PICAShader& shader, u32 src, u
|
||||||
// If the lane swizzled into the new x component is NOT the current x component, swizzle the correct lane with a mov
|
// If the lane swizzled into the new x component is NOT the current x component, swizzle the correct lane with a mov
|
||||||
// Repeat for each component of the vector
|
// Repeat for each component of the vector
|
||||||
if (newX != 0) {
|
if (newX != 0) {
|
||||||
MOV(dest.Selem()[0], scratch1.Selem()[newX]);
|
MOV(dest.Selem()[0], scratch1Vec.Selem()[newX]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (newY != 1) {
|
if (newY != 1) {
|
||||||
MOV(dest.Selem()[1], scratch1.Selem()[newY]);
|
MOV(dest.Selem()[1], scratch1Vec.Selem()[newY]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (newZ != 2) {
|
if (newZ != 2) {
|
||||||
MOV(dest.Selem()[2], scratch1.Selem()[newZ]);
|
MOV(dest.Selem()[2], scratch1Vec.Selem()[newZ]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (newW != 3) {
|
if (newW != 3) {
|
||||||
MOV(dest.Selem()[3], scratch1.Selem()[newW]);
|
MOV(dest.Selem()[3], scratch1Vec.Selem()[newW]);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
@ -411,11 +418,11 @@ void ShaderEmitter::storeRegister(QReg source, const PICAShader& shader, u32 des
|
||||||
STR(source, statePointer, offset);
|
STR(source, statePointer, offset);
|
||||||
} else {
|
} else {
|
||||||
u8* blendMaskPointer = getLabelPointer<u8*>(blendMasks);
|
u8* blendMaskPointer = getLabelPointer<u8*>(blendMasks);
|
||||||
LDR(scratch1, statePointer, offset); // Load current value
|
LDR(scratch1Vec, statePointer, offset); // Load current value
|
||||||
LDR(scratch2, blendMaskPointer + writeMask * 16); // Load write mask for blending
|
LDR(scratch2Vec, blendMaskPointer + writeMask * 16); // Load write mask for blending
|
||||||
|
|
||||||
BSL(scratch2.B16(), source.B16(), scratch1.B16()); // Scratch2 = (Source & mask) | (original & ~mask)
|
BSL(scratch2Vec.B16(), source.B16(), scratch1Vec.B16()); // Scratch2 = (Source & mask) | (original & ~mask)
|
||||||
STR(scratch2, statePointer, offset); // Write it back
|
STR(scratch2Vec, statePointer, offset); // Write it back
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -425,8 +432,8 @@ void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 idx = getBits<19, 2>(instruction);
|
const u32 idx = getBits<19, 2>(instruction);
|
||||||
const u32 dest = getBits<21, 5>(instruction);
|
const u32 dest = getBits<21, 5>(instruction);
|
||||||
|
|
||||||
loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1
|
loadRegister<1>(src1Vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1Vec
|
||||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
storeRegister(src1Vec, shader, dest, operandDescriptor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShaderEmitter::recFLR(const PICAShader& shader, u32 instruction) {
|
void ShaderEmitter::recFLR(const PICAShader& shader, u32 instruction) {
|
||||||
|
@ -435,9 +442,9 @@ void ShaderEmitter::recFLR(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 idx = getBits<19, 2>(instruction);
|
const u32 idx = getBits<19, 2>(instruction);
|
||||||
const u32 dest = getBits<21, 5>(instruction);
|
const u32 dest = getBits<21, 5>(instruction);
|
||||||
|
|
||||||
loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1
|
loadRegister<1>(src1Vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1Vec
|
||||||
FRINTM(src1_vec.S4(), src1_vec.S4()); // Floor it and store into dest
|
FRINTM(src1Vec.S4(), src1Vec.S4()); // Floor it and store into dest
|
||||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
storeRegister(src1Vec, shader, dest, operandDescriptor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShaderEmitter::recMOVA(const PICAShader& shader, u32 instruction) {
|
void ShaderEmitter::recMOVA(const PICAShader& shader, u32 instruction) {
|
||||||
|
@ -455,16 +462,16 @@ void ShaderEmitter::recMOVA(const PICAShader& shader, u32 instruction) {
|
||||||
// If no register is being written to then it is a nop. Probably not common but whatever
|
// If no register is being written to then it is a nop. Probably not common but whatever
|
||||||
if (!writeX && !writeY) return;
|
if (!writeX && !writeY) return;
|
||||||
|
|
||||||
loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor);
|
loadRegister<1>(src1Vec, shader, src, idx, operandDescriptor);
|
||||||
FCVTZS(src1_vec.S4(), src1_vec.S4()); // Convert src1 from floats to s32s with truncation
|
FCVTZS(src1Vec.S4(), src1Vec.S4()); // Convert src1 from floats to s32s with truncation
|
||||||
|
|
||||||
// Write both together
|
// Write both together
|
||||||
if (writeX && writeY) {
|
if (writeX && writeY) {
|
||||||
STR(src1_vec.toD(), statePointer, addrRegisterOffset);
|
STR(src1Vec.toD(), statePointer, addrRegisterOffset);
|
||||||
} else if (writeX) {
|
} else if (writeX) {
|
||||||
STR(src1_vec.toS(), statePointer, addrRegisterOffset);
|
STR(src1Vec.toS(), statePointer, addrRegisterOffset);
|
||||||
} else if (writeY) {
|
} else if (writeY) {
|
||||||
MOV(W0, src1_vec.Selem()[1]); // W0 = Y component
|
MOV(W0, src1Vec.Selem()[1]); // W0 = Y component
|
||||||
STR(W0, statePointer, addrRegisterYOffset);
|
STR(W0, statePointer, addrRegisterYOffset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -477,26 +484,26 @@ void ShaderEmitter::recDP3(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 dest = getBits<21, 5>(instruction);
|
const u32 dest = getBits<21, 5>(instruction);
|
||||||
const u32 writeMask = getBits<0, 4>(operandDescriptor);
|
const u32 writeMask = getBits<0, 4>(operandDescriptor);
|
||||||
|
|
||||||
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
|
loadRegister<1>(src1Vec, shader, src1, idx, operandDescriptor);
|
||||||
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
|
loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor);
|
||||||
// Set W component of src1 to 0.0, so that the w factor of the following dp4 will become 0, making it equivalent to a dp3
|
// Set W component of src1 to 0.0, so that the w factor of the following dp4 will become 0, making it equivalent to a dp3
|
||||||
INS(src1_vec.Selem()[3], WZR);
|
INS(src1Vec.Selem()[3], WZR);
|
||||||
|
|
||||||
// Now do a full DP4
|
// Now do a full DP4
|
||||||
// Do a piecewise multiplication of the vectors first
|
// Do a piecewise multiplication of the vectors first
|
||||||
if constexpr (useSafeMUL) {
|
if constexpr (useSafeMUL) {
|
||||||
emitSafeMUL(src1_vec, src2_vec, scratch1);
|
emitSafeMUL(src1Vec, src2Vec, scratch1Vec);
|
||||||
} else {
|
} else {
|
||||||
FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
|
FMUL(src1Vec.S4(), src1Vec.S4(), src2Vec.S4());
|
||||||
}
|
}
|
||||||
FADDP(src1_vec.S4(), src1_vec.S4(), src1_vec.S4()); // Now add the adjacent components together
|
FADDP(src1Vec.S4(), src1Vec.S4(), src1Vec.S4()); // Now add the adjacent components together
|
||||||
FADDP(src1_vec.toS(), src1_vec.toD().S2()); // Again for the bottom 2 lanes. Now the bottom lane contains the dot product
|
FADDP(src1Vec.toS(), src1Vec.toD().S2()); // Again for the bottom 2 lanes. Now the bottom lane contains the dot product
|
||||||
|
|
||||||
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
|
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
|
||||||
DUP(src1_vec.S4(), src1_vec.Selem()[0]); // src1_vec = src1_vec.xxxx
|
DUP(src1Vec.S4(), src1Vec.Selem()[0]); // src1Vec = src1Vec.xxxx
|
||||||
}
|
}
|
||||||
|
|
||||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
storeRegister(src1Vec, shader, dest, operandDescriptor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
|
void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
|
||||||
|
@ -507,23 +514,228 @@ void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 dest = getBits<21, 5>(instruction);
|
const u32 dest = getBits<21, 5>(instruction);
|
||||||
const u32 writeMask = getBits<0, 4>(operandDescriptor);
|
const u32 writeMask = getBits<0, 4>(operandDescriptor);
|
||||||
|
|
||||||
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
|
loadRegister<1>(src1Vec, shader, src1, idx, operandDescriptor);
|
||||||
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
|
loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor);
|
||||||
|
|
||||||
// Do a piecewise multiplication of the vectors first
|
// Do a piecewise multiplication of the vectors first
|
||||||
if constexpr (useSafeMUL) {
|
if constexpr (useSafeMUL) {
|
||||||
emitSafeMUL(src1_vec, src2_vec, scratch1);
|
emitSafeMUL(src1Vec, src2Vec, scratch1Vec);
|
||||||
} else {
|
} else {
|
||||||
FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
|
FMUL(src1Vec.S4(), src1Vec.S4(), src2Vec.S4());
|
||||||
}
|
}
|
||||||
FADDP(src1_vec.S4(), src1_vec.S4(), src1_vec.S4()); // Now add the adjacent components together
|
FADDP(src1Vec.S4(), src1Vec.S4(), src1Vec.S4()); // Now add the adjacent components together
|
||||||
FADDP(src1_vec.toS(), src1_vec.toD().S2()); // Again for the bottom 2 lanes. Now the bottom lane contains the dot product
|
FADDP(src1Vec.toS(), src1Vec.toD().S2()); // Again for the bottom 2 lanes. Now the bottom lane contains the dot product
|
||||||
|
|
||||||
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
|
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
|
||||||
DUP(src1_vec.S4(), src1_vec.Selem()[0]); // src1_vec = src1_vec.xxxx
|
DUP(src1Vec.S4(), src1Vec.Selem()[0]); // src1Vec = src1Vec.xxxx
|
||||||
}
|
}
|
||||||
|
|
||||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
storeRegister(src1Vec, shader, dest, operandDescriptor);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Emits the out-of-line log2 helper that backs the LG2 shader instruction and
// returns the label of its entry point.
//
// Contract of the emitted routine, as far as this function shows:
//   input   : lane 0 of src1Vec
//   output  : src1Vec, with the result present in all four lanes
//   clobbers: scratch1, scratch2, scratch1Vec, scratch2Vec, src2Vec, NZCV flags
//   expects : lane 0 of onesVector to hold 1.0 (it is used as the 1.0 operand below)
//
// Special cases handled by the emitted code:
//   NaN input                     -> the input NaN is returned (broadcast)
//   input bits == 0 (+0.0)        -> -infinity
//   sign bit set (incl. -0.0)     -> default quiet NaN
// NOTE(review): -0.0 takes the NaN path because the sign/zero test is an integer
// compare on the raw bits; confirm whether -0.0 should produce -infinity instead.
oaknut::Label ShaderEmitter::emitLog2Func() {
	oaknut::Label funcStart;

	// We perform this approximation by first performing a range reduction into the range
	// [1.0, 2.0). A minimax polynomial which was fit for the function log2(x) / (x - 1) is then
	// evaluated. We multiply the result by (x - 1) then restore the result into the appropriate
	// range.
	//
	// Coefficients for the minimax polynomial.
	// f(x) computes approximately log2(x) / (x - 1).
	// f(x) = c4 + x * (c3 + x * (c2 + x * (c1 + x * c0))).
	oaknut::Label c0;
	l(c0);
	dw(0x3d74552f);

	// c1..c4 are stored contiguously so they can be fetched with a single 128-bit load
	oaknut::Label c14;
	l(c14);
	dw(0xbeee7397);
	dw(0x3fbd96dd);
	dw(0xc02153f6);
	dw(0x4038d96c);

	oaknut::Label negativeInfinityVec;
	l(negativeInfinityVec);
	dw(0xff800000);
	dw(0xff800000);
	dw(0xff800000);
	dw(0xff800000);

	oaknut::Label defaultQnanVec;
	l(defaultQnanVec);
	dw(0x7fc00000);
	dw(0x7fc00000);
	dw(0x7fc00000);
	dw(0x7fc00000);

	oaknut::Label exit;
	oaknut::Label inputIsZero;
	oaknut::Label inputOutOfRange;

	// Cold paths are emitted before the entry point.
	// Reached with the flags of the "CMP bits, 0" below still live: EQ means the input bits were exactly zero
	l(inputOutOfRange);
	B(Cond::EQ, inputIsZero);
	ADR(scratch1, defaultQnanVec);
	LDR(src1Vec, scratch1);
	RET();

	l(inputIsZero);
	ADR(scratch1, negativeInfinityVec);
	LDR(src1Vec, scratch1);
	RET();

	l(funcStart);

	// Here we handle edge cases: input in {NaN, 0, -Inf, Negative}
	// Ordinal(n) ? 0xFFFFFFFF : 0x0
	FCMEQ(scratch1Vec.toS(), src1Vec.toS(), src1Vec.toS());
	MOV(scratch1.toW(), scratch1Vec.Selem()[0]);

	// src1Vec == NaN: leave the NaN in place and just broadcast it
	CMP(scratch1.toW(), 0);
	B(Cond::EQ, exit);

	// Signed integer compare of the raw float bits against 0:
	// zero bits -> EQ, sign bit set -> LT. Both leave through inputOutOfRange
	MOV(scratch1.toW(), src1Vec.Selem()[0]);
	CMP(scratch1.toW(), 0);
	B(Cond::LE, inputOutOfRange);

	// Split input:
	// src1Vec = MANT[1,2)
	// scratch2Vec = Exponent
	MOV(scratch1.toW(), src1Vec.Selem()[0]);
	MOV(scratch2.toW(), scratch1.toW());
	AND(scratch2.toW(), scratch2.toW(), 0x007fffff);
	ORR(scratch2.toW(), scratch2.toW(), 0x3f800000);
	MOV(src1Vec.Selem()[0], scratch2.toW());
	// src1Vec now contains the mantissa of the input
	UBFX(scratch1.toW(), scratch1.toW(), 23, 8);
	SUB(scratch1.toW(), scratch1.toW(), 0x7F);
	// The unbiased exponent is signed (negative for inputs below 1.0), so it must be converted
	// with a signed int -> float conversion. An unsigned conversion would turn e.g. -1 into ~4.29e9
	SCVTF(scratch2Vec.toS(), scratch1.toW());
	// scratch2Vec now contains the exponent of the input

	ADR(scratch1, c0);
	LDR(scratch1.toW(), scratch1);
	MOV(scratch1Vec.Selem()[0], scratch1.toW());

	// Complete computation of polynomial
	// Load C1, C2, C3, C4 into a single scratch register
	const QReg C14 = src2Vec;
	ADR(scratch1, c14);
	LDR(C14, scratch1);
	// Horner evaluation. Each FMLA computes acc += 1.0 * C14[i], i.e. adds the next coefficient
	FMUL(scratch1Vec.toS(), scratch1Vec.toS(), src1Vec.toS());
	FMLA(scratch1Vec.toS(), onesVector.toS(), C14.Selem()[0]);
	FMUL(scratch1Vec.toS(), scratch1Vec.toS(), src1Vec.toS());
	FMLA(scratch1Vec.toS(), onesVector.toS(), C14.Selem()[1]);
	FMUL(scratch1Vec.toS(), scratch1Vec.toS(), src1Vec.toS());
	FMLA(scratch1Vec.toS(), onesVector.toS(), C14.Selem()[2]);
	FMUL(scratch1Vec.toS(), scratch1Vec.toS(), src1Vec.toS());

	// src1Vec = mantissa - 1.0. The last multiply by the mantissa is already done, so it is safe to overwrite it
	FSUB(src1Vec.toS(), src1Vec.toS(), onesVector.toS());
	FMLA(scratch1Vec.toS(), onesVector.toS(), C14.Selem()[3]);

	// log2(mantissa) = f(mantissa) * (mantissa - 1), then add the exponent back
	FMUL(scratch1Vec.toS(), scratch1Vec.toS(), src1Vec.toS());
	FADD(scratch2Vec.toS(), scratch1Vec.toS(), scratch2Vec.toS());

	// Duplicate result across vector
	MOV(src1Vec.Selem()[0], scratch2Vec.Selem()[0]);
	l(exit);
	DUP(src1Vec.S4(), src1Vec.Selem()[0]);

	RET();

	return funcStart;
}
|
||||||
|
|
||||||
|
// Emits the out-of-line exp2 helper that backs the EX2 shader instruction and
// returns the label of its entry point.
//
// Contract of the emitted routine, as far as this function shows:
//   input   : lane 0 of src1Vec
//   output  : src1Vec, with the result present in all four lanes
//   clobbers: scratch1, scratch2, scratch1Vec, scratch2Vec, scratch3Vec, NZCV flags
//   NaN input is returned unchanged (broadcast to all lanes)
oaknut::Label ShaderEmitter::emitExp2Func() {
	oaknut::Label entry, inputMax, inputMin, half, c0, exit;

	// Scalar (lane 0) views of the registers this routine works with
	const auto value = src1Vec.toS();
	const auto whole = scratch1Vec.toS();
	const auto wholeBits = scratch1.toW();
	const auto acc = scratch2Vec.toS();
	const auto coefficient = scratch3Vec.toS();

	// Approach: range-reduce the input to an integer part plus a small fractional remainder,
	// evaluate a minimax polynomial fit for exp2(x) on the remainder, then scale the result
	// by 2^(integer part) to restore it into the appropriate range.

	// Constant pool, emitted in front of the entry point.
	// inputMax and inputMin must stay adjacent: both are read off a single base address below
	l(inputMax);
	dw(0x43010000);  // 129.0f
	l(inputMin);
	dw(0xc2fdffff);  // roughly -127.0f
	l(half);
	dw(0x3f000000);  // 0.5f

	// Polynomial coefficients, highest-order term first
	static constexpr u32 polynomial[] = {0x3c5dbe69, 0x3d5509f9, 0x3e773cc5, 0x3f3168b3, 0x3f800016};
	l(c0);
	for (const u32 word : polynomial) {
		dw(word);
	}

	l(entry);

	// A NaN compares unordered with itself, which makes NE true: skip straight to the broadcast
	FCMP(value, value);
	B(Cond::NE, exit);

	// Clamp the input first, since the integer part gets shifted directly into an exponent field
	ADR(scratch1, inputMax);
	LDR(whole, scratch1, 0);
	FMIN(value, value, whole);

	LDR(whole, scratch1, 4);
	FMAX(value, value, whole);

	// whole = (input - 0.5) converted to the nearest integer (FCVTNS), kept both as an
	// integer in wholeBits and as a float in whole
	ADR(scratch1, half);
	LDR(whole, scratch1);
	FSUB(whole, value, whole);

	FCVTNS(whole, whole);
	MOV(wholeBits, scratch1Vec.Selem()[0]);
	SCVTF(whole, wholeBits);

	// Bias the integer part, subtract it from the input to leave the fractional remainder in
	// value, then move the biased integer into the exponent field to build the power of two
	ADD(wholeBits, wholeBits, 0x7F);
	FSUB(value, value, whole);
	LSL(wholeBits, wholeBits, 23);
	MOV(scratch1Vec.Selem()[0], wholeBits);
	// scratch1Vec lane 0 now holds the power-of-two scale factor

	// Horner evaluation of the polynomial on the remainder
	ADR(scratch2, c0);
	LDR(acc, scratch2, 0);
	FMUL(acc, value, acc);

	// Two middle terms: add the next coefficient, multiply by the remainder
	for (int offset = 4; offset <= 8; offset += 4) {
		LDR(coefficient, scratch2, offset);
		FADD(acc, acc, coefficient);
		FMUL(acc, acc, value);
	}

	// From here on the running value lives in src1Vec instead of the accumulator
	LDR(coefficient, scratch2, 12);
	FADD(acc, acc, coefficient);
	FMUL(value, acc, value);

	LDR(coefficient, scratch2, 16);
	FADD(value, coefficient, value);

	// Apply the power-of-two scale factor
	FMUL(value, value, whole);

	// Duplicate result across vector
	l(exit);
	DUP(src1Vec.S4(), src1Vec.Selem()[0]);

	RET();

	return entry;
}
|
||||||
|
|
||||||
void ShaderEmitter::emitSafeMUL(oaknut::QReg src1, oaknut::QReg src2, oaknut::QReg scratch0) {
|
void ShaderEmitter::emitSafeMUL(oaknut::QReg src1, oaknut::QReg src2, oaknut::QReg scratch0) {
|
||||||
|
@ -534,10 +746,10 @@ void ShaderEmitter::emitSafeMUL(oaknut::QReg src1, oaknut::QReg src2, oaknut::QR
|
||||||
// Both a FMUL and FMULX are done and the results are compared to each other
|
// Both a FMUL and FMULX are done and the results are compared to each other
|
||||||
// In the case that the results are different (a 0.0*inf happened), then
|
// In the case that the results are different (a 0.0*inf happened), then
|
||||||
// a 0.0 is written
|
// a 0.0 is written
|
||||||
FMULX(scratch1.S4(), src1.S4(), src2.S4());
|
FMULX(scratch1Vec.S4(), src1.S4(), src2.S4());
|
||||||
FMUL(src1.S4(), src1.S4(), src2.S4());
|
FMUL(src1.S4(), src1.S4(), src2.S4());
|
||||||
CMEQ(scratch1.S4(), scratch1.S4(), src1.S4());
|
CMEQ(scratch1Vec.S4(), scratch1Vec.S4(), src1.S4());
|
||||||
AND(src1.B16(), src1.B16(), scratch1.B16());
|
AND(src1.B16(), src1.B16(), scratch1Vec.B16());
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShaderEmitter::recADD(const PICAShader& shader, u32 instruction) {
|
void ShaderEmitter::recADD(const PICAShader& shader, u32 instruction) {
|
||||||
|
@ -547,10 +759,10 @@ void ShaderEmitter::recADD(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 idx = getBits<19, 2>(instruction);
|
const u32 idx = getBits<19, 2>(instruction);
|
||||||
const u32 dest = getBits<21, 5>(instruction);
|
const u32 dest = getBits<21, 5>(instruction);
|
||||||
|
|
||||||
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
|
loadRegister<1>(src1Vec, shader, src1, idx, operandDescriptor);
|
||||||
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
|
loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor);
|
||||||
FADD(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
|
FADD(src1Vec.S4(), src1Vec.S4(), src2Vec.S4());
|
||||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
storeRegister(src1Vec, shader, dest, operandDescriptor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShaderEmitter::recMAX(const PICAShader& shader, u32 instruction) {
|
void ShaderEmitter::recMAX(const PICAShader& shader, u32 instruction) {
|
||||||
|
@ -560,10 +772,10 @@ void ShaderEmitter::recMAX(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 idx = getBits<19, 2>(instruction);
|
const u32 idx = getBits<19, 2>(instruction);
|
||||||
const u32 dest = getBits<21, 5>(instruction);
|
const u32 dest = getBits<21, 5>(instruction);
|
||||||
|
|
||||||
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
|
loadRegister<1>(src1Vec, shader, src1, idx, operandDescriptor);
|
||||||
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
|
loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor);
|
||||||
FMAX(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
|
FMAX(src1Vec.S4(), src1Vec.S4(), src2Vec.S4());
|
||||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
storeRegister(src1Vec, shader, dest, operandDescriptor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShaderEmitter::recMIN(const PICAShader& shader, u32 instruction) {
|
void ShaderEmitter::recMIN(const PICAShader& shader, u32 instruction) {
|
||||||
|
@ -573,10 +785,10 @@ void ShaderEmitter::recMIN(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 idx = getBits<19, 2>(instruction);
|
const u32 idx = getBits<19, 2>(instruction);
|
||||||
const u32 dest = getBits<21, 5>(instruction);
|
const u32 dest = getBits<21, 5>(instruction);
|
||||||
|
|
||||||
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
|
loadRegister<1>(src1Vec, shader, src1, idx, operandDescriptor);
|
||||||
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
|
loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor);
|
||||||
FMIN(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
|
FMIN(src1Vec.S4(), src1Vec.S4(), src2Vec.S4());
|
||||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
storeRegister(src1Vec, shader, dest, operandDescriptor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) {
|
void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) {
|
||||||
|
@ -586,16 +798,16 @@ void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 idx = getBits<19, 2>(instruction);
|
const u32 idx = getBits<19, 2>(instruction);
|
||||||
const u32 dest = getBits<21, 5>(instruction);
|
const u32 dest = getBits<21, 5>(instruction);
|
||||||
|
|
||||||
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
|
loadRegister<1>(src1Vec, shader, src1, idx, operandDescriptor);
|
||||||
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
|
loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor);
|
||||||
|
|
||||||
if constexpr (useSafeMUL) {
|
if constexpr (useSafeMUL) {
|
||||||
emitSafeMUL(src1_vec, src2_vec, scratch1);
|
emitSafeMUL(src1Vec, src2Vec, scratch1Vec);
|
||||||
} else {
|
} else {
|
||||||
FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
|
FMUL(src1Vec.S4(), src1Vec.S4(), src2Vec.S4());
|
||||||
}
|
}
|
||||||
|
|
||||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
storeRegister(src1Vec, shader, dest, operandDescriptor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShaderEmitter::recRCP(const PICAShader& shader, u32 instruction) {
|
void ShaderEmitter::recRCP(const PICAShader& shader, u32 instruction) {
|
||||||
|
@ -605,16 +817,16 @@ void ShaderEmitter::recRCP(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 dest = getBits<21, 5>(instruction);
|
const u32 dest = getBits<21, 5>(instruction);
|
||||||
const u32 writeMask = operandDescriptor & 0xf;
|
const u32 writeMask = operandDescriptor & 0xf;
|
||||||
|
|
||||||
loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1
|
loadRegister<1>(src1Vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1Vec
|
||||||
FDIV(src1_vec.toS(), onesVector.toS(), src1_vec.toS()); // src1 = 1.0 / src1
|
FDIV(src1Vec.toS(), onesVector.toS(), src1Vec.toS()); // src1 = 1.0 / src1
|
||||||
|
|
||||||
// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
|
// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
|
||||||
// Otherwise we do
|
// Otherwise we do
|
||||||
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
|
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
|
||||||
DUP(src1_vec.S4(), src1_vec.Selem()[0]); // src1_vec = src1_vec.xxxx
|
DUP(src1Vec.S4(), src1Vec.Selem()[0]); // src1Vec = src1Vec.xxxx
|
||||||
}
|
}
|
||||||
|
|
||||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
storeRegister(src1Vec, shader, dest, operandDescriptor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShaderEmitter::recRSQ(const PICAShader& shader, u32 instruction) {
|
void ShaderEmitter::recRSQ(const PICAShader& shader, u32 instruction) {
|
||||||
|
@ -625,7 +837,7 @@ void ShaderEmitter::recRSQ(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 writeMask = operandDescriptor & 0xf;
|
const u32 writeMask = operandDescriptor & 0xf;
|
||||||
constexpr bool useAccurateRSQ = true;
|
constexpr bool useAccurateRSQ = true;
|
||||||
|
|
||||||
loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1
|
loadRegister<1>(src1Vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1Vec
|
||||||
|
|
||||||
// Compute reciprocal square root approximation
|
// Compute reciprocal square root approximation
|
||||||
// TODO: Should this use frsqte or fsqrt+div? The former is faster but less accurate
|
// TODO: Should this use frsqte or fsqrt+div? The former is faster but less accurate
|
||||||
|
@ -633,19 +845,19 @@ void ShaderEmitter::recRSQ(const PICAShader& shader, u32 instruction) {
|
||||||
// It doesn't have regular sqrt/div instructions.
|
// It doesn't have regular sqrt/div instructions.
|
||||||
// For now, we default to accurate inverse square root
|
// For now, we default to accurate inverse square root
|
||||||
if constexpr (useAccurateRSQ) {
|
if constexpr (useAccurateRSQ) {
|
||||||
FSQRT(src1_vec.toS(), src1_vec.toS()); // src1 = sqrt(src1), scalar
|
FSQRT(src1Vec.toS(), src1Vec.toS()); // src1 = sqrt(src1), scalar
|
||||||
FDIV(src1_vec.toS(), onesVector.toS(), src1_vec.toS()); // Now invert src1
|
FDIV(src1Vec.toS(), onesVector.toS(), src1Vec.toS()); // Now invert src1
|
||||||
} else {
|
} else {
|
||||||
FRSQRTE(src1_vec.toS(), src1_vec.toS()); // Much nicer
|
FRSQRTE(src1Vec.toS(), src1Vec.toS()); // Much nicer
|
||||||
}
|
}
|
||||||
|
|
||||||
// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
|
// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
|
||||||
// Otherwise we do
|
// Otherwise we do
|
||||||
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
|
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
|
||||||
DUP(src1_vec.S4(), src1_vec.Selem()[0]); // src1_vec = src1_vec.xxxx
|
DUP(src1Vec.S4(), src1Vec.Selem()[0]); // src1Vec = src1Vec.xxxx
|
||||||
}
|
}
|
||||||
|
|
||||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
storeRegister(src1Vec, shader, dest, operandDescriptor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) {
|
void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) {
|
||||||
|
@ -658,17 +870,17 @@ void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 idx = getBits<22, 2>(instruction);
|
const u32 idx = getBits<22, 2>(instruction);
|
||||||
const u32 dest = getBits<24, 5>(instruction);
|
const u32 dest = getBits<24, 5>(instruction);
|
||||||
|
|
||||||
loadRegister<1>(src1_vec, shader, src1, 0, operandDescriptor);
|
loadRegister<1>(src1Vec, shader, src1, 0, operandDescriptor);
|
||||||
loadRegister<2>(src2_vec, shader, src2, isMADI ? 0 : idx, operandDescriptor);
|
loadRegister<2>(src2Vec, shader, src2, isMADI ? 0 : idx, operandDescriptor);
|
||||||
loadRegister<3>(src3_vec, shader, src3, isMADI ? idx : 0, operandDescriptor);
|
loadRegister<3>(src3Vec, shader, src3, isMADI ? idx : 0, operandDescriptor);
|
||||||
|
|
||||||
if constexpr (useSafeMUL) {
|
if constexpr (useSafeMUL) {
|
||||||
emitSafeMUL(src1_vec, src2_vec, scratch1);
|
emitSafeMUL(src1Vec, src2Vec, scratch1Vec);
|
||||||
FADD(src3_vec.S4(), src3_vec.S4(), src1_vec.S4());
|
FADD(src3Vec.S4(), src3Vec.S4(), src1Vec.S4());
|
||||||
} else {
|
} else {
|
||||||
FMLA(src3_vec.S4(), src1_vec.S4(), src2_vec.S4());
|
FMLA(src3Vec.S4(), src1Vec.S4(), src2Vec.S4());
|
||||||
}
|
}
|
||||||
storeRegister(src3_vec, shader, dest, operandDescriptor);
|
storeRegister(src3Vec, shader, dest, operandDescriptor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShaderEmitter::recSLT(const PICAShader& shader, u32 instruction) {
|
void ShaderEmitter::recSLT(const PICAShader& shader, u32 instruction) {
|
||||||
|
@ -680,13 +892,13 @@ void ShaderEmitter::recSLT(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 idx = getBits<19, 2>(instruction);
|
const u32 idx = getBits<19, 2>(instruction);
|
||||||
const u32 dest = getBits<21, 5>(instruction);
|
const u32 dest = getBits<21, 5>(instruction);
|
||||||
|
|
||||||
loadRegister<1>(src1_vec, shader, src1, isSLTI ? 0 : idx, operandDescriptor);
|
loadRegister<1>(src1Vec, shader, src1, isSLTI ? 0 : idx, operandDescriptor);
|
||||||
loadRegister<2>(src2_vec, shader, src2, isSLTI ? idx : 0, operandDescriptor);
|
loadRegister<2>(src2Vec, shader, src2, isSLTI ? idx : 0, operandDescriptor);
|
||||||
// Set each lane of SRC1 to FFFFFFFF if src2 > src1, else to 0. NEON does not have FCMLT so we use FCMGT with inverted operands
|
// Set each lane of src1Vec to FFFFFFFF if src2 > src1, else to 0. NEON does not have FCMLT so we use FCMGT with inverted operands
|
||||||
// This is more or less a direct port of the relevant x64 JIT code
|
// This is more or less a direct port of the relevant x64 JIT code
|
||||||
FCMGT(src1_vec.S4(), src2_vec.S4(), src1_vec.S4());
|
FCMGT(src1Vec.S4(), src2Vec.S4(), src1Vec.S4());
|
||||||
AND(src1_vec.B16(), src1_vec.B16(), onesVector.B16()); // AND with vec4(1.0) to convert the FFFFFFFF lanes into 1.0
|
AND(src1Vec.B16(), src1Vec.B16(), onesVector.B16()); // AND with vec4(1.0) to convert the FFFFFFFF lanes into 1.0
|
||||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
storeRegister(src1Vec, shader, dest, operandDescriptor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShaderEmitter::recSGE(const PICAShader& shader, u32 instruction) {
|
void ShaderEmitter::recSGE(const PICAShader& shader, u32 instruction) {
|
||||||
|
@ -698,13 +910,13 @@ void ShaderEmitter::recSGE(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 idx = getBits<19, 2>(instruction);
|
const u32 idx = getBits<19, 2>(instruction);
|
||||||
const u32 dest = getBits<21, 5>(instruction);
|
const u32 dest = getBits<21, 5>(instruction);
|
||||||
|
|
||||||
loadRegister<1>(src1_vec, shader, src1, isSGEI ? 0 : idx, operandDescriptor);
|
loadRegister<1>(src1Vec, shader, src1, isSGEI ? 0 : idx, operandDescriptor);
|
||||||
loadRegister<2>(src2_vec, shader, src2, isSGEI ? idx : 0, operandDescriptor);
|
loadRegister<2>(src2Vec, shader, src2, isSGEI ? idx : 0, operandDescriptor);
|
||||||
// Set each lane of SRC1 to FFFFFFFF if src1 >= src2, else to 0.
|
// Set each lane of src1Vec to FFFFFFFF if src1 >= src2, else to 0.
|
||||||
// This is more or less a direct port of the relevant x64 JIT code
|
// This is more or less a direct port of the relevant x64 JIT code
|
||||||
FCMGE(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
|
FCMGE(src1Vec.S4(), src1Vec.S4(), src2Vec.S4());
|
||||||
AND(src1_vec.B16(), src1_vec.B16(), onesVector.B16()); // AND with vec4(1.0) to convert the FFFFFFFF lanes into 1.0
|
AND(src1Vec.B16(), src1Vec.B16(), onesVector.B16()); // AND with vec4(1.0) to convert the FFFFFFFF lanes into 1.0
|
||||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
storeRegister(src1Vec, shader, dest, operandDescriptor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShaderEmitter::recCMP(const PICAShader& shader, u32 instruction) {
|
void ShaderEmitter::recCMP(const PICAShader& shader, u32 instruction) {
|
||||||
|
@ -715,8 +927,8 @@ void ShaderEmitter::recCMP(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 cmpY = getBits<21, 3>(instruction);
|
const u32 cmpY = getBits<21, 3>(instruction);
|
||||||
const u32 cmpX = getBits<24, 3>(instruction);
|
const u32 cmpX = getBits<24, 3>(instruction);
|
||||||
|
|
||||||
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
|
loadRegister<1>(src1Vec, shader, src1, idx, operandDescriptor);
|
||||||
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
|
loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor);
|
||||||
|
|
||||||
// Map from PICA condition codes (used as index) to x86 condition codes
|
// Map from PICA condition codes (used as index) to x86 condition codes
|
||||||
// We treat invalid condition codes as "always" as suggested by 3DBrew
|
// We treat invalid condition codes as "always" as suggested by 3DBrew
|
||||||
|
@ -729,13 +941,13 @@ void ShaderEmitter::recCMP(const PICAShader& shader, u32 instruction) {
|
||||||
const size_t cmpRegXOffset = uintptr_t(&shader.cmpRegister[0]) - uintptr_t(&shader);
|
const size_t cmpRegXOffset = uintptr_t(&shader.cmpRegister[0]) - uintptr_t(&shader);
|
||||||
|
|
||||||
// NEON doesn't have SIMD comparisons to do fun stuff with like on x64
|
// NEON doesn't have SIMD comparisons to do fun stuff with like on x64
|
||||||
FCMP(src1_vec.toS(), src2_vec.toS());
|
FCMP(src1Vec.toS(), src2Vec.toS());
|
||||||
CSET(W0, conditionCodes[cmpX]);
|
CSET(W0, conditionCodes[cmpX]);
|
||||||
|
|
||||||
// Compare Y components, which annoyingly enough can't be done without moving
|
// Compare Y components, which annoyingly enough can't be done without moving
|
||||||
MOV(scratch1.toS(), src1_vec.Selem()[1]);
|
MOV(scratch1Vec.toS(), src1Vec.Selem()[1]);
|
||||||
MOV(scratch2.toS(), src2_vec.Selem()[1]);
|
MOV(scratch2Vec.toS(), src2Vec.Selem()[1]);
|
||||||
FCMP(scratch1.toS(), scratch2.toS());
|
FCMP(scratch1Vec.toS(), scratch2Vec.toS());
|
||||||
CSET(W1, conditionCodes[cmpY]);
|
CSET(W1, conditionCodes[cmpY]);
|
||||||
|
|
||||||
// Merge the booleans and write them back in one STRh
|
// Merge the booleans and write them back in one STRh
|
||||||
|
@ -915,6 +1127,19 @@ void ShaderEmitter::recJMPU(const PICAShader& shader, u32 instruction) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ShaderEmitter::recLG2(const PICAShader& shader, u32 instruction) {
|
||||||
|
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
|
||||||
|
const u32 src = getBits<12, 7>(instruction);
|
||||||
|
const u32 idx = getBits<19, 2>(instruction);
|
||||||
|
const u32 dest = getBits<21, 5>(instruction);
|
||||||
|
|
||||||
|
loadRegister<1>(src1Vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1Vec
|
||||||
|
STR(X30, SP, POST_INDEXED, -16);
|
||||||
|
BL(log2Func);
|
||||||
|
LDR(X30, SP, PRE_INDEXED, 16);
|
||||||
|
storeRegister(src1Vec, shader, dest, operandDescriptor);
|
||||||
|
}
|
||||||
|
|
||||||
void ShaderEmitter::recLOOP(const PICAShader& shader, u32 instruction) {
|
void ShaderEmitter::recLOOP(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 dest = getBits<10, 12>(instruction);
|
const u32 dest = getBits<10, 12>(instruction);
|
||||||
const u32 uniformIndex = getBits<22, 2>(instruction);
|
const u32 uniformIndex = getBits<22, 2>(instruction);
|
||||||
|
@ -979,4 +1204,17 @@ void ShaderEmitter::recEND(const PICAShader& shader, u32 instruction) {
|
||||||
RET();
|
RET();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ShaderEmitter::recEX2(const PICAShader& shader, u32 instruction) {
|
||||||
|
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
|
||||||
|
const u32 src = getBits<12, 7>(instruction);
|
||||||
|
const u32 idx = getBits<19, 2>(instruction);
|
||||||
|
const u32 dest = getBits<21, 5>(instruction);
|
||||||
|
|
||||||
|
loadRegister<1>(src1Vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1Vec
|
||||||
|
STR(X30, SP, POST_INDEXED, -16);
|
||||||
|
BL(exp2Func);
|
||||||
|
LDR(X30, SP, PRE_INDEXED, 16);
|
||||||
|
storeRegister(src1Vec, shader, dest, operandDescriptor);
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Add table
Reference in a new issue