diff --git a/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp index 7d8960f9..d6358070 100644 --- a/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp +++ b/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp @@ -12,16 +12,19 @@ constexpr bool useSafeMUL = true; // Similar to the x64 recompiler, we use an odd internal ABI, which abuses the fact that we'll very rarely be calling C++ functions // So to avoid pushing and popping, we'll be making use of volatile registers as much as possible -static constexpr QReg scratch1 = Q0; -static constexpr QReg scratch2 = Q1; -static constexpr QReg src1_vec = Q2; -static constexpr QReg src2_vec = Q3; -static constexpr QReg src3_vec = Q4; -static constexpr QReg onesVector = Q5; +static constexpr QReg src1Vec = Q1; +static constexpr QReg src2Vec = Q2; +static constexpr QReg src3Vec = Q3; +static constexpr QReg scratch1Vec = Q16; +static constexpr QReg scratch2Vec = Q17; +static constexpr QReg scratch3Vec = Q18; +static constexpr QReg onesVector = Q31; static constexpr XReg arg1 = X0; static constexpr XReg arg2 = X1; -static constexpr XReg statePointer = X9; +static constexpr XReg scratch1 = X9; +static constexpr XReg scratch2 = X10; +static constexpr XReg statePointer = X15; void ShaderEmitter::compile(const PICAShader& shaderUnit) { oaknut::CodeBlock::unprotect(); // Unprotect the memory before writing to it @@ -62,8 +65,12 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) { // Scan the code for call, exp2, log2, etc instructions which need some special care // After that, emit exp2 and log2 functions if the corresponding instructions are present scanCode(shaderUnit); - if (codeHasExp2) Helpers::panic("arm64 shader JIT: Code has exp2"); - if (codeHasLog2) Helpers::panic("arm64 shader JIT: Code has log2"); + if (codeHasExp2) { + exp2Func = emitExp2Func(); + } + if (codeHasLog2) { + log2Func = emitLog2Func(); + } align(16); // Compile every instruction in the 
shader @@ -140,13 +147,13 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) { // case ShaderOpcodes::DPH: // case ShaderOpcodes::DPHI: recDPH(shaderUnit, instruction); break; case ShaderOpcodes::END: recEND(shaderUnit, instruction); break; - // case ShaderOpcodes::EX2: recEX2(shaderUnit, instruction); break; + case ShaderOpcodes::EX2: recEX2(shaderUnit, instruction); break; case ShaderOpcodes::FLR: recFLR(shaderUnit, instruction); break; case ShaderOpcodes::IFC: recIFC(shaderUnit, instruction); break; case ShaderOpcodes::IFU: recIFU(shaderUnit, instruction); break; case ShaderOpcodes::JMPC: recJMPC(shaderUnit, instruction); break; case ShaderOpcodes::JMPU: recJMPU(shaderUnit, instruction); break; - // case ShaderOpcodes::LG2: recLG2(shaderUnit, instruction); break; + case ShaderOpcodes::LG2: recLG2(shaderUnit, instruction); break; case ShaderOpcodes::LOOP: recLOOP(shaderUnit, instruction); break; case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break; case ShaderOpcodes::MOVA: recMOVA(shaderUnit, instruction); break; @@ -221,7 +228,7 @@ void ShaderEmitter::loadRegister(QReg dest, const PICAShader& shader, u32 src, u u32 compSwizzle; // Component swizzle pattern for the register bool negate; // If true, negate all lanes of the register - if constexpr (sourceIndex == 1) { // SRC1 + if constexpr (sourceIndex == 1) { // src1Vec negate = (getBit<4>(operandDescriptor)) != 0; compSwizzle = getBits<5, 8>(operandDescriptor); } else if constexpr (sourceIndex == 2) { // SRC2 @@ -252,7 +259,7 @@ void ShaderEmitter::loadRegister(QReg dest, const PICAShader& shader, u32 src, u // Some of these cases may still be optimizable default: { - MOV(scratch1.B16(), dest.B16()); // Make a copy of the register + MOV(scratch1Vec.B16(), dest.B16()); // Make a copy of the register const auto newX = getBits<6, 2>(compSwizzle); const auto newY = getBits<4, 2>(compSwizzle); @@ -262,19 +269,19 @@ void ShaderEmitter::loadRegister(QReg dest, const PICAShader& shader, 
u32 src, u // If the lane swizzled into the new x component is NOT the current x component, swizzle the correct lane with a mov // Repeat for each component of the vector if (newX != 0) { - MOV(dest.Selem()[0], scratch1.Selem()[newX]); + MOV(dest.Selem()[0], scratch1Vec.Selem()[newX]); } if (newY != 1) { - MOV(dest.Selem()[1], scratch1.Selem()[newY]); + MOV(dest.Selem()[1], scratch1Vec.Selem()[newY]); } if (newZ != 2) { - MOV(dest.Selem()[2], scratch1.Selem()[newZ]); + MOV(dest.Selem()[2], scratch1Vec.Selem()[newZ]); } if (newW != 3) { - MOV(dest.Selem()[3], scratch1.Selem()[newW]); + MOV(dest.Selem()[3], scratch1Vec.Selem()[newW]); } break; @@ -326,7 +333,7 @@ void ShaderEmitter::loadRegister(QReg dest, const PICAShader& shader, u32 src, u // Some of these cases may still be optimizable default: { - MOV(scratch1.B16(), dest.B16()); // Make a copy of the register + MOV(scratch1Vec.B16(), dest.B16()); // Make a copy of the register const auto newX = getBits<6, 2>(compSwizzle); const auto newY = getBits<4, 2>(compSwizzle); @@ -336,19 +343,19 @@ void ShaderEmitter::loadRegister(QReg dest, const PICAShader& shader, u32 src, u // If the lane swizzled into the new x component is NOT the current x component, swizzle the correct lane with a mov // Repeat for each component of the vector if (newX != 0) { - MOV(dest.Selem()[0], scratch1.Selem()[newX]); + MOV(dest.Selem()[0], scratch1Vec.Selem()[newX]); } if (newY != 1) { - MOV(dest.Selem()[1], scratch1.Selem()[newY]); + MOV(dest.Selem()[1], scratch1Vec.Selem()[newY]); } if (newZ != 2) { - MOV(dest.Selem()[2], scratch1.Selem()[newZ]); + MOV(dest.Selem()[2], scratch1Vec.Selem()[newZ]); } if (newW != 3) { - MOV(dest.Selem()[3], scratch1.Selem()[newW]); + MOV(dest.Selem()[3], scratch1Vec.Selem()[newW]); } break; @@ -411,11 +418,11 @@ void ShaderEmitter::storeRegister(QReg source, const PICAShader& shader, u32 des STR(source, statePointer, offset); } else { u8* blendMaskPointer = getLabelPointer(blendMasks); - LDR(scratch1, 
statePointer, offset); // Load current value - LDR(scratch2, blendMaskPointer + writeMask * 16); // Load write mask for blending + LDR(scratch1Vec, statePointer, offset); // Load current value + LDR(scratch2Vec, blendMaskPointer + writeMask * 16); // Load write mask for blending - BSL(scratch2.B16(), source.B16(), scratch1.B16()); // Scratch2 = (Source & mask) | (original & ~mask) - STR(scratch2, statePointer, offset); // Write it back + BSL(scratch2Vec.B16(), source.B16(), scratch1Vec.B16()); // Scratch2 = (Source & mask) | (original & ~mask) + STR(scratch2Vec, statePointer, offset); // Write it back } } @@ -425,8 +432,8 @@ void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) { const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); - loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1 - storeRegister(src1_vec, shader, dest, operandDescriptor); + loadRegister<1>(src1Vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1Vec + storeRegister(src1Vec, shader, dest, operandDescriptor); } void ShaderEmitter::recFLR(const PICAShader& shader, u32 instruction) { @@ -435,9 +442,9 @@ void ShaderEmitter::recFLR(const PICAShader& shader, u32 instruction) { const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); - loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1 - FRINTM(src1_vec.S4(), src1_vec.S4()); // Floor it and store into dest - storeRegister(src1_vec, shader, dest, operandDescriptor); + loadRegister<1>(src1Vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1Vec + FRINTM(src1Vec.S4(), src1Vec.S4()); // Floor it and store into dest + storeRegister(src1Vec, shader, dest, operandDescriptor); } void ShaderEmitter::recMOVA(const PICAShader& shader, u32 instruction) { @@ -455,16 +462,16 @@ void ShaderEmitter::recMOVA(const PICAShader& shader, u32 instruction) { // 
If no register is being written to then it is a nop. Probably not common but whatever if (!writeX && !writeY) return; - loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor); - FCVTZS(src1_vec.S4(), src1_vec.S4()); // Convert src1 from floats to s32s with truncation + loadRegister<1>(src1Vec, shader, src, idx, operandDescriptor); + FCVTZS(src1Vec.S4(), src1Vec.S4()); // Convert src1 from floats to s32s with truncation // Write both together if (writeX && writeY) { - STR(src1_vec.toD(), statePointer, addrRegisterOffset); + STR(src1Vec.toD(), statePointer, addrRegisterOffset); } else if (writeX) { - STR(src1_vec.toS(), statePointer, addrRegisterOffset); + STR(src1Vec.toS(), statePointer, addrRegisterOffset); } else if (writeY) { - MOV(W0, src1_vec.Selem()[1]); // W0 = Y component + MOV(W0, src1Vec.Selem()[1]); // W0 = Y component STR(W0, statePointer, addrRegisterYOffset); } } @@ -477,26 +484,26 @@ void ShaderEmitter::recDP3(const PICAShader& shader, u32 instruction) { const u32 dest = getBits<21, 5>(instruction); const u32 writeMask = getBits<0, 4>(operandDescriptor); - loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor); - loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor); + loadRegister<1>(src1Vec, shader, src1, idx, operandDescriptor); + loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor); // Set W component of src1 to 0.0, so that the w factor of the following dp4 will become 0, making it equivalent to a dp3 - INS(src1_vec.Selem()[3], WZR); + INS(src1Vec.Selem()[3], WZR); // Now do a full DP4 // Do a piecewise multiplication of the vectors first if constexpr (useSafeMUL) { - emitSafeMUL(src1_vec, src2_vec, scratch1); + emitSafeMUL(src1Vec, src2Vec, scratch1Vec); } else { - FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4()); + FMUL(src1Vec.S4(), src1Vec.S4(), src2Vec.S4()); } - FADDP(src1_vec.S4(), src1_vec.S4(), src1_vec.S4()); // Now add the adjacent components together - FADDP(src1_vec.toS(), src1_vec.toD().S2()); // 
Again for the bottom 2 lanes. Now the bottom lane contains the dot product + FADDP(src1Vec.S4(), src1Vec.S4(), src1Vec.S4()); // Now add the adjacent components together + FADDP(src1Vec.toS(), src1Vec.toD().S2()); // Again for the bottom 2 lanes. Now the bottom lane contains the dot product - if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x - DUP(src1_vec.S4(), src1_vec.Selem()[0]); // src1_vec = src1_vec.xxxx + if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x + DUP(src1Vec.S4(), src1Vec.Selem()[0]); // src1Vec = src1Vec.xxxx } - storeRegister(src1_vec, shader, dest, operandDescriptor); + storeRegister(src1Vec, shader, dest, operandDescriptor); } void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) { @@ -507,23 +514,228 @@ void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) { const u32 dest = getBits<21, 5>(instruction); const u32 writeMask = getBits<0, 4>(operandDescriptor); - loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor); - loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor); + loadRegister<1>(src1Vec, shader, src1, idx, operandDescriptor); + loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor); // Do a piecewise multiplication of the vectors first if constexpr (useSafeMUL) { - emitSafeMUL(src1_vec, src2_vec, scratch1); + emitSafeMUL(src1Vec, src2Vec, scratch1Vec); } else { - FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4()); + FMUL(src1Vec.S4(), src1Vec.S4(), src2Vec.S4()); } - FADDP(src1_vec.S4(), src1_vec.S4(), src1_vec.S4()); // Now add the adjacent components together - FADDP(src1_vec.toS(), src1_vec.toD().S2()); // Again for the bottom 2 lanes. Now the bottom lane contains the dot product + FADDP(src1Vec.S4(), src1Vec.S4(), src1Vec.S4()); // Now add the adjacent components together + FADDP(src1Vec.toS(), src1Vec.toD().S2()); // Again for the bottom 2 lanes. 
Now the bottom lane contains the dot product - if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x - DUP(src1_vec.S4(), src1_vec.Selem()[0]); // src1_vec = src1_vec.xxxx + if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x + DUP(src1Vec.S4(), src1Vec.Selem()[0]); // src1Vec = src1Vec.xxxx } - storeRegister(src1_vec, shader, dest, operandDescriptor); + storeRegister(src1Vec, shader, dest, operandDescriptor); +} + +oaknut::Label ShaderEmitter::emitLog2Func() { + oaknut::Label funcStart; + + // We perform this approximation by first performing a range reduction into the range + // [1.0, 2.0). A minimax polynomial which was fit for the function log2(x) / (x - 1) is then + // evaluated. We multiply the result by (x - 1) then restore the result into the appropriate + // range. Coefficients for the minimax polynomial. + // f(x) computes approximately log2(x) / (x - 1). + // f(x) = c4 + x * (c3 + x * (c2 + x * (c1 + x * c0)). + oaknut::Label c0; + l(c0); + dw(0x3d74552f); + + oaknut::Label c14; + l(c14); + dw(0xbeee7397); + dw(0x3fbd96dd); + dw(0xc02153f6); + dw(0x4038d96c); + + oaknut::Label negativeInfinityVec; + l(negativeInfinityVec); + dw(0xff800000); + dw(0xff800000); + dw(0xff800000); + dw(0xff800000); + + oaknut::Label defaultQnanVec; + l(defaultQnanVec); + dw(0x7fc00000); + dw(0x7fc00000); + dw(0x7fc00000); + dw(0x7fc00000); + + oaknut::Label exit; + oaknut::Label inputIsZero; + oaknut::Label inputOutOfRange; + + l(inputOutOfRange); + B(Cond::EQ, inputIsZero); + ADR(scratch1, defaultQnanVec); + LDR(src1Vec, scratch1); + RET(); + + l(inputIsZero); + ADR(scratch1, negativeInfinityVec); + LDR(src1Vec, scratch1); + RET(); + + l(funcStart); + + // Here we handle edge cases: input in {NaN, 0, -Inf, Negative} + // Ordinal(n) ? 
0xFFFFFFFF : 0x0 + FCMEQ(scratch1Vec.toS(), src1Vec.toS(), src1Vec.toS()); + MOV(scratch1.toW(), scratch1Vec.Selem()[0]); + + // src1Vec == NaN + CMP(scratch1.toW(), 0); + B(Cond::EQ, exit); + + // (0.0 >= n) ? 0xFFFFFFFF : 0x0 + MOV(scratch1.toW(), src1Vec.Selem()[0]); + + // src1Vec <= 0.0 + CMP(scratch1.toW(), 0); + B(Cond::LE, inputOutOfRange); + + // Split input: + // src1Vec = MANT[1,2) + // scratch2Vec = Exponent + MOV(scratch1.toW(), src1Vec.Selem()[0]); + MOV(scratch2.toW(), scratch1.toW()); + AND(scratch2.toW(), scratch2.toW(), 0x007fffff); + ORR(scratch2.toW(), scratch2.toW(), 0x3f800000); + MOV(src1Vec.Selem()[0], scratch2.toW()); + // src1Vec now contains the mantissa of the input + UBFX(scratch1.toW(), scratch1.toW(), 23, 8); + SUB(scratch1.toW(), scratch1.toW(), 0x7F); + MOV(scratch2Vec.Selem()[0], scratch1.toW()); + UCVTF(scratch2Vec.toS(), scratch2Vec.toS()); + // scratch2Vec now contains the exponent of the input + + ADR(scratch1, c0); + LDR(scratch1.toW(), scratch1); + MOV(scratch1Vec.Selem()[0], scratch1.toW()); + + // Complete computation of polynomial + // Load C1, C2, C3, C4 into a single scratch register + const QReg C14 = src2Vec; + ADR(scratch1, c14); + LDR(C14, scratch1); + FMUL(scratch1Vec.toS(), scratch1Vec.toS(), src1Vec.toS()); + FMLA(scratch1Vec.toS(), onesVector.toS(), C14.Selem()[0]); + FMUL(scratch1Vec.toS(), scratch1Vec.toS(), src1Vec.toS()); + FMLA(scratch1Vec.toS(), onesVector.toS(), C14.Selem()[1]); + FMUL(scratch1Vec.toS(), scratch1Vec.toS(), src1Vec.toS()); + FMLA(scratch1Vec.toS(), onesVector.toS(), C14.Selem()[2]); + FMUL(scratch1Vec.toS(), scratch1Vec.toS(), src1Vec.toS()); + + FSUB(src1Vec.toS(), src1Vec.toS(), onesVector.toS()); + FMLA(scratch1Vec.toS(), onesVector.toS(), C14.Selem()[3]); + + FMUL(scratch1Vec.toS(), scratch1Vec.toS(), src1Vec.toS()); + FADD(scratch2Vec.toS(), scratch1Vec.toS(), scratch2Vec.toS()); + + // Duplicate result across vector + MOV(src1Vec.Selem()[0], scratch2Vec.Selem()[0]); + l(exit); + 
DUP(src1Vec.S4(), src1Vec.Selem()[0]); + + RET(); + + return funcStart; +} + +oaknut::Label ShaderEmitter::emitExp2Func() { + oaknut::Label funcStart; + + // This performs a range reduction into the range [-0.5, 0.5) + // A minmax polynomial which was fit for the function exp2(x) is then evaluated + // Then restore the result into the appropriate range + + oaknut::Label inputMax; + l(inputMax); + dw(0x43010000); + oaknut::Label inputMin; + l(inputMin); + dw(0xc2fdffff); + oaknut::Label half; + l(half); + dw(0x3f000000); + oaknut::Label c0; + l(c0); + dw(0x3c5dbe69); + dw(0x3d5509f9); + dw(0x3e773cc5); + dw(0x3f3168b3); + dw(0x3f800016); + + oaknut::Label exit; + + l(funcStart); + + FCMP(src1Vec.toS(), src1Vec.toS()); + // Branch if NaN + B(Cond::NE, exit); + + // Decompose input: + // scratch1Vec = 2^round(input) + // src1Vec = input-round(input) [-0.5, 0.5) + // Clamp to maximum range since we shift the value directly into the exponent + ADR(scratch1, inputMax); + LDR(scratch1Vec.toS(), scratch1, 0); + FMIN(src1Vec.toS(), src1Vec.toS(), scratch1Vec.toS()); + + LDR(scratch1Vec.toS(), scratch1, 4); + FMAX(src1Vec.toS(), src1Vec.toS(), scratch1Vec.toS()); + + ADR(scratch1, half); + LDR(scratch1Vec.toS(), scratch1); + FSUB(scratch1Vec.toS(), src1Vec.toS(), scratch1Vec.toS()); + + FCVTNS(scratch1Vec.toS(), scratch1Vec.toS()); + MOV(scratch1.toW(), scratch1Vec.Selem()[0]); + SCVTF(scratch1Vec.toS(), scratch1.toW()); + + // scratch1Vec now contains input rounded to the nearest integer + ADD(scratch1.toW(), scratch1.toW(), 0x7F); + FSUB(src1Vec.toS(), src1Vec.toS(), scratch1Vec.toS()); + // src1Vec contains input - round(input), which is in [-0.5, 0.5) + LSL(scratch1.toW(), scratch1.toW(), 23); + MOV(scratch1Vec.Selem()[0], scratch1.toW()); + // scratch1Vec contains 2^(round(input)) + + // Complete computation of polynomial + ADR(scratch2, c0); + LDR(scratch2Vec.toS(), scratch2, 0); + FMUL(scratch2Vec.toS(), src1Vec.toS(), scratch2Vec.toS()); + + LDR(scratch3Vec.toS(), 
scratch2, 4); + FADD(scratch2Vec.toS(), scratch2Vec.toS(), scratch3Vec.toS()); + FMUL(scratch2Vec.toS(), scratch2Vec.toS(), src1Vec.toS()); + + LDR(scratch3Vec.toS(), scratch2, 8); + FADD(scratch2Vec.toS(), scratch2Vec.toS(), scratch3Vec.toS()); + FMUL(scratch2Vec.toS(), scratch2Vec.toS(), src1Vec.toS()); + + LDR(scratch3Vec.toS(), scratch2, 12); + FADD(scratch2Vec.toS(), scratch2Vec.toS(), scratch3Vec.toS()); + FMUL(src1Vec.toS(), scratch2Vec.toS(), src1Vec.toS()); + + LDR(scratch3Vec.toS(), scratch2, 16); + FADD(src1Vec.toS(), scratch3Vec.toS(), src1Vec.toS()); + + FMUL(src1Vec.toS(), src1Vec.toS(), scratch1Vec.toS()); + + // Duplicate result across vector + l(exit); + DUP(src1Vec.S4(), src1Vec.Selem()[0]); + + RET(); + + return funcStart; } void ShaderEmitter::emitSafeMUL(oaknut::QReg src1, oaknut::QReg src2, oaknut::QReg scratch0) { @@ -534,10 +746,10 @@ void ShaderEmitter::emitSafeMUL(oaknut::QReg src1, oaknut::QReg src2, oaknut::QR // Both a FMUL and FMULX are done and the results are compared to each other // In the case that the results are diferent(a 0.0*inf happened), then // a 0.0 is written - FMULX(scratch1.S4(), src1.S4(), src2.S4()); + FMULX(scratch1Vec.S4(), src1.S4(), src2.S4()); FMUL(src1.S4(), src1.S4(), src2.S4()); - CMEQ(scratch1.S4(), scratch1.S4(), src1.S4()); - AND(src1.B16(), src1.B16(), scratch1.B16()); + CMEQ(scratch1Vec.S4(), scratch1Vec.S4(), src1.S4()); + AND(src1.B16(), src1.B16(), scratch1Vec.B16()); } void ShaderEmitter::recADD(const PICAShader& shader, u32 instruction) { @@ -547,10 +759,10 @@ void ShaderEmitter::recADD(const PICAShader& shader, u32 instruction) { const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); - loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor); - loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor); - FADD(src1_vec.S4(), src1_vec.S4(), src2_vec.S4()); - storeRegister(src1_vec, shader, dest, operandDescriptor); + loadRegister<1>(src1Vec, shader, src1, 
idx, operandDescriptor); + loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor); + FADD(src1Vec.S4(), src1Vec.S4(), src2Vec.S4()); + storeRegister(src1Vec, shader, dest, operandDescriptor); } void ShaderEmitter::recMAX(const PICAShader& shader, u32 instruction) { @@ -560,10 +772,10 @@ void ShaderEmitter::recMAX(const PICAShader& shader, u32 instruction) { const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); - loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor); - loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor); - FMAX(src1_vec.S4(), src1_vec.S4(), src2_vec.S4()); - storeRegister(src1_vec, shader, dest, operandDescriptor); + loadRegister<1>(src1Vec, shader, src1, idx, operandDescriptor); + loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor); + FMAX(src1Vec.S4(), src1Vec.S4(), src2Vec.S4()); + storeRegister(src1Vec, shader, dest, operandDescriptor); } void ShaderEmitter::recMIN(const PICAShader& shader, u32 instruction) { @@ -573,10 +785,10 @@ void ShaderEmitter::recMIN(const PICAShader& shader, u32 instruction) { const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); - loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor); - loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor); - FMIN(src1_vec.S4(), src1_vec.S4(), src2_vec.S4()); - storeRegister(src1_vec, shader, dest, operandDescriptor); + loadRegister<1>(src1Vec, shader, src1, idx, operandDescriptor); + loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor); + FMIN(src1Vec.S4(), src1Vec.S4(), src2Vec.S4()); + storeRegister(src1Vec, shader, dest, operandDescriptor); } void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) { @@ -586,16 +798,16 @@ void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) { const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); - loadRegister<1>(src1_vec, shader, src1, idx, 
operandDescriptor); - loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor); + loadRegister<1>(src1Vec, shader, src1, idx, operandDescriptor); + loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor); if constexpr (useSafeMUL) { - emitSafeMUL(src1_vec, src2_vec, scratch1); + emitSafeMUL(src1Vec, src2Vec, scratch1Vec); } else { - FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4()); + FMUL(src1Vec.S4(), src1Vec.S4(), src2Vec.S4()); } - storeRegister(src1_vec, shader, dest, operandDescriptor); + storeRegister(src1Vec, shader, dest, operandDescriptor); } void ShaderEmitter::recRCP(const PICAShader& shader, u32 instruction) { @@ -605,16 +817,16 @@ void ShaderEmitter::recRCP(const PICAShader& shader, u32 instruction) { const u32 dest = getBits<21, 5>(instruction); const u32 writeMask = operandDescriptor & 0xf; - loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1 - FDIV(src1_vec.toS(), onesVector.toS(), src1_vec.toS()); // src1 = 1.0 / src1 + loadRegister<1>(src1Vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1Vec + FDIV(src1Vec.toS(), onesVector.toS(), src1Vec.toS()); // src1 = 1.0 / src1 // If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx // Otherwise we do - if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x - DUP(src1_vec.S4(), src1_vec.Selem()[0]); // src1_vec = src1_vec.xxxx + if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x + DUP(src1Vec.S4(), src1Vec.Selem()[0]); // src1Vec = src1Vec.xxxx } - storeRegister(src1_vec, shader, dest, operandDescriptor); + storeRegister(src1Vec, shader, dest, operandDescriptor); } void ShaderEmitter::recRSQ(const PICAShader& shader, u32 instruction) { @@ -625,7 +837,7 @@ void ShaderEmitter::recRSQ(const PICAShader& shader, u32 instruction) { const u32 writeMask = operandDescriptor & 0xf; constexpr bool useAccurateRSQ = 
true; - loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1 + loadRegister<1>(src1Vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1Vec // Compute reciprocal square root approximation // TODO: Should this use frsqte or fsqrt+div? The former is faster but less accurate @@ -633,19 +845,19 @@ void ShaderEmitter::recRSQ(const PICAShader& shader, u32 instruction) { // It doesn't have regular sqrt/div instructions. // For now, we default to accurate inverse square root if constexpr (useAccurateRSQ) { - FSQRT(src1_vec.toS(), src1_vec.toS()); // src1 = sqrt(src1), scalar - FDIV(src1_vec.toS(), onesVector.toS(), src1_vec.toS()); // Now invert src1 + FSQRT(src1Vec.toS(), src1Vec.toS()); // src1 = sqrt(src1), scalar + FDIV(src1Vec.toS(), onesVector.toS(), src1Vec.toS()); // Now invert src1 } else { - FRSQRTE(src1_vec.toS(), src1_vec.toS()); // Much nicer + FRSQRTE(src1Vec.toS(), src1Vec.toS()); // Much nicer } // If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx // Otherwise we do - if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x - DUP(src1_vec.S4(), src1_vec.Selem()[0]); // src1_vec = src1_vec.xxxx + if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x + DUP(src1Vec.S4(), src1Vec.Selem()[0]); // src1Vec = src1Vec.xxxx } - storeRegister(src1_vec, shader, dest, operandDescriptor); + storeRegister(src1Vec, shader, dest, operandDescriptor); } void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) { @@ -658,17 +870,17 @@ void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) { const u32 idx = getBits<22, 2>(instruction); const u32 dest = getBits<24, 5>(instruction); - loadRegister<1>(src1_vec, shader, src1, 0, operandDescriptor); - loadRegister<2>(src2_vec, shader, src2, isMADI ? 
0 : idx, operandDescriptor); - loadRegister<3>(src3_vec, shader, src3, isMADI ? idx : 0, operandDescriptor); + loadRegister<1>(src1Vec, shader, src1, 0, operandDescriptor); + loadRegister<2>(src2Vec, shader, src2, isMADI ? 0 : idx, operandDescriptor); + loadRegister<3>(src3Vec, shader, src3, isMADI ? idx : 0, operandDescriptor); if constexpr (useSafeMUL) { - emitSafeMUL(src1_vec, src2_vec, scratch1); - FADD(src3_vec.S4(), src3_vec.S4(), src1_vec.S4()); + emitSafeMUL(src1Vec, src2Vec, scratch1Vec); + FADD(src3Vec.S4(), src3Vec.S4(), src1Vec.S4()); } else { - FMLA(src3_vec.S4(), src1_vec.S4(), src2_vec.S4()); + FMLA(src3Vec.S4(), src1Vec.S4(), src2Vec.S4()); } - storeRegister(src3_vec, shader, dest, operandDescriptor); + storeRegister(src3Vec, shader, dest, operandDescriptor); } void ShaderEmitter::recSLT(const PICAShader& shader, u32 instruction) { @@ -680,13 +892,13 @@ void ShaderEmitter::recSLT(const PICAShader& shader, u32 instruction) { const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); - loadRegister<1>(src1_vec, shader, src1, isSLTI ? 0 : idx, operandDescriptor); - loadRegister<2>(src2_vec, shader, src2, isSLTI ? idx : 0, operandDescriptor); - // Set each lane of SRC1 to FFFFFFFF if src2 > src1, else to 0. NEON does not have FCMLT so we use FCMGT with inverted operands + loadRegister<1>(src1Vec, shader, src1, isSLTI ? 0 : idx, operandDescriptor); + loadRegister<2>(src2Vec, shader, src2, isSLTI ? idx : 0, operandDescriptor); + // Set each lane of src1Vec to FFFFFFFF if src2 > src1, else to 0. 
NEON does not have FCMLT so we use FCMGT with inverted operands // This is more or less a direct port of the relevant x64 JIT code - FCMGT(src1_vec.S4(), src2_vec.S4(), src1_vec.S4()); - AND(src1_vec.B16(), src1_vec.B16(), onesVector.B16()); // AND with vec4(1.0) to convert the FFFFFFFF lanes into 1.0 - storeRegister(src1_vec, shader, dest, operandDescriptor); + FCMGT(src1Vec.S4(), src2Vec.S4(), src1Vec.S4()); + AND(src1Vec.B16(), src1Vec.B16(), onesVector.B16()); // AND with vec4(1.0) to convert the FFFFFFFF lanes into 1.0 + storeRegister(src1Vec, shader, dest, operandDescriptor); } void ShaderEmitter::recSGE(const PICAShader& shader, u32 instruction) { @@ -698,13 +910,13 @@ void ShaderEmitter::recSGE(const PICAShader& shader, u32 instruction) { const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); - loadRegister<1>(src1_vec, shader, src1, isSGEI ? 0 : idx, operandDescriptor); - loadRegister<2>(src2_vec, shader, src2, isSGEI ? idx : 0, operandDescriptor); - // Set each lane of SRC1 to FFFFFFFF if src1 >= src2, else to 0. + loadRegister<1>(src1Vec, shader, src1, isSGEI ? 0 : idx, operandDescriptor); + loadRegister<2>(src2Vec, shader, src2, isSGEI ? idx : 0, operandDescriptor); + // Set each lane of src1Vec to FFFFFFFF if src1 >= src2, else to 0. 
// This is more or less a direct port of the relevant x64 JIT code - FCMGE(src1_vec.S4(), src1_vec.S4(), src2_vec.S4()); - AND(src1_vec.B16(), src1_vec.B16(), onesVector.B16()); // AND with vec4(1.0) to convert the FFFFFFFF lanes into 1.0 - storeRegister(src1_vec, shader, dest, operandDescriptor); + FCMGE(src1Vec.S4(), src1Vec.S4(), src2Vec.S4()); + AND(src1Vec.B16(), src1Vec.B16(), onesVector.B16()); // AND with vec4(1.0) to convert the FFFFFFFF lanes into 1.0 + storeRegister(src1Vec, shader, dest, operandDescriptor); } void ShaderEmitter::recCMP(const PICAShader& shader, u32 instruction) { @@ -715,8 +927,8 @@ void ShaderEmitter::recCMP(const PICAShader& shader, u32 instruction) { const u32 cmpY = getBits<21, 3>(instruction); const u32 cmpX = getBits<24, 3>(instruction); - loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor); - loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor); + loadRegister<1>(src1Vec, shader, src1, idx, operandDescriptor); + loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor); // Map from PICA condition codes (used as index) to x86 condition codes // We treat invalid condition codes as "always" as suggested by 3DBrew @@ -729,13 +941,13 @@ void ShaderEmitter::recCMP(const PICAShader& shader, u32 instruction) { const size_t cmpRegXOffset = uintptr_t(&shader.cmpRegister[0]) - uintptr_t(&shader); // NEON doesn't have SIMD comparisons to do fun stuff with like on x64 - FCMP(src1_vec.toS(), src2_vec.toS()); + FCMP(src1Vec.toS(), src2Vec.toS()); CSET(W0, conditionCodes[cmpX]); // Compare Y components, which annoyingly enough can't be done without moving - MOV(scratch1.toS(), src1_vec.Selem()[1]); - MOV(scratch2.toS(), src2_vec.Selem()[1]); - FCMP(scratch1.toS(), scratch2.toS()); + MOV(scratch1Vec.toS(), src1Vec.Selem()[1]); + MOV(scratch2Vec.toS(), src2Vec.Selem()[1]); + FCMP(scratch1Vec.toS(), scratch2Vec.toS()); CSET(W1, conditionCodes[cmpY]); // Merge the booleans and write them back in one STRh @@ -915,6 +1127,19 @@ 
void ShaderEmitter::recJMPU(const PICAShader& shader, u32 instruction) { } } +void ShaderEmitter::recLG2(const PICAShader& shader, u32 instruction) { + const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; + const u32 src = getBits<12, 7>(instruction); + const u32 idx = getBits<19, 2>(instruction); + const u32 dest = getBits<21, 5>(instruction); + + loadRegister<1>(src1Vec, shader, src, idx, operandDescriptor); // Load source 1 into src1Vec + STR(X30, SP, POST_INDEXED, -16); + BL(log2Func); + LDR(X30, SP, PRE_INDEXED, 16); + storeRegister(src1Vec, shader, dest, operandDescriptor); +} + void ShaderEmitter::recLOOP(const PICAShader& shader, u32 instruction) { const u32 dest = getBits<10, 12>(instruction); const u32 uniformIndex = getBits<22, 2>(instruction); @@ -979,4 +1204,17 @@ void ShaderEmitter::recEND(const PICAShader& shader, u32 instruction) { RET(); } +void ShaderEmitter::recEX2(const PICAShader& shader, u32 instruction) { + const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; + const u32 src = getBits<12, 7>(instruction); + const u32 idx = getBits<19, 2>(instruction); + const u32 dest = getBits<21, 5>(instruction); + + loadRegister<1>(src1Vec, shader, src, idx, operandDescriptor); // Load source 1 into src1Vec + STR(X30, SP, POST_INDEXED, -16); + BL(exp2Func); + LDR(X30, SP, PRE_INDEXED, 16); + storeRegister(src1Vec, shader, dest, operandDescriptor); +} + #endif diff --git a/tests/shader.cpp b/tests/shader.cpp index edb2743f..29c07481 100644 --- a/tests/shader.cpp +++ b/tests/shader.cpp @@ -7,6 +7,7 @@ #include #include #include +#include using namespace Floats; static const nihstro::SourceRegister input0 = nihstro::SourceRegister::MakeInput(0); @@ -28,48 +29,59 @@ static std::unique_ptr assembleVertexShader(std::initializer_list shader = {}; + virtual void runShader() { shader->run(); } + public: explicit ShaderInterpreterTest(std::initializer_list code) : shader(assembleVertexShader(code)) {} - // 
Multiple inputs, singular scalar output - float runScalar(std::initializer_list inputs) { - usize inputIndex = 0; - for (const float& input : inputs) { - const std::array input_vec = std::array{f24::fromFloat32(input), f24::zero(), f24::zero(), f24::zero()}; - shader->inputs[inputIndex++] = input_vec; - } - shader->run(); - return shader->outputs[0][0]; + std::span> runTest(std::span> inputs) { + std::copy(inputs.begin(), inputs.end(), shader->inputs.begin()); + runShader(); + return shader->outputs; } + // Each input is written to the x component of sequential input registers + // The first output vector is returned + const std::array& runVector(std::initializer_list inputs) { + std::vector> inputsVec; + for (const float& input : inputs) { + const std::array inputVec = { + f24::fromFloat32(input), + f24::zero(), + f24::zero(), + f24::zero(), + }; + inputsVec.emplace_back(inputVec); + } + return runTest(inputsVec)[0]; + } + + // Each input is written to the x component of sequential input registers + // The x component of the first output + float runScalar(std::initializer_list inputs) { return runVector(inputs)[0].toFloat32(); } + + [[nodiscard]] std::array, 96>& floatUniforms() const { return shader->floatUniforms; } + [[nodiscard]] std::array, 4>& intUniforms() const { return shader->intUniforms; } + [[nodiscard]] u32& boolUniform() const { return shader->boolUniform; } + static std::unique_ptr assembleTest(std::initializer_list code) { return std::make_unique(code); } }; #if defined(PANDA3DS_SHADER_JIT_SUPPORTED) -class ShaderJITTest final { +class ShaderJITTest final : public ShaderInterpreterTest { private: - std::unique_ptr shader = {}; ShaderJIT shaderJit = {}; - public: - explicit ShaderJITTest(std::initializer_list code) : shader(assembleVertexShader(code)) { shaderJit.prepare(*shader.get()); } + void runShader() override { shaderJit.run(*shader); } - // Multiple inputs, singular scalar output - float runScalar(std::initializer_list inputs) { - usize 
inputIndex = 0; - for (const float& input : inputs) { - const std::array input_vec = std::array{f24::fromFloat32(input), f24::zero(), f24::zero(), f24::zero()}; - shader->inputs[inputIndex++] = input_vec; - } - shaderJit.run(*shader.get()); - return shader->outputs[0][0]; - } + public: + explicit ShaderJITTest(std::initializer_list code) : ShaderInterpreterTest(code) { shaderJit.prepare(*shader); } static std::unique_ptr assembleTest(std::initializer_list code) { return std::make_unique(code);