diff --git a/include/PICA/dynapica/shader_rec_emitter_x64.hpp b/include/PICA/dynapica/shader_rec_emitter_x64.hpp index 0338911c..1052d6a0 100644 --- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp +++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp @@ -32,6 +32,8 @@ class ShaderEmitter : public Xbyak::CodeGenerator { Label negateVector; // Vector value of (1.0, 1.0, 1.0, 1.0) for SLT(i)/SGE(i) Label onesVector; + // Vector value of (0xFF, 0xFF, 0xFF, 0) for setting the w component to 0 in DP3 + Label dp3Vector; u32 recompilerPC = 0; // PC the recompiler is currently recompiling @ u32 loopLevel = 0; // The current loop nesting level (0 = not in a loop) @@ -49,6 +51,9 @@ class ShaderEmitter : public Xbyak::CodeGenerator { Xbyak::Label emitExp2Func(); Xbyak::util::Cpu cpuCaps; + // Emit a PICA200-compliant multiplication that handles "0 * inf = 0" + void emitSafeMUL(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch); + // Compile all instructions from [current recompiler PC, end) void compileUntil(const PICAShader& shaderUnit, u32 endPC); // Compile instruction "instr" diff --git a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp index c134b72f..e7bafe9f 100644 --- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp +++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp @@ -12,6 +12,9 @@ using namespace Xbyak; using namespace Xbyak::util; using namespace Helpers; +// TODO: Expose safe/unsafe optimizations to the user +constexpr bool useSafeMUL = false; + // The shader recompiler uses quite an odd internal ABI // We make use of the fact that in regular conditions, we should pretty much never be calling C++ code from recompiled shader code // This allows us to establish an ABI that's optimized for this sort of workflow, statically allocating volatile host registers @@ -45,6 +48,16 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) { L(onesVector); dd(0x3f800000); dd(0x3f800000); dd(0x3f800000); 
dd(0x3f800000); // 1.0 4 times + if (useSafeMUL) { + // When doing safe mul, we need a vector to set only the w component to 0 for DP3 + L(dp3Vector); + + dd(0xFFFFFFFF); + dd(0xFFFFFFFF); + dd(0xFFFFFFFF); + dd(0); + } + // Emit prologue first align(16); prologueCb = getCurr(); @@ -523,24 +536,60 @@ void ShaderEmitter::recDP3(const PICAShader& shader, u32 instruction) { const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); - // TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA) loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor); loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor); - dpps(src1_xmm, src2_xmm, 0b01111111); // 3-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA + + if (!useSafeMUL) { + dpps(src1_xmm, src2_xmm, 0b01111111); + } else { + const u32 writeMask = operandDescriptor & 0xf; + + // Set w component to 0 and do a DP4 + andps(src1_xmm, xword[rip + dp3Vector]); + + // Set src1 to src1 * src2, then get the dot product by doing 2 horizontal adds + emitSafeMUL(src1_xmm, src2_xmm, scratch1); + haddps(src1_xmm, src1_xmm); + haddps(src1_xmm, src1_xmm); + + // If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx + // Otherwise we do + if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x + shufps(src1_xmm, src1_xmm, 0); // src1_xmm = src1_xmm.xxxx + } + } + storeRegister(src1_xmm, shader, dest, operandDescriptor); } void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) { const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; const u32 src1 = getBits<12, 7>(instruction); - const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment const u32 idx = getBits<19, 
2>(instruction); const u32 dest = getBits<21, 5>(instruction); - // TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA) loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor); loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor); - dpps(src1_xmm, src2_xmm, 0b11111111); // 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA + + if (!useSafeMUL) { + // 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA + dpps(src1_xmm, src2_xmm, 0b11111111); + } else { + const u32 writeMask = operandDescriptor & 0xf; + + // Set src1 to src1 * src2, then get the dot product by doing 2 horizontal adds + emitSafeMUL(src1_xmm, src2_xmm, scratch1); + haddps(src1_xmm, src1_xmm); + haddps(src1_xmm, src1_xmm); + + // If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx + // Otherwise we do + if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x + shufps(src1_xmm, src1_xmm, 0); // src1_xmm = src1_xmm.xxxx + } + } + storeRegister(src1_xmm, shader, dest, operandDescriptor); } @@ -553,7 +602,6 @@ void ShaderEmitter::recDPH(const PICAShader& shader, u32 instruction) { const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); - // TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA) loadRegister<1>(src1_xmm, shader, src1, isDPHI ? 0 : idx, operandDescriptor); loadRegister<2>(src2_xmm, shader, src2, isDPHI ? 
idx : 0, operandDescriptor); @@ -566,7 +614,25 @@ void ShaderEmitter::recDPH(const PICAShader& shader, u32 instruction) { unpcklpd(src1_xmm, scratch1); } - dpps(src1_xmm, src2_xmm, 0b11111111); // 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA + // Now perform a DP4 + if (!useSafeMUL) { + // 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA + dpps(src1_xmm, src2_xmm, 0b11111111); + } else { + const u32 writeMask = operandDescriptor & 0xf; + + // Set src1 to src1 * src2, then get the dot product by doing 2 horizontal adds + emitSafeMUL(src1_xmm, src2_xmm, scratch1); + haddps(src1_xmm, src1_xmm); + haddps(src1_xmm, src1_xmm); + + // If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx + // Otherwise we do + if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x + shufps(src1_xmm, src1_xmm, 0); // src1_xmm = src1_xmm.xxxx + } + } + storeRegister(src1_xmm, shader, dest, operandDescriptor); } @@ -603,10 +669,15 @@ void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) { const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); - // TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA) loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor); loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor); - mulps(src1_xmm, src2_xmm); + + if (!useSafeMUL) { + mulps(src1_xmm, src2_xmm); + } else { + emitSafeMUL(src1_xmm, src2_xmm, scratch1); + } + storeRegister(src1_xmm, shader, dest, operandDescriptor); } @@ -662,23 +733,31 @@ void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) { loadRegister<2>(src2_xmm, shader, src2, isMADI ? 0 : idx, operandDescriptor); loadRegister<3>(src3_xmm, shader, src3, isMADI ? 
idx : 0, operandDescriptor); - // TODO: Implement safe PICA mul // If we have FMA3, optimize MAD to use FMA - if (haveFMA3) { - vfmadd213ps(src1_xmm, src2_xmm, src3_xmm); - storeRegister(src1_xmm, shader, dest, operandDescriptor); - } - - // If we don't have FMA3, do a multiplication and addition - else { - // Multiply src1 * src2 - if (haveAVX) { - vmulps(scratch1, src1_xmm, src2_xmm); - } else { - movaps(scratch1, src1_xmm); - mulps(scratch1, src2_xmm); + if (!useSafeMUL) { + if (haveFMA3) { + vfmadd213ps(src1_xmm, src2_xmm, src3_xmm); + storeRegister(src1_xmm, shader, dest, operandDescriptor); } + // If we don't have FMA3, do a multiplication and addition + else { + // Multiply src1 * src2 + if (haveAVX) { + vmulps(scratch1, src1_xmm, src2_xmm); + } else { + movaps(scratch1, src1_xmm); + mulps(scratch1, src2_xmm); + } + + // Add src3 + addps(scratch1, src3_xmm); + storeRegister(scratch1, shader, dest, operandDescriptor); + } + } else { + movaps(scratch1, src1_xmm); + emitSafeMUL(scratch1, src2_xmm, src1_xmm); + // Add src3 addps(scratch1, src3_xmm); storeRegister(scratch1, shader, dest, operandDescriptor); @@ -1115,6 +1194,41 @@ Xbyak::Label ShaderEmitter::emitLog2Func() { return subroutine; } +void ShaderEmitter::emitSafeMUL(Xmm src1, Xmm src2, Xmm scratch) { + // 0 * inf and inf * 0 in the PICA should return 0 instead of NaN + // This can be done by checking for NaNs before and after a multiplication + // To do this we can create a mask of which components of src1/src2 are NOT NaN using cmpordps (cmpps with imm = 7) + // Then we multiply src1 and src2 and create a mask of which components of the result ARE NaN using cmpunordps + // If the NaNs didn't exist (ie they were created by 0 * inf) before then we set them to 0 by XORing the 2 masks and ANDing the multiplication + // result with the xor result + // Based on Citra implementation, particularly the AVX-512 version + + if (cpuCaps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) { + const Xbyak::Opmask zeroMask = k1; +
+ vmulps(scratch, src1, src2); + // Mask of any NaN values found in the result + vcmpunordps(zeroMask, scratch, scratch); + // Mask of any non-NaN inputs producing NaN results + vcmpordps(zeroMask | zeroMask, src1, src2); + + knotb(zeroMask, zeroMask); + vmovaps(src1 | zeroMask | T_z, scratch); + } else { + if (haveAVX) { + vcmpordps(scratch, src1, src2); + } else { + movaps(scratch, src1); + cmpordps(scratch, src2); + } + + mulps(src1, src2); + cmpunordps(src2, src1); + xorps(src2, scratch); + andps(src1, src2); + } +} + Xbyak::Label ShaderEmitter::emitExp2Func() { Xbyak::Label subroutine;