diff --git a/include/PICA/dynapica/shader_rec_emitter_x64.hpp b/include/PICA/dynapica/shader_rec_emitter_x64.hpp index 99517bd1..5c3c403c 100644 --- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp +++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp @@ -5,6 +5,7 @@ #include "helpers.hpp" #include "PICA/shader.hpp" #include "xbyak/xbyak.h" +#include "xbyak/xbyak_util.h" #include "x64_regs.hpp" #include @@ -14,13 +15,20 @@ class ShaderEmitter : public Xbyak::CodeGenerator { // Allocate some extra space as padding for security purposes in the extremely unlikely occasion we manage to overflow the above size static constexpr size_t allocSize = executableMemorySize + 0x1000; + // If the swizzle field is this value then the swizzle pattern is .xyzw so we don't need a shuffle + static constexpr uint noSwizzle = 0x1B; + + using f24 = Floats::f24; + using vec4f = OpenGL::Vector; + // An array of labels (incl pointers) to each compiled (to x64) PICA instruction std::array instructionLabels; // A vector of PCs that can potentially return based on the state of the PICA callstack. 
// Filled before compiling a shader by scanning the code for call instructions std::vector returnPCs; - u32 recompilerPC; // PC the recompiler is currently recompiling @ + u32 recompilerPC = 0; // PC the recompiler is currently recompiling @ + bool haveSSE4_1 = false; // Shows if the CPU supports SSE4.1 // Compile all instructions from [current recompiler PC, end) void compileUntil(const PICAShader& shaderUnit, u32 endPC); @@ -35,7 +43,12 @@ class ShaderEmitter : public Xbyak::CodeGenerator { void scanForCalls(const PICAShader& shaderUnit); // Load register with number "srcReg" indexed by index "idx" into the xmm register "reg" - void loadRegister(Xmm dest, const PICAShader& shader, u32 srcReg, u32 idx); + template + void loadRegister(Xmm dest, const PICAShader& shader, u32 src, u32 idx, u32 operandDescriptor); + void storeRegister(Xmm source, const PICAShader& shader, u32 dest, u32 operandDescriptor); + + const vec4f& getSourceRef(const PICAShader& shader, u32 src); + const vec4f& getDestRef(const PICAShader& shader, u32 dest); // Instruction recompilation functions void recMOV(const PICAShader& shader, u32 instruction); @@ -44,15 +57,22 @@ public: using InstructionCallback = const void(*)(PICAShader& shaderUnit); // Callback type used for instructions // Callback type used for the JIT prologue. This is what the caller will call using PrologueCallback = const void(*)(PICAShader& shaderUnit, InstructionCallback cb); - PrologueCallback prologueCb; + PrologueCallback prologueCb = nullptr; // Initialize our emitter with "allocSize" bytes of RWX memory - ShaderEmitter() : Xbyak::CodeGenerator(allocSize) {} + ShaderEmitter() : Xbyak::CodeGenerator(allocSize) { + const auto cpu = Xbyak::util::Cpu(); + + haveSSE4_1 = cpu.has(Xbyak::util::Cpu::tSSE41); + } + void compile(const PICAShader& shaderUnit); // PC must be a valid entrypoint here. 
It doesn't have that much overhead in this case, so we use std::array<>::at() to assert it does InstructionCallback getInstructionCallback(u32 pc) { - return reinterpret_cast(instructionLabels.at(pc).getAddress()); + // Cast away the constness because casting to a function pointer is hard otherwise. Legal as long as we don't write to *ptr + uint8_t* ptr = const_cast(instructionLabels.at(pc).getAddress()); + return reinterpret_cast(ptr); } PrologueCallback getPrologueCallback() { diff --git a/include/PICA/shader.hpp b/include/PICA/shader.hpp index a2f830b1..284438bf 100644 --- a/include/PICA/shader.hpp +++ b/include/PICA/shader.hpp @@ -42,6 +42,7 @@ namespace ShaderOpcodes { }; } +// Note: All PICA f24 vec4 registers must have the alignas(16) specifier to make them easier to access in SSE/NEON code in the JIT class PICAShader { using f24 = Floats::f24; using vec4f = OpenGL::Vector; @@ -74,7 +75,7 @@ class PICAShader { protected: std::array operandDescriptors; - std::array tempRegisters; // General purpose registers the shader can use for temp values + alignas(16) std::array tempRegisters; // General purpose registers the shader can use for temp values OpenGL::Vector addrRegister; // Address register bool cmpRegister[2]; // Comparison registers where the result of CMP is stored in u32 loopCounter; @@ -104,10 +105,10 @@ protected: friend class ShaderJIT; friend class ShaderEmitter; -private: vec4f getSource(u32 source); vec4f& getDest(u32 dest); +private: // Interpreter functions for the various shader functions void add(u32 instruction); void call(u32 instruction); @@ -193,11 +194,11 @@ public: u32 entrypoint = 0; // Initial shader PC u32 boolUniform; std::array, 4> intUniforms; - std::array floatUniforms; + alignas(16) std::array floatUniforms; - std::array fixedAttributes; // Fixed vertex attributes - std::array inputs; // Attributes passed to the shader - std::array outputs; + alignas(16) std::array fixedAttributes; // Fixed vertex attributes + alignas(16) 
std::array inputs; // Attributes passed to the shader + alignas(16) std::array outputs; PICAShader(ShaderType type) : type(type) {} diff --git a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp index bc6bd916..d06b9db2 100644 --- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp +++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp @@ -73,27 +73,117 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) { switch (opcode) { case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break; default: - Helpers::panic("ShaderJIT: Unimplemented PICA opcode %X", opcode); + Helpers::panic("Shader JIT: Unimplemented PICA opcode %X", opcode); } } -void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 srcReg, u32 index) { +const ShaderEmitter::vec4f& ShaderEmitter::getSourceRef(const PICAShader& shader, u32 src) { + alignas(16) static vec4f dummy = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() }); + if (src < 0x10) + return shader.inputs[src]; + else if (src < 0x20) + return shader.tempRegisters[src - 0x10]; + else if (src <= 0x7f) + return shader.floatUniforms[src - 0x20]; + else { + Helpers::warn("[Shader JIT] Unimplemented source value: %X\n", src); + return dummy; + } +} + +const ShaderEmitter::vec4f& ShaderEmitter::getDestRef(const PICAShader& shader, u32 dest) { + if (dest < 0x10) { + return shader.outputs[dest]; + } else if (dest < 0x20) { + return shader.tempRegisters[dest - 0x10]; + } + Helpers::panic("[Shader JIT] Unimplemented dest: %X", dest); +} + +// See shader.hpp header for docs on how the swizzle and negate works +template +void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 src, u32 index, u32 operandDescriptor) { + u32 compSwizzle; // Component swizzle pattern for the register + bool negate; // If true, negate all lanes of the register + + if constexpr (sourceIndex == 1) { // SRC1 + negate = ((operandDescriptor >> 4) & 1) != 0; + 
compSwizzle = (operandDescriptor >> 5) & 0xff; + } + else if constexpr (sourceIndex == 2) { // SRC2 + negate = ((operandDescriptor >> 13) & 1) != 0; + compSwizzle = (operandDescriptor >> 14) & 0xff; + } + else if constexpr (sourceIndex == 3) { // SRC3 + negate = ((operandDescriptor >> 22) & 1) != 0; + compSwizzle = (operandDescriptor >> 23) & 0xff; + } + + // PICA has the swizzle descriptor inverted in comparison to x86. For the PICA, the descriptor is (lowest to highest bits) wzyx while it's xyzw for x86 + u32 convertedSwizzle = ((compSwizzle >> 6) & 0b11) | (((compSwizzle >> 4) & 0b11) << 2) | (((compSwizzle >> 2) & 0b11) << 4) | ((compSwizzle & 0b11) << 6); + + if (negate) { // Negation is not emitted yet: bail out before generating a load that would silently drop it + Helpers::panic("[ShaderJIT] Unimplemented register negation"); + } + + switch (index) { + case 0: [[likely]] { // Keep src as is, no need to offset it + const vec4f& srcRef = getSourceRef(shader, src); + const uintptr_t offset = uintptr_t(&srcRef) - uintptr_t(&shader); // Calculate offset of register from start of the state struct + + if (compSwizzle == noSwizzle) // Avoid emitting swizzle if not necessary + movaps(dest, xword[statePointer + offset]); + else // Swizzle is not trivial so we need to emit a shuffle instruction + pshufd(dest, xword[statePointer + offset], convertedSwizzle); + return; + } + + default: + Helpers::panic("[ShaderJIT]: Unimplemented source index type"); + } + + Helpers::panic("Reached unreachable path in ShaderEmitter::loadRegister"); +} + +void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest, u32 operandDescriptor) { + const vec4f& destRef = getDestRef(shader, dest); + const uintptr_t offset = uintptr_t(&destRef) - uintptr_t(&shader); // Calculate offset of register from start of the state struct + + // Mask of which lanes to write + u32 writeMask = operandDescriptor & 0xf; + if (writeMask == 0xf) { // No lanes are masked, just movaps + 
movaps(xword[statePointer + offset], source); + } else if (haveSSE4_1) { + // Bit reverse the write 
mask because that is what blendps expects + u32 adjustedMask = ((writeMask >> 3) & 0b1) | ((writeMask >> 1) & 0b10) | ((writeMask << 1) & 0b100) | ((writeMask << 3) & 0b1000); + movaps(scratch1, xword[statePointer + offset]); // Read current value of dest + blendps(scratch1, source, adjustedMask); // Blend with source + movaps(xword[statePointer + offset], scratch1); // Write back + } else { + // Blend algo referenced from Citra + const u8 selector = (((writeMask & 0b1000) ? 1 : 0) << 0) | + (((writeMask & 0b0100) ? 3 : 2) << 2) | + (((writeMask & 0b0010) ? 0 : 1) << 4) | + (((writeMask & 0b0001) ? 2 : 3) << 6); + + movaps(scratch1, xword[statePointer + offset]); + movaps(scratch2, source); + unpckhps(scratch2, scratch1); // Unpack X/Y components of source and destination + unpcklps(scratch1, source); // Unpack Z/W components of source and destination + shufps(scratch1, scratch2, selector); // "merge-shuffle" dest and source using selector + movaps(xword[statePointer + offset], scratch1); // Write back + } } void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) { - /* const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; u32 src = (instruction >> 12) & 0x7f; const u32 idx = (instruction >> 19) & 3; const u32 dest = (instruction >> 21) & 0x1f; - src = getIndexedSource(src, idx); - vec4f srcVector = getSourceSwizzled<1>(src, operandDescriptor); - vec4f& destVector = getDest(dest); - - u32 componentMask = operandDescriptor & 0xf; - */ + loadRegister<1>(scratch1, shader, src, idx, operandDescriptor); // Load source 1 into scratch1 + storeRegister(scratch1, shader, dest, operandDescriptor); // FIXME: on masked writes storeRegister first reloads scratch1 with the old dest value, overwriting this source. Needs a source register storeRegister doesn't use as a temporary } #endif \ No newline at end of file