diff --git a/include/PICA/dynapica/shader_rec.hpp b/include/PICA/dynapica/shader_rec.hpp index 9aa97947..711dabb0 100644 --- a/include/PICA/dynapica/shader_rec.hpp +++ b/include/PICA/dynapica/shader_rec.hpp @@ -15,10 +15,10 @@ class ShaderJIT { #ifdef PANDA3DS_SHADER_JIT_SUPPORTED using Hash = PICAShader::Hash; using ShaderCache = std::unordered_map<Hash, std::unique_ptr<ShaderEmitter>>; - ShaderEmitter::Callback activeShaderCallback; + ShaderEmitter::PrologueCallback prologueCallback; + ShaderEmitter::InstructionCallback entrypointCallback; ShaderCache cache; - void compileShader(PICAShader& shaderUnit); #endif public: @@ -26,8 +26,12 @@ public: // Call this before starting to process a batch of vertices // This will read the PICA config (uploaded shader and shader operand descriptors) and search if we've already compiled this shader // If yes, it sets it as the active shader. if not, then it compiles it, adds it to the cache, and sets it as active, + // The caller must make sure the entrypoint has been properly set beforehand void prepare(PICAShader& shaderUnit); void reset(); + void run(PICAShader& shaderUnit) { + prologueCallback(shaderUnit, entrypointCallback); + } static constexpr bool isAvailable() { return true; } #else @@ -42,6 +46,4 @@ public: void reset() {} static constexpr bool isAvailable() { return false; } #endif - - auto getCallback() { return activeShaderCallback; } }; \ No newline at end of file diff --git a/include/PICA/dynapica/shader_rec_emitter_x64.hpp b/include/PICA/dynapica/shader_rec_emitter_x64.hpp index b4022587..5b73e63b 100644 --- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp +++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp @@ -7,17 +7,51 @@ #include "xbyak/xbyak.h" #include "x64_regs.hpp" +#include <vector> + class ShaderEmitter : public Xbyak::CodeGenerator { static constexpr size_t executableMemorySize = PICAShader::maxInstructionCount * 96; // How much executable memory to alloc for each shader // Allocate some extra space as padding for security purposes in the
extremely unlikely occasion we manage to overflow the above size static constexpr size_t allocSize = executableMemorySize + 0x1000; + // An array of labels (incl pointers) to each compiled (to x64) PICA instruction + std::array<Xbyak::Label, PICAShader::maxInstructionCount> instructionLabels; + // A vector of PCs that can potentially return based on the state of the PICA callstack. + // Filled before compiling a shader by scanning the code for call instructions + std::vector<u32> returnPCs; + + u32 recompilerPC; // PC the recompiler is currently recompiling @ + + // Compile all instructions from [current recompiler PC, end) + void compileUntil(const PICAShader& shaderUnit, u32 endPC); + // Compile instruction "instr" + void compileInstruction(const PICAShader& shaderUnit); + + bool isCall(u32 instruction) { + const u32 opcode = instruction >> 26; + return (opcode == ShaderOpcodes::CALL) || (opcode == ShaderOpcodes::CALLC) || (opcode == ShaderOpcodes::CALLU); + } + // Scan the shader code for call instructions to fill up the returnPCs vector before starting compilation + void scanForCalls(const PICAShader& shader); + public: - using Callback = void(*)(const PICAShader& shaderUnit); + using InstructionCallback = void(*)(PICAShader& shaderUnit); // Callback type used for instructions + // Callback type used for the JIT prologue. This is what the caller will call + using PrologueCallback = void(*)(PICAShader& shaderUnit, InstructionCallback cb); + PrologueCallback prologueCb; // Initialize our emitter with "allocSize" bytes of RWX memory ShaderEmitter() : Xbyak::CodeGenerator(allocSize) {} void compile(const PICAShader& shaderUnit); + + // PC must be a valid entrypoint here.
It doesn't have that much overhead in this case, so we use std::array<>::at() to assert it does + InstructionCallback getInstructionCallback(u32 pc) { + return reinterpret_cast<InstructionCallback>(instructionLabels.at(pc).getAddress()); + } + + PrologueCallback getPrologueCallback() { + return prologueCb; + } }; #endif // x64 recompiler check \ No newline at end of file diff --git a/src/core/PICA/dynapica/shader_rec.cpp b/src/core/PICA/dynapica/shader_rec.cpp index 3029c0e2..0e182b60 100644 --- a/src/core/PICA/dynapica/shader_rec.cpp +++ b/src/core/PICA/dynapica/shader_rec.cpp @@ -17,9 +17,15 @@ void ShaderJIT::prepare(PICAShader& shaderUnit) { if (it == cache.end()) { // Block has not been compiled yet auto emitter = std::make_unique<ShaderEmitter>(); emitter->compile(shaderUnit); + // Get pointer to callbacks + entrypointCallback = emitter->getInstructionCallback(shaderUnit.entrypoint); + prologueCallback = emitter->getPrologueCallback(); + cache.emplace_hint(it, hash, std::move(emitter)); } else { // Block has been compiled and found, use it auto emitter = it->second.get(); + entrypointCallback = emitter->getInstructionCallback(shaderUnit.entrypoint); + prologueCallback = emitter->getPrologueCallback(); } } #endif // PANDA3DS_SHADER_JIT_SUPPORTED \ No newline at end of file diff --git a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp index 0511cd95..6179df0b 100644 --- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp +++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp @@ -4,8 +4,70 @@ using namespace Xbyak; using namespace Xbyak::util; -void ShaderEmitter::compile(const PICAShader& shaderUnit) { +// Register that points to PICA state +static constexpr Reg64 statePointer = rbp; +void ShaderEmitter::compile(const PICAShader& shaderUnit) { + // Emit prologue first + align(16); + prologueCb = getCurr<PrologueCallback>(); + + // We assume arg1 contains the pointer to the PICA state and arg2 a pointer to the code for the entrypoint + push(statePointer); //
Back up state pointer to stack. This also aligns rsp to 16 bytes for calls + mov(statePointer, (uintptr_t)&shaderUnit); // Set state pointer to the proper pointer + + // If we add integer register allocations they should be pushed here, and the rsp should be properly fixed up + // However most of the PICA is floating point so yeah + + // Allocate shadow stack on Windows + if constexpr (isWindows()) { + sub(rsp, 32); + } + // Tail call to shader code entrypoint + jmp(arg2); + align(16); + // Scan the shader code for call instructions and add them to the list of possible return PCs. We need to do this because the PICA callstack works + // Pretty weirdly + scanForCalls(shaderUnit); + + // Compile every instruction in the shader + // This sounds horrible but the PICA instruction memory is tiny, and most of the time it's padded with nops that compile to nothing + recompilerPC = 0; + compileUntil(shaderUnit, PICAShader::maxInstructionCount); +} + +void ShaderEmitter::scanForCalls(const PICAShader& shaderUnit) { + returnPCs.clear(); + + for (u32 i = 0; i < PICAShader::maxInstructionCount; i++) { + const u32 instruction = shaderUnit.loadedShader[i]; + if (isCall(instruction)) { + const u32 num = instruction & 0xff; // Num of instructions to execute + const u32 dest = (instruction >> 10) & 0xfff; // Starting subroutine address + const u32 returnPC = num + dest; // Add them to get the return PC + + returnPCs.push_back(returnPC); + } + } +} + +void ShaderEmitter::compileUntil(const PICAShader& shaderUnit, u32 end) { + while (recompilerPC < end) { + compileInstruction(shaderUnit); + } +} + +void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) { + // Write current location to label for this instruction + L(instructionLabels[recompilerPC]); + // Fetch instruction and inc PC + const u32 instruction = shaderUnit.loadedShader[recompilerPC++]; + const u32 opcode = instruction >> 26; + + switch (opcode) { + default: + Helpers::panic("ShaderJIT: Unimplemented PICA
opcode %X", opcode); + } } #endif \ No newline at end of file diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index d7fa8ce8..e0951080 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -203,7 +203,12 @@ void GPU::drawArrays() { std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f)); } - shaderUnit.vs.run(); + if constexpr (useShaderJIT) { + shaderJIT.run(shaderUnit.vs); + } else { + shaderUnit.vs.run(); + } + std::memcpy(&vertices[i].position, &shaderUnit.vs.outputs[0], sizeof(vec4f)); std::memcpy(&vertices[i].colour, &shaderUnit.vs.outputs[1], sizeof(vec4f)); std::memcpy(&vertices[i].UVs, &shaderUnit.vs.outputs[2], 2 * sizeof(f24));