From 137b65284020876536266e6901373a0beed13ef8 Mon Sep 17 00:00:00 2001 From: wheremyfoodat Date: Sun, 11 Jun 2023 23:43:47 +0300 Subject: [PATCH] [Shader JIT] Attempt to add CALLC/JMPC/JMPU/LOOP --- .../PICA/dynapica/shader_rec_emitter_x64.hpp | 8 ++ include/logger.hpp | 1 + .../PICA/dynapica/shader_rec_emitter_x64.cpp | 101 +++++++++++++++++- 3 files changed, 107 insertions(+), 3 deletions(-) diff --git a/include/PICA/dynapica/shader_rec_emitter_x64.hpp b/include/PICA/dynapica/shader_rec_emitter_x64.hpp index ea391973..636f88c2 100644 --- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp +++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp @@ -3,6 +3,7 @@ // Only do anything if we're on an x64 target with JIT support enabled #if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_X64_HOST) #include "helpers.hpp" +#include "logger.hpp" #include "PICA/shader.hpp" #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" @@ -31,6 +32,8 @@ class ShaderEmitter : public Xbyak::CodeGenerator { Label negateVector; u32 recompilerPC = 0; // PC the recompiler is currently recompiling @ + u32 loopLevel = 0; // The current loop nesting level (0 = not in a loop) + bool haveSSE4_1 = false; // Shows if the CPU supports SSE4.1 bool haveAVX = false; // Shows if the CPU supports AVX (NOT AVX2, NOT AVX512. Regular AVX) @@ -39,6 +42,9 @@ class ShaderEmitter : public Xbyak::CodeGenerator { // Compile instruction "instr" void compileInstruction(const PICAShader& shaderUnit); + // Get the offset to be added to the rsp register to get the current return address + size_t getStackOffsetOfReturnPC(); + bool isCall(u32 instruction) { const u32 opcode = instruction >> 26; return (opcode == ShaderOpcodes::CALL) || (opcode == ShaderOpcodes::CALLC) || (opcode == ShaderOpcodes::CALLU); @@ -91,6 +97,8 @@ class ShaderEmitter : public Xbyak::CodeGenerator { void recSLT(const PICAShader& shader, u32 instruction); void recSLTI(const PICAShader& shader, u32 instruction); + MAKE_LOG_FUNCTION(log, shaderJITLogger) + public: using InstructionCallback = const void(*)(PICAShader& shaderUnit); // Callback type used for instructions // Callback type used for the JIT prologue. This is what the caller will call diff --git a/include/logger.hpp b/include/logger.hpp index 2ec69699..62129707 100644 --- a/include/logger.hpp +++ b/include/logger.hpp @@ -26,6 +26,7 @@ namespace Log { static Logger threadLogger; static Logger gpuLogger; static Logger rendererLogger; + static Logger shaderJITLogger; // Service loggers static Logger acLogger; diff --git a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp index 9b9a3b04..33fefb58 100644 --- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp +++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp @@ -51,6 +51,7 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) { // Compile every instruction in the shader // This sounds horrible but the PICA instruction memory is tiny, and most of the time it's padded wtih nops that compile to nothing recompilerPC = 0; + loopLevel = 0; compileUntil(shaderUnit, PICAShader::maxInstructionCount); } @@ -78,15 +79,20 @@ void ShaderEmitter::compileUntil(const PICAShader& shaderUnit, u32 end) { } } +// This is the offset we need to add to rsp to peek the next return address in the callstack +// Ie the PICA PC address which, when reached, will trigger a return +size_t ShaderEmitter::getStackOffsetOfReturnPC() { + size_t ret = isWindows() ? (8 + 32) : 8; // Offset assuming 0 loop level + return ret; +} + void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) { // Write current location to label for this instruction L(instructionLabels[recompilerPC]); // See if PC is a possible return PC and emit the proper code if so if (std::binary_search(returnPCs.begin(), returnPCs.end(), recompilerPC)) { - // This is the offset we need to add to rsp to peek the next return address in the callstack - // Ie the PICA PC address which, when reached, will trigger a return - const auto stackOffsetForPC = isWindows() ? (8 + 32) : 8; + const auto stackOffsetForPC = getStackOffsetOfReturnPC(); Label end; // Check if return address == recompilerPC, ie if we should return @@ -106,6 +112,9 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) { case ShaderOpcodes::CALL: recCALL(shaderUnit, instruction); break; + case ShaderOpcodes::CALLC: + recCALLC(shaderUnit, instruction); + break; case ShaderOpcodes::CMP1: case ShaderOpcodes::CMP2: recCMP(shaderUnit, instruction); break; @@ -114,6 +123,9 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) { case ShaderOpcodes::END: recEND(shaderUnit, instruction); break; case ShaderOpcodes::IFC: recIFC(shaderUnit, instruction); break; case ShaderOpcodes::IFU: recIFU(shaderUnit, instruction); break; + case ShaderOpcodes::JMPC: recJMPC(shaderUnit, instruction); break; + case ShaderOpcodes::JMPU: recJMPU(shaderUnit, instruction); break; + case ShaderOpcodes::LOOP: recLOOP(shaderUnit, instruction); break; case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break; case ShaderOpcodes::MOVA: recMOVA(shaderUnit, instruction); break; case ShaderOpcodes::MAX: recMAX(shaderUnit, instruction); break; @@ -679,4 +691,87 @@ void ShaderEmitter::recCALL(const PICAShader& shader, u32 instruction) { add(rsp, isWindows() ? (8 + 32) : 8); } +void ShaderEmitter::recCALLC(const PICAShader& shader, u32 instruction) { + Label skipCall; + + // z is 1 if the call should be taken, 0 otherwise + checkCmpRegister(shader, instruction); + jnz(skipCall); + recCALL(shader, instruction); + + L(skipCall); +} + +void ShaderEmitter::recJMPC(const PICAShader& shader, u32 instruction) { + const u32 dest = getBits<10, 12>(instruction); + + Label& l = instructionLabels[dest]; + // Z is 1 if the comparison is true + checkCmpRegister(shader, instruction); + jz(l, T_NEAR); +} + +void ShaderEmitter::recJMPU(const PICAShader& shader, u32 instruction) { + bool jumpIfFalse = instruction & 1; // If the LSB is 0 we want to compare to true, otherwise compare to false + const u32 dest = getBits<10, 12>(instruction); + + Label& l = instructionLabels[dest]; + // Z is 0 if the uniform is true + checkBoolUniform(shader, instruction); + + if (jumpIfFalse) { + jz(l, T_NEAR); + } else { + jnz(l, T_NEAR); + } +} + +void ShaderEmitter::recLOOP(const PICAShader& shader, u32 instruction) { + const u32 dest = getBits<10, 12>(instruction); + const u32 uniformIndex = getBits<22, 2>(instruction); + + if (loopLevel > 0) { + log("[Shader JIT] Detected nested loop. Might be broken?\n"); + } + + if (dest < recompilerPC) { + Helpers::panic("[Shader JIT] Detected backwards loop\n"); + } + + loopLevel++; + + // Offset of the uniform + const uintptr_t uniformOffset = uintptr_t(&shader.intUniforms[uniformIndex]) - uintptr_t(&shader); + // Offset of the loop register + const uintptr_t loopRegOffset = uintptr_t(&shader.loopCounter) - uintptr_t(&shader); + + movzx(eax, byte[statePointer + uniformOffset]); // eax = loop iteration count + movzx(ecx, byte[statePointer + uniformOffset + sizeof(u8)]); // ecx = initial loop counter value + movzx(edx, byte[statePointer + uniformOffset + 2 * sizeof(u8)]); // edx = loop increment + + add(eax, 1); // The iteration count is actually uniform.x + 1 + mov(dword[statePointer + loopRegOffset], ecx); // Set loop counter + + // TODO: This might break if an instruction in a loop decides to yield... + push(rax); // Push loop iteration counter + push(rdx); // Push loop increment + if constexpr (isWindows()) + sub(rsp, 32); + + Label loopStart; + L(loopStart); + compileUntil(shader, dest + 1); + + const size_t stackOffsetOfLoopIncrement = isWindows() ? 32 : 0; + const size_t stackOffsetOfIterationCounter = stackOffsetOfLoopIncrement + 8; + + mov(ecx, dword[rsp + stackOffsetOfLoopIncrement]); // ecx = Loop increment + add(dword[statePointer + loopRegOffset], ecx); // Increment loop counter + sub(dword[rsp + stackOffsetOfIterationCounter], 1); // Subtract 1 from loop iteration counter + + jnz(loopStart); // Back to loop start if not over + add(rsp, isWindows() ? (16 + 32) : 16); + loopLevel--; +} + #endif \ No newline at end of file