From 28e9d87573ac72e984e4f0a84a4e90585af5c216 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Wed, 28 Jun 2023 22:22:09 +0300 Subject: [PATCH] [Shader JIT] Start migrating to new, better ABI --- .../PICA/dynapica/shader_rec_emitter_x64.hpp | 6 +- include/PICA/dynapica/x64_regs.hpp | 2 + .../PICA/dynapica/shader_rec_emitter_x64.cpp | 57 ++++++++----------- 3 files changed, 29 insertions(+), 36 deletions(-) diff --git a/include/PICA/dynapica/shader_rec_emitter_x64.hpp b/include/PICA/dynapica/shader_rec_emitter_x64.hpp index 636f88c2..4b26b80c 100644 --- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp +++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp @@ -42,9 +42,6 @@ class ShaderEmitter : public Xbyak::CodeGenerator { // Compile instruction "instr" void compileInstruction(const PICAShader& shaderUnit); - // Get the offset to be added to the rsp register to get the current return address - size_t getStackOffsetOfReturnPC(); - bool isCall(u32 instruction) { const u32 opcode = instruction >> 26; return (opcode == ShaderOpcodes::CALL) || (opcode == ShaderOpcodes::CALLC) || (opcode == ShaderOpcodes::CALLU); @@ -68,6 +65,9 @@ class ShaderEmitter : public Xbyak::CodeGenerator { // Result is returned in the zero flag. If the comparison is true then zero == 0, else zero == 1 (Opposite of checkCmpRegister) void checkBoolUniform(const PICAShader& shader, u32 instruction); + // Emit a call to a C++ function + void callCppFunc(void* function) { Helpers::panic("[ShaderJIT] Unimplemented: Add support for calling C++ functions in JITted code"); } + // Instruction recompilation functions void recADD(const PICAShader& shader, u32 instruction); void recCALL(const PICAShader& shader, u32 instruction); diff --git a/include/PICA/dynapica/x64_regs.hpp b/include/PICA/dynapica/x64_regs.hpp index 47bcec7d..16bc7ca3 100644 --- a/include/PICA/dynapica/x64_regs.hpp +++ b/include/PICA/dynapica/x64_regs.hpp @@ -6,6 +6,7 @@ using namespace Xbyak; using namespace Xbyak::util; #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) +#define PANDA3DS_MS_ABI constexpr Reg32 arg1 = ecx; // register where first arg is stored constexpr Reg32 arg2 = edx; // register where second arg is stored constexpr Reg32 arg3 = r8d; // register where third arg is stored @@ -20,6 +21,7 @@ constexpr Xmm arg4f = xmm3; constexpr bool isWindows() { return true; } #else // System V calling convention +#define PANDA3DS_SYSV_ABI constexpr Reg32 arg1 = edi; constexpr Reg32 arg2 = esi; constexpr Reg32 arg3 = edx; diff --git a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp index 596dcb02..68b6fb08 100644 --- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp +++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp @@ -12,14 +12,31 @@ using namespace Xbyak; using namespace Xbyak::util; using namespace Helpers; -// Register that points to PICA state -static constexpr Reg64 statePointer = rbp; +// The shader recompiler uses quite an odd internal ABI +// We make use of the fact that in regular conditions, we should pretty much never be calling C++ code from recompiled shader code +// This allows us to establish an ABI that's optimized for this sort of workflow, statically allocating volatile host registers +// To avoid pushing/popping registers, not properly maintaining stack alignment, etc +// This generates faster recompiled code at the cost of being actively hostile against C++ interop +// To do C++ interop, we're going to have our callCppFunc function call the C++ function, and take extreme measures to ensure we don't violate +// The host ABI, such as pushing/popping our temporary registers (derp), force aligning the stack and setting up an entire stack frame, etc +// This is slow, but we do not care as we should never be calling C++ code in normal, non-debugging conditions +// The only code that might be called are helpers that are also written in assembly, for things like log2 + static constexpr Xmm scratch1 = xmm0; static constexpr Xmm scratch2 = xmm1; static constexpr Xmm src1_xmm = xmm2; static constexpr Xmm src2_xmm = xmm3; static constexpr Xmm src3_xmm = xmm4; +#if defined(PANDA3DS_MS_ABI) +// Register that points to PICA state. Must be volatile for the aforementioned reasons +static constexpr Reg64 statePointer = r8; +#elif defined(PANDA3DS_SYSV_ABI) +static constexpr Reg64 statePointer = rdi; +#else +#error Unknown ABI for x86-64 shader JIT +#endif + void ShaderEmitter::compile(const PICAShader& shaderUnit) { // Constants align(16); @@ -32,7 +49,7 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) { // We assume arg1 contains the pointer to the PICA state and arg2 a pointer to the code for the entrypoint push(statePointer); // Back up state pointer to stack. This also aligns rsp to 16 bytes for calls - mov(statePointer, (uintptr_t)&shaderUnit); // Set state pointer to the proper pointer + mov(statePointer, arg1.cvt64()); // Set state pointer to the proper pointer // If we add integer register allocations they should be pushed here, and the rsp should be properly fixed up // However most of the PICA is floating point so yeah @@ -81,26 +98,19 @@ void ShaderEmitter::compileUntil(const PICAShader& shaderUnit, u32 end) { } } -// This is the offset we need to add to rsp to peek the next return address in the callstack -// Ie the PICA PC address which, when reached, will trigger a return -size_t ShaderEmitter::getStackOffsetOfReturnPC() { - size_t ret = isWindows() ? (8 + 32) : 8; // Offset assuming 0 loop level - return ret; -} - void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) { // Write current location to label for this instruction L(instructionLabels[recompilerPC]); // See if PC is a possible return PC and emit the proper code if so if (std::binary_search(returnPCs.begin(), returnPCs.end(), recompilerPC)) { - const auto stackOffsetForPC = getStackOffsetOfReturnPC(); + uintptr_t stackOffsetForPC = isWindows() ? (8 + 32) : 8; Label end; // Check if return address == recompilerPC, ie if we should return cmp(dword[rsp + stackOffsetForPC], recompilerPC); - jne(end); // If not, continue with uor lives - ret(); // If yes, return + jne(end); // If not, continue with our lives + ret(); // If yes, return L(end); } @@ -793,26 +803,7 @@ void ShaderEmitter::recLOOP(const PICAShader& shader, u32 instruction) { add(eax, 1); // The iteration count is actually uniform.x + 1 mov(dword[statePointer + loopRegOffset], ecx); // Set loop counter - // TODO: This might break if an instruction in a loop decides to yield... - push(rax); // Push loop iteration counter - push(rdx); // Push loop increment - if constexpr (isWindows()) - sub(rsp, 32); - - Label loopStart; - L(loopStart); - compileUntil(shader, dest + 1); - - const size_t stackOffsetOfLoopIncrement = isWindows() ? 32 : 0; - const size_t stackOffsetOfIterationCounter = stackOffsetOfLoopIncrement + 8; - - mov(ecx, dword[rsp + stackOffsetOfLoopIncrement]); // ecx = Loop increment - add(dword[statePointer + loopRegOffset], ecx); // Increment loop counter - sub(dword[rsp + stackOffsetOfIterationCounter], 1); // Subtract 1 from loop iteration counter - - jnz(loopStart); // Back to loop start if not over - add(rsp, isWindows() ? (16 + 32) : 16); - loopLevel--; + Helpers::panic("Unimplemented LOOP instruction"); } #endif \ No newline at end of file