From fd411245fa6638c19ff08f0640e52c528566e86f Mon Sep 17 00:00:00 2001 From: wheremyfoodat Date: Fri, 9 Jun 2023 02:28:59 +0300 Subject: [PATCH] [Shader JIT] Get first JIT trangle --- .../PICA/dynapica/shader_rec_emitter_x64.hpp | 6 ++ include/PICA/shader.hpp | 21 +++--- .../PICA/dynapica/shader_rec_emitter_x64.cpp | 73 ++++++++++++++++--- 3 files changed, 82 insertions(+), 18 deletions(-) diff --git a/include/PICA/dynapica/shader_rec_emitter_x64.hpp b/include/PICA/dynapica/shader_rec_emitter_x64.hpp index 5c3c403c..27d1865f 100644 --- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp +++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp @@ -51,6 +51,9 @@ class ShaderEmitter : public Xbyak::CodeGenerator { const vec4f& getDestRef(const PICAShader& shader, u32 dest); // Instruction recompilation functions + void recADD(const PICAShader& shader, u32 instruction); + void recDP4(const PICAShader& shader, u32 instruction); + void recEND(const PICAShader& shader, u32 instruction); void recMOV(const PICAShader& shader, u32 instruction); public: @@ -64,6 +67,9 @@ public: const auto cpu = Xbyak::util::Cpu(); haveSSE4_1 = cpu.has(Xbyak::util::Cpu::tSSE41); + if (!cpu.has(Xbyak::util::Cpu::tSSE3)) { + Helpers::panic("This CPU does not support SSE3. Please use the shader interpreter instead"); + } } void compile(const PICAShader& shaderUnit); diff --git a/include/PICA/shader.hpp b/include/PICA/shader.hpp index 284438bf..bf550f41 100644 --- a/include/PICA/shader.hpp +++ b/include/PICA/shader.hpp @@ -73,6 +73,18 @@ class PICAShader { std::array floatUniformBuffer; // Buffer for temporarily caching float uniform data +public: + // These are placed close to the temp registers and co because it helps the JIT generate better code + u32 entrypoint = 0; // Initial shader PC + u32 boolUniform; + std::array, 4> intUniforms; + alignas(16) std::array floatUniforms; + + alignas(16) std::array fixedAttributes; // Fixed vertex attributes + alignas(16) std::array inputs; // Attributes passed to the shader + alignas(16) std::array outputs; + alignas(16) vec4f dummy = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() }); // Dummy register used by the JIT + protected: std::array operandDescriptors; alignas(16) std::array tempRegisters; // General purpose registers the shader can use for temp values @@ -191,15 +203,6 @@ public: std::array loadedShader; // Currently loaded & active shader std::array bufferedShader; // Shader to be transferred when the SH_CODETRANSFER_END reg gets written to - u32 entrypoint = 0; // Initial shader PC - u32 boolUniform; - std::array, 4> intUniforms; - alignas(16) std::array floatUniforms; - - alignas(16) std::array fixedAttributes; // Fixed vertex attributes - alignas(16) std::array inputs; // Attributes passed to the shader - alignas(16) std::array outputs; - PICAShader(ShaderType type) : type(type) {} // Theese functions are in the header to be inlined more easily, though with LTO I hope I'll be able to move them diff --git a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp index d06b9db2..219eb3d7 100644 --- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp +++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp @@ -2,6 +2,7 @@ #include "PICA/dynapica/shader_rec_emitter_x64.hpp" #include +#include #include using namespace Xbyak; @@ -11,7 +12,9 @@ using namespace Xbyak::util; static constexpr Reg64 statePointer = rbp; static constexpr Xmm scratch1 = xmm0; static constexpr Xmm scratch2 = xmm1; -static constexpr Xmm scratch3 = xmm2; +static constexpr Xmm src1_xmm = xmm2; +static constexpr Xmm src2_xmm = xmm3; +static constexpr Xmm src3_xmm = xmm4; void ShaderEmitter::compile(const PICAShader& shaderUnit) { // Emit prologue first @@ -71,15 +74,17 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) { const u32 opcode = instruction >> 26; switch (opcode) { + case ShaderOpcodes::ADD: recADD(shaderUnit, instruction); break; + case ShaderOpcodes::DP4: recDP4(shaderUnit, instruction); break; + case ShaderOpcodes::END: recEND(shaderUnit, instruction); break; case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break; + case ShaderOpcodes::NOP: break; default: Helpers::panic("Shader JIT: Unimplemented PICA opcode %X", opcode); } } const ShaderEmitter::vec4f& ShaderEmitter::getSourceRef(const PICAShader& shader, u32 src) { - alignas(16) static vec4f dummy = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() }); - if (src < 0x10) return shader.inputs[src]; else if (src < 0x20) @@ -88,7 +93,7 @@ const ShaderEmitter::vec4f& ShaderEmitter::getSourceRef(const PICAShader& shader return shader.floatUniforms[src - 0x20]; else { Helpers::warn("[Shader JIT] Unimplemented source value: %X\n", src); - return dummy; + return shader.dummy; } } @@ -132,7 +137,7 @@ void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 src, u3 movaps(dest, xword[statePointer + offset]); else // Swizzle is not trivial so we need to emit a shuffle instruction pshufd(dest, xword[statePointer + offset], convertedSwizzle); - return; + break; } default: @@ -142,8 +147,6 @@ void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 src, u3 if (negate) { Helpers::panic("[ShaderJIT] Unimplemented register negation"); } - - Helpers::panic("Reached unreachable path in PICAShader::getIndexedSource"); } void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest, u32 operandDescriptor) { @@ -151,9 +154,22 @@ void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest const uintptr_t offset = uintptr_t(&destRef) - uintptr_t(&shader); // Calculate offset of register from start of the state struct // Mask of which lanes to write + // TODO: If only 1 lane is being written to, use movss u32 writeMask = operandDescriptor & 0xf; if (writeMask == 0xf) { // No lanes are masked, just movaps movaps(xword[statePointer + offset], source); + } else if (std::popcount(writeMask) == 1) { // Only 1 register needs to be written back. This can be done with a simple shift right + movss + int bit = std::countr_zero(writeMask); // Get which PICA register needs to be written to (0 = w, 1 = z, etc) + size_t index = 3 - bit; + const uintptr_t lane_offset = offset + index * sizeof(float); + + if (index == 0) { // Bottom lane, no need to shift + movss(dword[statePointer + lane_offset], source); + } else { // Shift right by 32 * index, then write bottom lane + movaps(scratch1, source); + psrldq(scratch1, index * sizeof(float)); + movss(dword[statePointer + lane_offset], scratch1); + } } else if (haveSSE4_1) { // Bit reverse the write mask because that is what blendps expects u32 adjustedMask = ((writeMask >> 3) & 0b1) | ((writeMask >> 1) & 0b10) | ((writeMask << 1) & 0b100) | ((writeMask << 3) & 0b1000); @@ -176,14 +192,53 @@ void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest } } +void ShaderEmitter::recEND(const PICAShader& shader, u32 instruction) { + // Undo anything the prologue did and return + // Dellocate shadow stack on Windows + if constexpr (isWindows()) { + add(rsp, 32); + } + + // Restore registers + pop(statePointer); + ret(); +} + void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) { const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; u32 src = (instruction >> 12) & 0x7f; const u32 idx = (instruction >> 19) & 3; const u32 dest = (instruction >> 21) & 0x1f; - loadRegister<1>(scratch1, shader, src, idx, operandDescriptor); // Load source 1 into scratch1 - storeRegister(scratch1, shader, dest, operandDescriptor); + loadRegister<1>(src1_xmm, shader, src, idx, operandDescriptor); // Load source 1 into scratch1 + storeRegister(src1_xmm, shader, dest, operandDescriptor); +} + +void ShaderEmitter::recADD(const PICAShader& shader, u32 instruction) { + const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; + u32 src1 = (instruction >> 12) & 0x7f; + const u32 src2 = (instruction >> 7) & 0x1f; // src2 coming first because PICA moment + const u32 idx = (instruction >> 19) & 3; + const u32 dest = (instruction >> 21) & 0x1f; + + loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor); + loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor); + addps(src1_xmm, src2_xmm); // Dot product between the 2 register + storeRegister(src1_xmm, shader, dest, operandDescriptor); +} + +void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) { + const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; + u32 src1 = (instruction >> 12) & 0x7f; + const u32 src2 = (instruction >> 7) & 0x1f; // src2 coming first because PICA moment + const u32 idx = (instruction >> 19) & 3; + const u32 dest = (instruction >> 21) & 0x1f; + + // TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA) + loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor); + loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor); + dpps(src1_xmm, src2_xmm, 0b11111111); // Dot product between the 2 register, store the result in all lanes of scratch1 similarly to PICA + storeRegister(src1_xmm, shader, dest, operandDescriptor); } #endif \ No newline at end of file