From df414acc2322ae4260a1a11d9663bae3be88a800 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <gponiris2004@gmail.com> Date: Sat, 10 Jun 2023 14:04:15 +0300 Subject: [PATCH] [ShaderJIT] Migrate to #18 --- .../PICA/dynapica/shader_rec_emitter_x64.cpp | 143 +++++++++--------- 1 file changed, 71 insertions(+), 72 deletions(-) diff --git a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp index 9c00387c..a2ddc9c1 100644 --- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp +++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp @@ -8,8 +8,7 @@ using namespace Xbyak; using namespace Xbyak::util; -using Helpers::getBit; -using Helpers::getBits; +using namespace Helpers; // Register that points to PICA state static constexpr Reg64 statePointer = rbp; @@ -59,8 +58,8 @@ void ShaderEmitter::scanForCalls(const PICAShader& shaderUnit) { for (u32 i = 0; i < PICAShader::maxInstructionCount; i++) { const u32 instruction = shaderUnit.loadedShader[i]; if (isCall(instruction)) { - const u32 num = instruction & 0xff; // Num of instructions to execute - const u32 dest = (instruction >> 10) & 0xfff; // Starting subroutine address + const u32 num = instruction & 0xff; + const u32 dest = getBits<10, 12>(instruction); const u32 returnPC = num + dest; // Add them to get the return PC returnPCs.push_back(returnPC); @@ -147,16 +146,16 @@ void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 src, u3 bool negate; // If true, negate all lanes of the register if constexpr (sourceIndex == 1) { // SRC1 - negate = ((operandDescriptor >> 4) & 1) != 0; - compSwizzle = (operandDescriptor >> 5) & 0xff; + negate = (getBit<4>(operandDescriptor)) != 0; + compSwizzle = getBits<5, 8>(operandDescriptor); } else if constexpr (sourceIndex == 2) { // SRC2 - negate = ((operandDescriptor >> 13) & 1) != 0; - compSwizzle = (operandDescriptor >> 14) & 0xff; + negate = (getBit<13>(operandDescriptor)) != 0; + compSwizzle = getBits<14, 8>(operandDescriptor); } else if constexpr (sourceIndex == 3) { // SRC3 - negate = ((operandDescriptor >> 22) & 1) != 0; - compSwizzle = (operandDescriptor >> 23) & 0xff; + negate = (getBit<22>(operandDescriptor)) != 0; + compSwizzle = getBits<23, 8>(operandDescriptor); } // PICA has the swizzle descriptor inverted in comparison to x86. For the PICA, the descriptor is (lowest to highest bits) wzyx while it's xyzw for x86 @@ -230,36 +229,36 @@ void ShaderEmitter::checkCmpRegister(const PICAShader& shader, u32 instruction) const size_t cmpRegXOffset = uintptr_t(&shader.cmpRegister) - uintptr_t(&shader); const size_t cmpRegYOffset = cmpRegXOffset + sizeof(bool); - const u32 condition = (instruction >> 22) & 3; - const uint refY = (instruction >> 24) & 1; - const uint refX = (instruction >> 25) & 1; + const u32 condition = getBits<22, 2>(instruction); + const uint refY = getBit<24>(instruction); + const uint refX = getBit<25>(instruction); // refX in the bottom byte, refY in the top byte. This is done for condition codes 0 and 1 which check both x and y, so we can emit a single instruction that checks both const u16 refX_refY_merged = refX | (refY << 8); switch (condition) { - case 0: // Either cmp register matches - // Z flag is 0 if at least 1 of them is set - test(word[statePointer + cmpRegXOffset], refX_refY_merged); + case 0: // Either cmp register matches + // Z flag is 0 if at least 1 of them is set + test(word[statePointer + cmpRegXOffset], refX_refY_merged); - // Invert z flag - setz(al); - test(al, al); - break; - case 1: // Both cmp registers match - cmp(word[statePointer + cmpRegXOffset], refX_refY_merged); - break; - case 2: // At least cmp.x matches - cmp(byte[statePointer + cmpRegXOffset], refX); - break; - default: // At least cmp.y matches - cmp(byte[statePointer + cmpRegYOffset], refY); - break; + // Invert z flag + setz(al); + test(al, al); + break; + case 1: // Both cmp registers match + cmp(word[statePointer + cmpRegXOffset], refX_refY_merged); + break; + case 2: // At least cmp.x matches + cmp(byte[statePointer + cmpRegXOffset], refX); + break; + default: // At least cmp.y matches + cmp(byte[statePointer + cmpRegYOffset], refY); + break; } } void ShaderEmitter::checkBoolUniform(const PICAShader& shader, u32 instruction) { - const u32 bit = (instruction >> 22) & 0xf; // Bit of the bool uniform to check + const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check const uintptr_t boolUniformOffset = uintptr_t(&shader.boolUniform) - uintptr_t(&shader); test(word[statePointer + boolUniformOffset], 1 << bit); @@ -279,9 +278,9 @@ void ShaderEmitter::recEND(const PICAShader& shader, u32 instruction) { void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) { const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; - u32 src = (instruction >> 12) & 0x7f; - const u32 idx = (instruction >> 19) & 3; - const u32 dest = (instruction >> 21) & 0x1f; + const u32 src = getBits<12, 7>(instruction); + const u32 idx = getBits<19, 2>(instruction); + const u32 dest = getBits<21, 5>(instruction); loadRegister<1>(src1_xmm, shader, src, idx, operandDescriptor); // Load source 1 into scratch1 storeRegister(src1_xmm, shader, dest, operandDescriptor); @@ -289,10 +288,10 @@ void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) { void ShaderEmitter::recADD(const PICAShader& shader, u32 instruction) { const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; - u32 src1 = (instruction >> 12) & 0x7f; - const u32 src2 = (instruction >> 7) & 0x1f; // src2 coming first because PICA moment - const u32 idx = (instruction >> 19) & 3; - const u32 dest = (instruction >> 21) & 0x1f; + const u32 src1 = getBits<12, 7>(instruction); + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 idx = getBits<19, 2>(instruction); + const u32 dest = getBits<21, 5>(instruction); loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor); loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor); @@ -302,10 +301,10 @@ void ShaderEmitter::recADD(const PICAShader& shader, u32 instruction) { void ShaderEmitter::recDP3(const PICAShader& shader, u32 instruction) { const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; - u32 src1 = (instruction >> 12) & 0x7f; - const u32 src2 = (instruction >> 7) & 0x1f; // src2 coming first because PICA moment - const u32 idx = (instruction >> 19) & 3; - const u32 dest = (instruction >> 21) & 0x1f; + const u32 src1 = getBits<12, 7>(instruction); + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 idx = getBits<19, 2>(instruction); + const u32 dest = getBits<21, 5>(instruction); // TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA) loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor); @@ -316,10 +315,10 @@ void ShaderEmitter::recDP3(const PICAShader& shader, u32 instruction) { void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) { const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; - u32 src1 = (instruction >> 12) & 0x7f; - const u32 src2 = (instruction >> 7) & 0x1f; // src2 coming first because PICA moment - const u32 idx = (instruction >> 19) & 3; - const u32 dest = (instruction >> 21) & 0x1f; + const u32 src1 = getBits<12, 7>(instruction); + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 idx = getBits<19, 2>(instruction); + const u32 dest = getBits<21, 5>(instruction); // TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA) loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor); @@ -330,10 +329,10 @@ void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) { void ShaderEmitter::recMAX(const PICAShader& shader, u32 instruction) { const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; - u32 src1 = (instruction >> 12) & 0x7f; - const u32 src2 = (instruction >> 7) & 0x1f; // src2 coming first because PICA moment - const u32 idx = (instruction >> 19) & 3; - const u32 dest = (instruction >> 21) & 0x1f; + const u32 src1 = getBits<12, 7>(instruction); + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 idx = getBits<19, 2>(instruction); + const u32 dest = getBits<21, 5>(instruction); loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor); loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor); @@ -343,10 +342,10 @@ void ShaderEmitter::recMAX(const PICAShader& shader, u32 instruction) { void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) { const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; - u32 src1 = (instruction >> 12) & 0x7f; - const u32 src2 = (instruction >> 7) & 0x1f; // src2 coming first because PICA moment - const u32 idx = (instruction >> 19) & 3; - const u32 dest = (instruction >> 21) & 0x1f; + const u32 src1 = getBits<12, 7>(instruction); + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 idx = getBits<19, 2>(instruction); + const u32 dest = getBits<21, 5>(instruction); // TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA) loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor); @@ -357,9 +356,9 @@ void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) { void ShaderEmitter::recRCP(const PICAShader& shader, u32 instruction) { const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; - u32 src = (instruction >> 12) & 0x7f; - const u32 idx = (instruction >> 19) & 3; - const u32 dest = (instruction >> 21) & 0x1f; + const u32 src = getBits<12, 7>(instruction); + const u32 idx = getBits<19, 2>(instruction); + const u32 dest = getBits<21, 5>(instruction); const u32 writeMask = operandDescriptor & 0xf; loadRegister<1>(src1_xmm, shader, src, idx, operandDescriptor); // Load source 1 into scratch1 @@ -376,9 +375,9 @@ void ShaderEmitter::recRCP(const PICAShader& shader, u32 instruction) { void ShaderEmitter::recRSQ(const PICAShader& shader, u32 instruction) { const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; - u32 src = (instruction >> 12) & 0x7f; - const u32 idx = (instruction >> 19) & 3; - const u32 dest = (instruction >> 21) & 0x1f; + const u32 src = getBits<12, 7>(instruction); + const u32 idx = getBits<19, 2>(instruction); + const u32 dest = getBits<21, 5>(instruction); const u32 writeMask = operandDescriptor & 0xf; loadRegister<1>(src1_xmm, shader, src, idx, operandDescriptor); // Load source 1 into scratch1 @@ -395,11 +394,11 @@ void ShaderEmitter::recRSQ(const PICAShader& shader, u32 instruction) { void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) { const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x1f]; - const u32 src1 = (instruction >> 17) & 0x1f; - u32 src2 = (instruction >> 10) & 0x7f; - const u32 src3 = (instruction >> 5) & 0x1f; - const u32 idx = (instruction >> 22) & 3; - const u32 dest = (instruction >> 24) & 0x1f; + const u32 src1 = getBits<17, 5>(instruction); + const u32 src2 = getBits<10, 7>(instruction); + const u32 src3 = getBits<5, 5>(instruction); + const u32 idx = getBits<22, 2>(instruction); + const u32 dest = getBits<24, 5>(instruction); loadRegister<1>(src1_xmm, shader, src1, 0, operandDescriptor); loadRegister<2>(src2_xmm, shader, src2, idx, operandDescriptor); @@ -414,11 +413,11 @@ void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) { void ShaderEmitter::recCMP(const PICAShader& shader, u32 instruction) { const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; - const u32 src1 = (instruction >> 12) & 0x7f; - const u32 src2 = (instruction >> 7) & 0x1f; // src2 coming first because PICA moment - const u32 idx = (instruction >> 19) & 3; - const u32 cmpY = (instruction >> 21) & 7; - const u32 cmpX = (instruction >> 24) & 7; + const u32 src1 = getBits<12, 7>(instruction); + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 idx = getBits<19, 2>(instruction); + const u32 cmpY = getBits<21, 3>(instruction); + const u32 cmpX = getBits<24, 3>(instruction); loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor); loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor); @@ -482,8 +481,8 @@ void ShaderEmitter::recCMP(const PICAShader& shader, u32 instruction) { void ShaderEmitter::recIFC(const PICAShader& shader, u32 instruction) { // z is 1 if true, else 0 checkCmpRegister(shader, instruction); - const u32 dest = (instruction >> 10) & 0xfff; const u32 num = instruction & 0xff; + const u32 dest = getBits<10, 12>(instruction); if (dest < recompilerPC) { Helpers::warn("Shader JIT: IFC instruction with dest < current PC\n"); @@ -507,8 +506,8 @@ void ShaderEmitter::recIFC(const PICAShader& shader, u32 instruction) { void ShaderEmitter::recIFU(const PICAShader& shader, u32 instruction) { // z is 0 if true, else 1 checkBoolUniform(shader, instruction); - const u32 dest = (instruction >> 10) & 0xfff; const u32 num = instruction & 0xff; + const u32 dest = getBits<10, 12>(instruction); if (dest < recompilerPC) { Helpers::warn("Shader JIT: IFC instruction with dest < current PC\n"); @@ -531,8 +530,8 @@ void ShaderEmitter::recIFU(const PICAShader& shader, u32 instruction) { } void ShaderEmitter::recCALL(const PICAShader& shader, u32 instruction) { - const u32 dest = (instruction >> 10) & 0xfff; const u32 num = instruction & 0xff; + const u32 dest = getBits<10, 12>(instruction); // Push return PC as stack parameter. This is a decently fast solution and Citra does the same but we should probably switch to a proper PICA-like // Callstack, because it's not great to have an infinitely expanding call stack where popping from empty stack is undefined as hell