diff --git a/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp index 192deb43..e2ff59f5 100644 --- a/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp +++ b/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp @@ -128,8 +128,8 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) { switch (opcode) { case ShaderOpcodes::ADD: recADD(shaderUnit, instruction); break; case ShaderOpcodes::CALL: recCALL(shaderUnit, instruction); break; - // case ShaderOpcodes::CALLC: recCALLC(shaderUnit, instruction); break; - // case ShaderOpcodes::CALLU: recCALLU(shaderUnit, instruction); break; + case ShaderOpcodes::CALLC: recCALLC(shaderUnit, instruction); break; + case ShaderOpcodes::CALLU: recCALLU(shaderUnit, instruction); break; case ShaderOpcodes::CMP1: case ShaderOpcodes::CMP2: recCMP(shaderUnit, instruction); break; case ShaderOpcodes::DP3: recDP3(shaderUnit, instruction); break; @@ -138,13 +138,13 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) { // case ShaderOpcodes::DPHI: recDPH(shaderUnit, instruction); break; case ShaderOpcodes::END: recEND(shaderUnit, instruction); break; // case ShaderOpcodes::EX2: recEX2(shaderUnit, instruction); break; - // case ShaderOpcodes::FLR: recFLR(shaderUnit, instruction); break; + case ShaderOpcodes::FLR: recFLR(shaderUnit, instruction); break; case ShaderOpcodes::IFC: recIFC(shaderUnit, instruction); break; case ShaderOpcodes::IFU: recIFU(shaderUnit, instruction); break; case ShaderOpcodes::JMPC: recJMPC(shaderUnit, instruction); break; - // case ShaderOpcodes::JMPU: recJMPU(shaderUnit, instruction); break; + case ShaderOpcodes::JMPU: recJMPU(shaderUnit, instruction); break; // case ShaderOpcodes::LG2: recLG2(shaderUnit, instruction); break; - // case ShaderOpcodes::LOOP: recLOOP(shaderUnit, instruction); break; + case ShaderOpcodes::LOOP: recLOOP(shaderUnit, instruction); break; case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break; case ShaderOpcodes::MOVA: recMOVA(shaderUnit, instruction); break; case ShaderOpcodes::MAX: recMAX(shaderUnit, instruction); break; @@ -156,14 +156,11 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) { // Unimplemented opcodes that don't seem to actually be used but exist in the binary // EMIT/SETEMIT are used in geometry shaders, however are sometimes found in vertex shaders? - // case ShaderOpcodes::EMIT: - // case ShaderOpcodes::SETEMIT: - // log("[ShaderJIT] Unknown PICA opcode: %02X\n", opcode); - // emitPrintLog(shaderUnit); - // break; + case ShaderOpcodes::EMIT: + case ShaderOpcodes::SETEMIT: log("[ShaderJIT] Unimplemented PICA opcode: %02X\n", opcode); break; - // case ShaderOpcodes::BREAK: - // case ShaderOpcodes::BREAKC: Helpers::warn("[Shader JIT] Unimplemented BREAK(C) instruction!"); break; + case ShaderOpcodes::BREAK: + case ShaderOpcodes::BREAKC: Helpers::warn("[Shader JIT] Unimplemented BREAK(C) instruction!"); break; // We consider both MAD and MADI to be the same instruction and decode which one we actually have in recMAD case 0x30: @@ -181,15 +178,13 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) { case 0x3C: case 0x3D: case 0x3E: - case 0x3F: - recMAD(shaderUnit, instruction); - break; + case 0x3F: recMAD(shaderUnit, instruction); break; - // case ShaderOpcodes::SLT: - // case ShaderOpcodes::SLTI: recSLT(shaderUnit, instruction); break; + case ShaderOpcodes::SLT: + case ShaderOpcodes::SLTI: recSLT(shaderUnit, instruction); break; - // case ShaderOpcodes::SGE: - // case ShaderOpcodes::SGEI: recSGE(shaderUnit, instruction); break; + case ShaderOpcodes::SGE: + case ShaderOpcodes::SGEI: recSGE(shaderUnit, instruction); break; default: Helpers::panic("Shader JIT: Unimplemented PICA opcode %X", opcode); } @@ -430,6 +425,17 @@ void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) { storeRegister(src1_vec, shader, dest, operandDescriptor); } +void ShaderEmitter::recFLR(const PICAShader& shader, u32 instruction) { + const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; + const u32 src = getBits<12, 7>(instruction); + const u32 idx = getBits<19, 2>(instruction); + const u32 dest = getBits<21, 5>(instruction); + + loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1 + FRINTM(src1_vec.S4(), src1_vec.S4()); // Floor it and store into dest + storeRegister(src1_vec, shader, dest, operandDescriptor); +} + void ShaderEmitter::recMOVA(const PICAShader& shader, u32 instruction) { const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; const u32 src = getBits<12, 7>(instruction); @@ -630,6 +636,42 @@ void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) { storeRegister(src3_vec, shader, dest, operandDescriptor); } +void ShaderEmitter::recSLT(const PICAShader& shader, u32 instruction) { + const bool isSLTI = (instruction >> 26) == ShaderOpcodes::SLTI; + const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; + + const u32 src1 = isSLTI ? getBits<14, 5>(instruction) : getBits<12, 7>(instruction); + const u32 src2 = isSLTI ? getBits<7, 7>(instruction) : getBits<7, 5>(instruction); + const u32 idx = getBits<19, 2>(instruction); + const u32 dest = getBits<21, 5>(instruction); + + loadRegister<1>(src1_vec, shader, src1, isSLTI ? 0 : idx, operandDescriptor); + loadRegister<2>(src2_vec, shader, src2, isSLTI ? idx : 0, operandDescriptor); + // Set each lane of SRC1 to FFFFFFFF if src2 > src1, else to 0. NEON does not have FCMLT so we use FCMGT with inverted operands + // This is more or less a direct port of the relevant x64 JIT code + FCMGT(src1_vec.S4(), src2_vec.S4(), src1_vec.S4()); + AND(src1_vec.B16(), src1_vec.B16(), onesVector.B16()); // AND with vec4(1.0) to convert the FFFFFFFF lanes into 1.0 + storeRegister(src1_vec, shader, dest, operandDescriptor); +} + +void ShaderEmitter::recSGE(const PICAShader& shader, u32 instruction) { + const bool isSGEI = (instruction >> 26) == ShaderOpcodes::SGEI; + const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; + + const u32 src1 = isSGEI ? getBits<14, 5>(instruction) : getBits<12, 7>(instruction); + const u32 src2 = isSGEI ? getBits<7, 7>(instruction) : getBits<7, 5>(instruction); + const u32 idx = getBits<19, 2>(instruction); + const u32 dest = getBits<21, 5>(instruction); + + loadRegister<1>(src1_vec, shader, src1, isSGEI ? 0 : idx, operandDescriptor); + loadRegister<2>(src2_vec, shader, src2, isSGEI ? idx : 0, operandDescriptor); + // Set each lane of SRC1 to FFFFFFFF if src1 >= src2, else to 0. + // This is more or less a direct port of the relevant x64 JIT code + FCMGE(src1_vec.S4(), src1_vec.S4(), src2_vec.S4()); + AND(src1_vec.B16(), src1_vec.B16(), onesVector.B16()); // AND with vec4(1.0) to convert the FFFFFFFF lanes into 1.0 + storeRegister(src1_vec, shader, dest, operandDescriptor); +} + void ShaderEmitter::recCMP(const PICAShader& shader, u32 instruction) { const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; const u32 src1 = getBits<12, 7>(instruction); @@ -742,6 +784,28 @@ void ShaderEmitter::recCALL(const PICAShader& shader, u32 instruction) { LDP(XZR, X30, SP, POST_INDEXED, 16); } +void ShaderEmitter::recCALLC(const PICAShader& shader, u32 instruction) { + Label skipCall; + + // z is 1 if the call should be taken, 0 otherwise + checkCmpRegister(shader, instruction); + B(NE, skipCall); + recCALL(shader, instruction); + + l(skipCall); +} + +void ShaderEmitter::recCALLU(const PICAShader& shader, u32 instruction) { + Label skipCall; + + // z is 0 if the call should be taken, 1 otherwise + checkBoolUniform(shader, instruction); + B(EQ, skipCall); + recCALL(shader, instruction); + + l(skipCall); +} + void ShaderEmitter::recIFC(const PICAShader& shader, u32 instruction) { // z is 1 if true, else 0 checkCmpRegister(shader, instruction); @@ -801,10 +865,83 @@ void ShaderEmitter::recJMPC(const PICAShader& shader, u32 instruction) { B(EQ, l); } +void ShaderEmitter::recJMPU(const PICAShader& shader, u32 instruction) { + bool jumpIfFalse = instruction & 1; // If the LSB is 0 we want to compare to true, otherwise compare to false + const u32 dest = getBits<10, 12>(instruction); + + Label& l = instructionLabels[dest]; + // Z is 0 if the uniform is true + checkBoolUniform(shader, instruction); + + if (jumpIfFalse) { + B(EQ, l); + } else { + B(NE, l); + } +} + +void ShaderEmitter::recLOOP(const PICAShader& shader, u32 instruction) { + const u32 dest = getBits<10, 12>(instruction); + const u32 uniformIndex = getBits<22, 2>(instruction); + + if (loopLevel > 0) { + log("[Shader JIT] Detected nested loop. Might be broken?\n"); + } + + if (dest < recompilerPC) { + Helpers::panic("[Shader JIT] Detected backwards loop\n"); + } + + loopLevel++; + + // Offset of the uniform + const auto& uniform = shader.intUniforms[uniformIndex]; + const uintptr_t uniformOffset = uintptr_t(&uniform[0]) - uintptr_t(&shader); + // Offset of the loop register + const uintptr_t loopRegOffset = uintptr_t(&shader.loopCounter) - uintptr_t(&shader); + + LDRB(W0, statePointer, uniformOffset); // W0 = loop iteration count + LDRB(W1, statePointer, uniformOffset + sizeof(u8)); // W1 = initial loop counter value + LDRB(W2, statePointer, uniformOffset + 2 * sizeof(u8)); // W2 = Loop increment + + ADD(W0, W0, 1); // The iteration count is actually uniform.x + 1 + STR(W1, statePointer, loopRegOffset); // Set loop counter + + // Push loop iteration counter & loop increment + // TODO: This might break if an instruction in a loop decides to yield... + STP(X0, X2, SP, PRE_INDEXED, -16); + + Label loopStart, loopEnd; + l(loopStart); + compileUntil(shader, dest + 1); + + const size_t stackOffsetOfLoopIncrement = 0; + const size_t stackOffsetOfIterationCounter = stackOffsetOfLoopIncrement + 8; + + LDP(X0, X2, SP); // W0 = loop iteration, W2 = loop increment + LDR(W1, statePointer, loopRegOffset); // W1 = loop register + + // Increment loop counter + ADD(W1, W1, W2); + STR(W1, statePointer, loopRegOffset); + // Subtract 1 from loop iteration counter, + SUBS(W0, W0, 1); + B(EQ, loopEnd); + + // Loop hasn't ended: Write back new iteration counter and go back to the start + STR(X0, SP); + B(loopStart); + + l(loopEnd); + // Remove the stuff we pushed on the stack earlier + ADD(SP, SP, 16); + loopLevel--; +} + void ShaderEmitter::recEND(const PICAShader& shader, u32 instruction) { // Fetch original LR and return. This also restores SP to its original value, discarding the return guard into XZR LDP(XZR, X30, SP, POST_INDEXED, 16); RET(); } -#endif \ No newline at end of file +#endif