[Shader JIT] Attempt to add CALLC/JMPC/JMPU/LOOP

This commit is contained in:
wheremyfoodat 2023-06-11 23:43:47 +03:00
parent 71fd8de92f
commit 137b652840
3 changed files with 107 additions and 3 deletions

View file

@ -3,6 +3,7 @@
// Only do anything if we're on an x64 target with JIT support enabled
#if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_X64_HOST)
#include "helpers.hpp"
#include "logger.hpp"
#include "PICA/shader.hpp"
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"
@ -31,6 +32,8 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
Label negateVector;
u32 recompilerPC = 0; // PC the recompiler is currently recompiling @
u32 loopLevel = 0; // The current loop nesting level (0 = not in a loop)
bool haveSSE4_1 = false; // Shows if the CPU supports SSE4.1
bool haveAVX = false; // Shows if the CPU supports AVX (NOT AVX2, NOT AVX512. Regular AVX)
@ -39,6 +42,9 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
// Compile instruction "instr"
void compileInstruction(const PICAShader& shaderUnit);
// Get the offset to be added to the rsp register to get the current return address
size_t getStackOffsetOfReturnPC();
bool isCall(u32 instruction) {
const u32 opcode = instruction >> 26;
return (opcode == ShaderOpcodes::CALL) || (opcode == ShaderOpcodes::CALLC) || (opcode == ShaderOpcodes::CALLU);
@ -91,6 +97,8 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
void recSLT(const PICAShader& shader, u32 instruction);
void recSLTI(const PICAShader& shader, u32 instruction);
MAKE_LOG_FUNCTION(log, shaderJITLogger)
public:
using InstructionCallback = const void(*)(PICAShader& shaderUnit); // Callback type used for instructions
// Callback type used for the JIT prologue. This is what the caller will call

View file

@ -26,6 +26,7 @@ namespace Log {
static Logger<false> threadLogger;
static Logger<false> gpuLogger;
static Logger<false> rendererLogger;
static Logger<false> shaderJITLogger;
// Service loggers
static Logger<false> acLogger;

View file

@ -51,6 +51,7 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) {
// Compile every instruction in the shader
// This sounds horrible but the PICA instruction memory is tiny, and most of the time it's padded with nops that compile to nothing
recompilerPC = 0;
loopLevel = 0;
compileUntil(shaderUnit, PICAShader::maxInstructionCount);
}
@ -78,15 +79,20 @@ void ShaderEmitter::compileUntil(const PICAShader& shaderUnit, u32 end) {
}
}
// This is the offset we need to add to rsp to peek the next return address in the callstack
// Ie the PICA PC address which, when reached, will trigger a return
size_t ShaderEmitter::getStackOffsetOfReturnPC() {
size_t ret = isWindows() ? (8 + 32) : 8; // Offset assuming 0 loop level
return ret;
}
void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
// Write current location to label for this instruction
L(instructionLabels[recompilerPC]);
// See if PC is a possible return PC and emit the proper code if so
if (std::binary_search(returnPCs.begin(), returnPCs.end(), recompilerPC)) {
// This is the offset we need to add to rsp to peek the next return address in the callstack
// Ie the PICA PC address which, when reached, will trigger a return
const auto stackOffsetForPC = isWindows() ? (8 + 32) : 8;
const auto stackOffsetForPC = getStackOffsetOfReturnPC();
Label end;
// Check if return address == recompilerPC, ie if we should return
@ -106,6 +112,9 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
case ShaderOpcodes::CALL:
recCALL(shaderUnit, instruction);
break;
case ShaderOpcodes::CALLC:
recCALLC(shaderUnit, instruction);
break;
case ShaderOpcodes::CMP1: case ShaderOpcodes::CMP2:
recCMP(shaderUnit, instruction);
break;
@ -114,6 +123,9 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
case ShaderOpcodes::END: recEND(shaderUnit, instruction); break;
case ShaderOpcodes::IFC: recIFC(shaderUnit, instruction); break;
case ShaderOpcodes::IFU: recIFU(shaderUnit, instruction); break;
case ShaderOpcodes::JMPC: recJMPC(shaderUnit, instruction); break;
case ShaderOpcodes::JMPU: recJMPU(shaderUnit, instruction); break;
case ShaderOpcodes::LOOP: recLOOP(shaderUnit, instruction); break;
case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break;
case ShaderOpcodes::MOVA: recMOVA(shaderUnit, instruction); break;
case ShaderOpcodes::MAX: recMAX(shaderUnit, instruction); break;
@ -679,4 +691,87 @@ void ShaderEmitter::recCALL(const PICAShader& shader, u32 instruction) {
add(rsp, isWindows() ? (8 + 32) : 8);
}
void ShaderEmitter::recCALLC(const PICAShader& shader, u32 instruction) {
Label skipCall;
// z is 1 if the call should be taken, 0 otherwise
checkCmpRegister(shader, instruction);
jnz(skipCall);
recCALL(shader, instruction);
L(skipCall);
}
void ShaderEmitter::recJMPC(const PICAShader& shader, u32 instruction) {
const u32 dest = getBits<10, 12>(instruction);
Label& l = instructionLabels[dest];
// Z is 1 if the comparison is true
checkCmpRegister(shader, instruction);
jz(l, T_NEAR);
}
void ShaderEmitter::recJMPU(const PICAShader& shader, u32 instruction) {
bool jumpIfFalse = instruction & 1; // If the LSB is 0 we want to compare to true, otherwise compare to false
const u32 dest = getBits<10, 12>(instruction);
Label& l = instructionLabels[dest];
// Z is 0 if the uniform is true
checkBoolUniform(shader, instruction);
if (jumpIfFalse) {
jz(l, T_NEAR);
} else {
jnz(l, T_NEAR);
}
}
void ShaderEmitter::recLOOP(const PICAShader& shader, u32 instruction) {
const u32 dest = getBits<10, 12>(instruction);
const u32 uniformIndex = getBits<22, 2>(instruction);
if (loopLevel > 0) {
log("[Shader JIT] Detected nested loop. Might be broken?\n");
}
if (dest < recompilerPC) {
Helpers::panic("[Shader JIT] Detected backwards loop\n");
}
loopLevel++;
// Offset of the uniform
const uintptr_t uniformOffset = uintptr_t(&shader.intUniforms[uniformIndex]) - uintptr_t(&shader);
// Offset of the loop register
const uintptr_t loopRegOffset = uintptr_t(&shader.loopCounter) - uintptr_t(&shader);
movzx(eax, byte[statePointer + uniformOffset]); // eax = loop iteration count
movzx(ecx, byte[statePointer + uniformOffset + sizeof(u8)]); // ecx = initial loop counter value
movzx(edx, byte[statePointer + uniformOffset + 2 * sizeof(u8)]); // edx = loop increment
add(eax, 1); // The iteration count is actually uniform.x + 1
mov(dword[statePointer + loopRegOffset], ecx); // Set loop counter
// TODO: This might break if an instruction in a loop decides to yield...
push(rax); // Push loop iteration counter
push(rdx); // Push loop increment
if constexpr (isWindows())
sub(rsp, 32);
Label loopStart;
L(loopStart);
compileUntil(shader, dest + 1);
const size_t stackOffsetOfLoopIncrement = isWindows() ? 32 : 0;
const size_t stackOffsetOfIterationCounter = stackOffsetOfLoopIncrement + 8;
mov(ecx, dword[rsp + stackOffsetOfLoopIncrement]); // ecx = Loop increment
add(dword[statePointer + loopRegOffset], ecx); // Increment loop counter
sub(dword[rsp + stackOffsetOfIterationCounter], 1); // Subtract 1 from loop iteration counter
jnz(loopStart); // Back to loop start if not over
add(rsp, isWindows() ? (16 + 32) : 16);
loopLevel--;
}
#endif