Get some demos running on a64 shader JIT

This commit is contained in:
wheremyfoodat 2024-01-03 22:00:16 +02:00
parent c0621d0760
commit fe01df588b
2 changed files with 236 additions and 42 deletions

View file

@ -27,6 +27,9 @@ class ShaderEmitter : private oaknut::CodeBlock, public oaknut::CodeGenerator {
// Filled before compiling a shader by scanning the code for call instructions
std::vector<u32> returnPCs;
// An array of 128-bit masks for blending registers together to perform masked writes.
// Eg for writing only the x and y components, the mask is 0x00000000'00000000'FFFFFFFF'FFFFFFFF
oaknut::Label blendMasks;
// Vector value of (1.0, 1.0, 1.0, 1.0) for SLT(i)/SGE(i)
oaknut::Label onesVector;

View file

@ -15,9 +15,37 @@ static constexpr QReg src1_vec = Q2;
// Vector register the rec* emitters load the second source operand into (see loadRegister<2>)
static constexpr QReg src2_vec = Q3;
// Vector register the rec* emitters load the third source operand into (see loadRegister<3>, used by MAD)
static constexpr QReg src3_vec = Q4;
// First argument of the prologue callback; the prologue copies it into statePointer
static constexpr XReg arg1 = X0;
// Second argument of the prologue callback; the prologue tail-calls (BR) to the address it holds
static constexpr XReg arg2 = X1;
// Base register used for the LDR/STR accesses emitted by the JIT; set up by the prologue from arg1
// The prologue treats it as volatile and does not preserve its previous value
static constexpr XReg statePointer = X9;
void ShaderEmitter::compile(const PICAShader& shaderUnit) {
oaknut::CodeBlock::unprotect(); // Unprotect the memory before writing to it
// Constants
align(16);
// Generate blending masks for doing masked writes to registers
l(blendMasks);
for (int i = 0; i < 16; i++) {
dw((i & 0x8) ? 0xFFFFFFFF : 0); // Mask for x component
dw((i & 0x4) ? 0xFFFFFFFF : 0); // Mask for y component
dw((i & 0x2) ? 0xFFFFFFFF : 0); // Mask for z component
dw((i & 0x1) ? 0xFFFFFFFF : 0); // Mask for w component
}
// Emit prologue first
oaknut::Label prologueLabel;
align(16);
l(prologueLabel);
prologueCb = prologueLabel.ptr<PrologueCallback>();
// Set state pointer to the proper pointer
// state pointer is volatile, no need to preserve it
MOV(statePointer, arg1);
// Jump to code with a tail call
BR(arg2);
// Scan the code for call, exp2, log2, etc instructions which need some special care
// After that, emit exp2 and log2 functions if the corresponding instructions are present
scanCode(shaderUnit);
@ -30,6 +58,10 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) {
recompilerPC = 0;
loopLevel = 0;
compileUntil(shaderUnit, PICAShader::maxInstructionCount);
// Protect the memory and invalidate icache before executing the code
oaknut::CodeBlock::protect();
oaknut::CodeBlock::invalidate_all();
}
void ShaderEmitter::scanCode(const PICAShader& shaderUnit) {
@ -76,17 +108,17 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
const u32 opcode = instruction >> 26;
switch (opcode) {
// case ShaderOpcodes::ADD: recADD(shaderUnit, instruction); break;
case ShaderOpcodes::ADD: recADD(shaderUnit, instruction); break;
// case ShaderOpcodes::CALL: recCALL(shaderUnit, instruction); break;
// case ShaderOpcodes::CALLC: recCALLC(shaderUnit, instruction); break;
// case ShaderOpcodes::CALLU: recCALLU(shaderUnit, instruction); break;
// case ShaderOpcodes::CMP1:
// case ShaderOpcodes::CMP2: recCMP(shaderUnit, instruction); break;
// case ShaderOpcodes::DP3: recDP3(shaderUnit, instruction); break;
// case ShaderOpcodes::DP4: recDP4(shaderUnit, instruction); break;
case ShaderOpcodes::DP3: recDP3(shaderUnit, instruction); break;
case ShaderOpcodes::DP4: recDP4(shaderUnit, instruction); break;
// case ShaderOpcodes::DPH:
// case ShaderOpcodes::DPHI: recDPH(shaderUnit, instruction); break;
// case ShaderOpcodes::END: recEND(shaderUnit, instruction); break;
case ShaderOpcodes::END: recEND(shaderUnit, instruction); break;
// case ShaderOpcodes::EX2: recEX2(shaderUnit, instruction); break;
// case ShaderOpcodes::FLR: recFLR(shaderUnit, instruction); break;
// case ShaderOpcodes::IFC: recIFC(shaderUnit, instruction); break;
@ -97,42 +129,43 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
// case ShaderOpcodes::LOOP: recLOOP(shaderUnit, instruction); break;
case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break;
// case ShaderOpcodes::MOVA: recMOVA(shaderUnit, instruction); break;
// case ShaderOpcodes::MAX: recMAX(shaderUnit, instruction); break;
// case ShaderOpcodes::MIN: recMIN(shaderUnit, instruction); break;
// case ShaderOpcodes::MUL: recMUL(shaderUnit, instruction); break;
case ShaderOpcodes::NOP:
case ShaderOpcodes::MAX: recMAX(shaderUnit, instruction); break;
case ShaderOpcodes::MIN: recMIN(shaderUnit, instruction); break;
case ShaderOpcodes::MUL: recMUL(shaderUnit, instruction); break;
case ShaderOpcodes::NOP: break;
// case ShaderOpcodes::RCP: recRCP(shaderUnit, instruction); break;
case ShaderOpcodes::RSQ: recRSQ(shaderUnit, instruction); break;
// Unimplemented opcodes that don't seem to actually be used but exist in the binary
// EMIT/SETEMIT are used in geometry shaders, however are sometimes found in vertex shaders?
// case ShaderOpcodes::EMIT:
// case ShaderOpcodes::SETEMIT:
// log("[ShaderJIT] Unknown PICA opcode: %02X\n", opcode);
// emitPrintLog(shaderUnit);
// break;
// case ShaderOpcodes::BREAK:
// case ShaderOpcodes::BREAKC: Helpers::warn("[Shader JIT] Unimplemented BREAK(C) instruction!"); break;
// We consider both MAD and MADI to be the same instruction and decode which one we actually have in recMAD
case 0x30:
case 0x31:
case 0x32:
case 0x33:
case 0x34:
case 0x35:
case 0x36:
case 0x37:
case 0x38:
case 0x39:
case 0x3A:
case 0x3B:
case 0x3C:
case 0x3D:
case 0x3E:
case 0x3F:
recMAD(shaderUnit, instruction);
break;
// case ShaderOpcodes::RCP: recRCP(shaderUnit, instruction); break;
// case ShaderOpcodes::RSQ: recRSQ(shaderUnit, instruction); break;
// Unimplemented opcodes that don't seem to actually be used but exist in the binary
// EMIT/SETEMIT are used in geometry shaders, however are sometimes found in vertex shaders?
// case ShaderOpcodes::EMIT:
// case ShaderOpcodes::SETEMIT:
// log("[ShaderJIT] Unknown PICA opcode: %02X\n", opcode);
// emitPrintLog(shaderUnit);
// break;
// case ShaderOpcodes::BREAK:
// case ShaderOpcodes::BREAKC: Helpers::warn("[Shader JIT] Unimplemented BREAK(C) instruction!"); break;
// We consider both MAD and MADI to be the same instruction and decode which one we actually have in recMAD
// case 0x30:
// case 0x31:
// case 0x32:
// case 0x33:
// case 0x34:
// case 0x35:
// case 0x36:
// case 0x37:
// case 0x38:
// case 0x39:
// case 0x3A:
// case 0x3B:
// case 0x3C:
// case 0x3D:
// case 0x3E:
// case 0x3F: recMAD(shaderUnit, instruction); break;
// case ShaderOpcodes::SLT:
// case ShaderOpcodes::SLTI: recSLT(shaderUnit, instruction); break;
@ -221,9 +254,165 @@ void ShaderEmitter::storeRegister(QReg source, const PICAShader& shader, u32 des
if (writeMask == 0xf) { // No lanes are masked, just use STR
STR(source, statePointer, offset);
} else {
LDR(scratch1, statePointer, offset); // Load current source
Helpers::panic("Unimplemented: Storing to register with blending");
}
LDR(scratch1, statePointer, offset); // Load current value
LDR(scratch2, blendMasks.ptr<u8*>() + writeMask * 16); // Load write mask for blending
BSL(scratch2.B16(), source.B16(), scratch1.B16()); // Scratch2 = (Source & mask) | (original & ~mask)
STR(scratch2, statePointer, offset); // Write it back
}
}
// Recompiles a PICA DP3 instruction: computes the 3-component dot product of the two source operands
// and stores it to the destination register through storeRegister (which applies the write mask).
// shader: Shader whose operand descriptor table is read at compile time
// instruction: The raw instruction word to recompile
void ShaderEmitter::recDP3(const PICAShader& shader, u32 instruction) {
	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
	const u32 src1 = getBits<12, 7>(instruction);
	const u32 src2 = getBits<7, 5>(instruction);
	const u32 idx = getBits<19, 2>(instruction);
	const u32 dest = getBits<21, 5>(instruction);
	const u32 writeMask = getBits<0, 4>(operandDescriptor);

	// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
	loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
	loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);

	// Do a piecewise multiplication of the vectors first
	FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
	// Zero the w lane of the *product* so the following DP4-style reduction only sums x, y and z.
	// Zeroing only src1.w before the multiply is not enough: if src2.w is Inf or NaN then 0 * src2.w is NaN,
	// which would leak into the result even though w must not take part in a DP3.
	INS(src1_vec.Selem()[3], WZR);

	FADDP(src1_vec.S4(), src1_vec.S4(), src1_vec.S4());  // Now add the adjacent components together
	FADDP(src1_vec.toS(), src1_vec.toD().S2());          // Again for the bottom 2 lanes. Now the bottom lane contains the dot product

	if (writeMask != 0x8) {                       // Copy bottom lane to all lanes if we're not simply writing back x
		DUP(src1_vec.S4(), src1_vec.Selem()[0]);  // src1_vec = src1_vec.xxxx
	}

	storeRegister(src1_vec, shader, dest, operandDescriptor);
}
void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
const u32 src1 = getBits<12, 7>(instruction);
const u32 src2 = getBits<7, 5>(instruction);
const u32 idx = getBits<19, 2>(instruction);
const u32 dest = getBits<21, 5>(instruction);
const u32 writeMask = getBits<0, 4>(operandDescriptor);
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4()); // Do a piecewise multiplication of the vectors first
FADDP(src1_vec.S4(), src1_vec.S4(), src1_vec.S4()); // Now add the adjacent components together
FADDP(src1_vec.toS(), src1_vec.toD().S2()); // Again for the bottom 2 lanes. Now the bottom lane contains the dot product
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
DUP(src1_vec.S4(), src1_vec.Selem()[0]); // src1_vec = src1_vec.xxxx
}
storeRegister(src1_vec, shader, dest, operandDescriptor);
}
void ShaderEmitter::recADD(const PICAShader& shader, u32 instruction) {
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
const u32 src1 = getBits<12, 7>(instruction);
const u32 src2 = getBits<7, 5>(instruction);
const u32 idx = getBits<19, 2>(instruction);
const u32 dest = getBits<21, 5>(instruction);
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
FADD(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
storeRegister(src1_vec, shader, dest, operandDescriptor);
}
void ShaderEmitter::recMAX(const PICAShader& shader, u32 instruction) {
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
const u32 src1 = getBits<12, 7>(instruction);
const u32 src2 = getBits<7, 5>(instruction);
const u32 idx = getBits<19, 2>(instruction);
const u32 dest = getBits<21, 5>(instruction);
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
FMAX(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
storeRegister(src1_vec, shader, dest, operandDescriptor);
}
void ShaderEmitter::recMIN(const PICAShader& shader, u32 instruction) {
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
const u32 src1 = getBits<12, 7>(instruction);
const u32 src2 = getBits<7, 5>(instruction);
const u32 idx = getBits<19, 2>(instruction);
const u32 dest = getBits<21, 5>(instruction);
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
FMIN(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
storeRegister(src1_vec, shader, dest, operandDescriptor);
}
void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) {
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
const u32 src1 = getBits<12, 7>(instruction);
const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
const u32 idx = getBits<19, 2>(instruction);
const u32 dest = getBits<21, 5>(instruction);
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
storeRegister(src1_vec, shader, dest, operandDescriptor);
}
void ShaderEmitter::recRSQ(const PICAShader& shader, u32 instruction) {
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
const u32 src = getBits<12, 7>(instruction);
const u32 idx = getBits<19, 2>(instruction);
const u32 dest = getBits<21, 5>(instruction);
const u32 writeMask = operandDescriptor & 0xf;
constexpr bool useAccurateRSQ = false;
loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1
// Compute reciprocal square root approximation
// TODO: Should this use frsqte or fsqrt+div? The former is faster but less accurate
// PICA RSQ uses f24 precision though, so it'll be inherently innacurate, and it's likely using an inaccurate approximation too, seeing as
// It doesn't have regular sqrt/div instructions.
// For now, we default to inaccurate inverse square root
if constexpr (useAccurateRSQ) {
FMOV(scratch1.S4(), FImm8(0x70)); // scratch1 = vec4(1.0f)
FSQRT(src1_vec.toS(), src1_vec.toS()); // src1 = sqrt(src1), scalar
FDIV(src1_vec.toS(), scratch1.toS(), src1_vec.toS()); // Now invert src1
} else {
FRSQRTE(src1_vec.toS(), src1_vec.toS()); // Much nicer
}
// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
// Otherwise we do
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
DUP(src1_vec.S4(), src1_vec.Selem()[0]); // src1_vec = src1_vec.xxxx
}
storeRegister(src1_vec, shader, dest, operandDescriptor);
}
void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) {
const bool isMADI = getBit<29>(instruction) == 0;
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x1f];
const u32 src1 = getBits<17, 5>(instruction);
const u32 src2 = isMADI ? getBits<12, 5>(instruction) : getBits<10, 7>(instruction);
const u32 src3 = isMADI ? getBits<5, 7>(instruction) : getBits<5, 5>(instruction);
const u32 idx = getBits<22, 2>(instruction);
const u32 dest = getBits<24, 5>(instruction);
loadRegister<1>(src1_vec, shader, src1, 0, operandDescriptor);
loadRegister<2>(src2_vec, shader, src2, isMADI ? 0 : idx, operandDescriptor);
loadRegister<3>(src3_vec, shader, src3, isMADI ? idx : 0, operandDescriptor);
// TODO: Safe PICA multiplication
FMLA(src3_vec.S4(), src1_vec.S4(), src2_vec.S4());
storeRegister(src3_vec, shader, dest, operandDescriptor);
}
void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) {
@ -236,4 +425,6 @@ void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) {
storeRegister(src1_vec, shader, dest, operandDescriptor);
}
// Recompiles a PICA END instruction by emitting a single RET.
// Neither the shader nor the instruction word is consulted.
void ShaderEmitter::recEND(const PICAShader& shader, u32 instruction) {
	RET();
}
#endif