mirror of
https://github.com/wheremyfoodat/Panda3DS.git
synced 2025-04-06 14:15:41 +12:00
Get some demos running on a64 shader JIT
This commit is contained in:
parent
c0621d0760
commit
fe01df588b
2 changed files with 236 additions and 42 deletions
|
@ -27,6 +27,9 @@ class ShaderEmitter : private oaknut::CodeBlock, public oaknut::CodeGenerator {
|
|||
// Filled before compiling a shader by scanning the code for call instructions
|
||||
std::vector<u32> returnPCs;
|
||||
|
||||
// An array of 128-bit masks for blending registers together to perform masked writes.
|
||||
// E.g. for writing only the x and y components, the mask is 0x00000000'00000000'FFFFFFFF'FFFFFFFF
|
||||
oaknut::Label blendMasks;
|
||||
// Vector value of (1.0, 1.0, 1.0, 1.0) for SLT(i)/SGE(i)
|
||||
oaknut::Label onesVector;
|
||||
|
||||
|
|
|
@ -15,9 +15,37 @@ static constexpr QReg src1_vec = Q2;
|
|||
static constexpr QReg src2_vec = Q3;
|
||||
static constexpr QReg src3_vec = Q4;
|
||||
|
||||
static constexpr XReg arg1 = X0;
|
||||
static constexpr XReg arg2 = X1;
|
||||
static constexpr XReg statePointer = X9;
|
||||
|
||||
void ShaderEmitter::compile(const PICAShader& shaderUnit) {
|
||||
oaknut::CodeBlock::unprotect(); // Unprotect the memory before writing to it
|
||||
|
||||
// Constants
|
||||
align(16);
|
||||
// Generate blending masks for doing masked writes to registers
|
||||
l(blendMasks);
|
||||
for (int i = 0; i < 16; i++) {
|
||||
dw((i & 0x8) ? 0xFFFFFFFF : 0); // Mask for x component
|
||||
dw((i & 0x4) ? 0xFFFFFFFF : 0); // Mask for y component
|
||||
dw((i & 0x2) ? 0xFFFFFFFF : 0); // Mask for z component
|
||||
dw((i & 0x1) ? 0xFFFFFFFF : 0); // Mask for w component
|
||||
}
|
||||
|
||||
// Emit prologue first
|
||||
oaknut::Label prologueLabel;
|
||||
align(16);
|
||||
|
||||
l(prologueLabel);
|
||||
prologueCb = prologueLabel.ptr<PrologueCallback>();
|
||||
|
||||
// Set state pointer to the proper pointer
|
||||
// state pointer is volatile, no need to preserve it
|
||||
MOV(statePointer, arg1);
|
||||
// Jump to code with a tail call
|
||||
BR(arg2);
|
||||
|
||||
// Scan the code for call, exp2, log2, etc instructions which need some special care
|
||||
// After that, emit exp2 and log2 functions if the corresponding instructions are present
|
||||
scanCode(shaderUnit);
|
||||
|
@ -30,6 +58,10 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) {
|
|||
recompilerPC = 0;
|
||||
loopLevel = 0;
|
||||
compileUntil(shaderUnit, PICAShader::maxInstructionCount);
|
||||
|
||||
// Protect the memory and invalidate icache before executing the code
|
||||
oaknut::CodeBlock::protect();
|
||||
oaknut::CodeBlock::invalidate_all();
|
||||
}
|
||||
|
||||
void ShaderEmitter::scanCode(const PICAShader& shaderUnit) {
|
||||
|
@ -76,17 +108,17 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
|
|||
const u32 opcode = instruction >> 26;
|
||||
|
||||
switch (opcode) {
|
||||
// case ShaderOpcodes::ADD: recADD(shaderUnit, instruction); break;
|
||||
case ShaderOpcodes::ADD: recADD(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::CALL: recCALL(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::CALLC: recCALLC(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::CALLU: recCALLU(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::CMP1:
|
||||
// case ShaderOpcodes::CMP2: recCMP(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::DP3: recDP3(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::DP4: recDP4(shaderUnit, instruction); break;
|
||||
case ShaderOpcodes::DP3: recDP3(shaderUnit, instruction); break;
|
||||
case ShaderOpcodes::DP4: recDP4(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::DPH:
|
||||
// case ShaderOpcodes::DPHI: recDPH(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::END: recEND(shaderUnit, instruction); break;
|
||||
case ShaderOpcodes::END: recEND(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::EX2: recEX2(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::FLR: recFLR(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::IFC: recIFC(shaderUnit, instruction); break;
|
||||
|
@ -97,42 +129,43 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
|
|||
// case ShaderOpcodes::LOOP: recLOOP(shaderUnit, instruction); break;
|
||||
case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::MOVA: recMOVA(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::MAX: recMAX(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::MIN: recMIN(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::MUL: recMUL(shaderUnit, instruction); break;
|
||||
case ShaderOpcodes::NOP:
|
||||
case ShaderOpcodes::MAX: recMAX(shaderUnit, instruction); break;
|
||||
case ShaderOpcodes::MIN: recMIN(shaderUnit, instruction); break;
|
||||
case ShaderOpcodes::MUL: recMUL(shaderUnit, instruction); break;
|
||||
case ShaderOpcodes::NOP: break;
|
||||
// case ShaderOpcodes::RCP: recRCP(shaderUnit, instruction); break;
|
||||
case ShaderOpcodes::RSQ: recRSQ(shaderUnit, instruction); break;
|
||||
|
||||
// Unimplemented opcodes that don't seem to actually be used but exist in the binary
|
||||
// EMIT/SETEMIT are used in geometry shaders, however are sometimes found in vertex shaders?
|
||||
// case ShaderOpcodes::EMIT:
|
||||
// case ShaderOpcodes::SETEMIT:
|
||||
// log("[ShaderJIT] Unknown PICA opcode: %02X\n", opcode);
|
||||
// emitPrintLog(shaderUnit);
|
||||
// break;
|
||||
|
||||
// case ShaderOpcodes::BREAK:
|
||||
// case ShaderOpcodes::BREAKC: Helpers::warn("[Shader JIT] Unimplemented BREAK(C) instruction!"); break;
|
||||
|
||||
// We consider both MAD and MADI to be the same instruction and decode which one we actually have in recMAD
|
||||
case 0x30:
|
||||
case 0x31:
|
||||
case 0x32:
|
||||
case 0x33:
|
||||
case 0x34:
|
||||
case 0x35:
|
||||
case 0x36:
|
||||
case 0x37:
|
||||
case 0x38:
|
||||
case 0x39:
|
||||
case 0x3A:
|
||||
case 0x3B:
|
||||
case 0x3C:
|
||||
case 0x3D:
|
||||
case 0x3E:
|
||||
case 0x3F:
|
||||
recMAD(shaderUnit, instruction);
|
||||
break;
|
||||
// case ShaderOpcodes::RCP: recRCP(shaderUnit, instruction); break;
|
||||
// case ShaderOpcodes::RSQ: recRSQ(shaderUnit, instruction); break;
|
||||
|
||||
// Unimplemented opcodes that don't seem to actually be used but exist in the binary
|
||||
// EMIT/SETEMIT are used in geometry shaders, however are sometimes found in vertex shaders?
|
||||
// case ShaderOpcodes::EMIT:
|
||||
// case ShaderOpcodes::SETEMIT:
|
||||
// log("[ShaderJIT] Unknown PICA opcode: %02X\n", opcode);
|
||||
// emitPrintLog(shaderUnit);
|
||||
// break;
|
||||
|
||||
// case ShaderOpcodes::BREAK:
|
||||
// case ShaderOpcodes::BREAKC: Helpers::warn("[Shader JIT] Unimplemented BREAK(C) instruction!"); break;
|
||||
|
||||
// We consider both MAD and MADI to be the same instruction and decode which one we actually have in recMAD
|
||||
// case 0x30:
|
||||
// case 0x31:
|
||||
// case 0x32:
|
||||
// case 0x33:
|
||||
// case 0x34:
|
||||
// case 0x35:
|
||||
// case 0x36:
|
||||
// case 0x37:
|
||||
// case 0x38:
|
||||
// case 0x39:
|
||||
// case 0x3A:
|
||||
// case 0x3B:
|
||||
// case 0x3C:
|
||||
// case 0x3D:
|
||||
// case 0x3E:
|
||||
// case 0x3F: recMAD(shaderUnit, instruction); break;
|
||||
|
||||
// case ShaderOpcodes::SLT:
|
||||
// case ShaderOpcodes::SLTI: recSLT(shaderUnit, instruction); break;
|
||||
|
@ -221,9 +254,165 @@ void ShaderEmitter::storeRegister(QReg source, const PICAShader& shader, u32 des
|
|||
if (writeMask == 0xf) { // No lanes are masked, just use STR
|
||||
STR(source, statePointer, offset);
|
||||
} else {
|
||||
LDR(scratch1, statePointer, offset); // Load current source
|
||||
Helpers::panic("Unimplemented: Storing to register with blending");
|
||||
}
|
||||
LDR(scratch1, statePointer, offset); // Load current value
|
||||
LDR(scratch2, blendMasks.ptr<u8*>() + writeMask * 16); // Load write mask for blending
|
||||
|
||||
BSL(scratch2.B16(), source.B16(), scratch1.B16()); // Scratch2 = (Source & mask) | (original & ~mask)
|
||||
STR(scratch2, statePointer, offset); // Write it back
|
||||
}
|
||||
}
|
||||
|
||||
void ShaderEmitter::recDP3(const PICAShader& shader, u32 instruction) {
|
||||
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
|
||||
const u32 src1 = getBits<12, 7>(instruction);
|
||||
const u32 src2 = getBits<7, 5>(instruction);
|
||||
const u32 idx = getBits<19, 2>(instruction);
|
||||
const u32 dest = getBits<21, 5>(instruction);
|
||||
const u32 writeMask = getBits<0, 4>(operandDescriptor);
|
||||
|
||||
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
|
||||
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
|
||||
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
|
||||
// Set W component of src1 to 0.0, so that the w factor of the following dp4 will become 0, making it equivalent to a dp3
|
||||
INS(src1_vec.Selem()[3], WZR);
|
||||
|
||||
// Now do a full DP4
|
||||
FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4()); // Do a piecewise multiplication of the vectors first
|
||||
FADDP(src1_vec.S4(), src1_vec.S4(), src1_vec.S4()); // Now add the adjacent components together
|
||||
FADDP(src1_vec.toS(), src1_vec.toD().S2()); // Again for the bottom 2 lanes. Now the bottom lane contains the dot product
|
||||
|
||||
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
|
||||
DUP(src1_vec.S4(), src1_vec.Selem()[0]); // src1_vec = src1_vec.xxxx
|
||||
}
|
||||
|
||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
|
||||
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
|
||||
const u32 src1 = getBits<12, 7>(instruction);
|
||||
const u32 src2 = getBits<7, 5>(instruction);
|
||||
const u32 idx = getBits<19, 2>(instruction);
|
||||
const u32 dest = getBits<21, 5>(instruction);
|
||||
const u32 writeMask = getBits<0, 4>(operandDescriptor);
|
||||
|
||||
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
|
||||
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
|
||||
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
|
||||
|
||||
FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4()); // Do a piecewise multiplication of the vectors first
|
||||
FADDP(src1_vec.S4(), src1_vec.S4(), src1_vec.S4()); // Now add the adjacent components together
|
||||
FADDP(src1_vec.toS(), src1_vec.toD().S2()); // Again for the bottom 2 lanes. Now the bottom lane contains the dot product
|
||||
|
||||
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
|
||||
DUP(src1_vec.S4(), src1_vec.Selem()[0]); // src1_vec = src1_vec.xxxx
|
||||
}
|
||||
|
||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
void ShaderEmitter::recADD(const PICAShader& shader, u32 instruction) {
|
||||
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
|
||||
const u32 src1 = getBits<12, 7>(instruction);
|
||||
const u32 src2 = getBits<7, 5>(instruction);
|
||||
const u32 idx = getBits<19, 2>(instruction);
|
||||
const u32 dest = getBits<21, 5>(instruction);
|
||||
|
||||
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
|
||||
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
|
||||
FADD(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
|
||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
void ShaderEmitter::recMAX(const PICAShader& shader, u32 instruction) {
|
||||
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
|
||||
const u32 src1 = getBits<12, 7>(instruction);
|
||||
const u32 src2 = getBits<7, 5>(instruction);
|
||||
const u32 idx = getBits<19, 2>(instruction);
|
||||
const u32 dest = getBits<21, 5>(instruction);
|
||||
|
||||
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
|
||||
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
|
||||
FMAX(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
|
||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
void ShaderEmitter::recMIN(const PICAShader& shader, u32 instruction) {
|
||||
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
|
||||
const u32 src1 = getBits<12, 7>(instruction);
|
||||
const u32 src2 = getBits<7, 5>(instruction);
|
||||
const u32 idx = getBits<19, 2>(instruction);
|
||||
const u32 dest = getBits<21, 5>(instruction);
|
||||
|
||||
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
|
||||
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
|
||||
FMIN(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
|
||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) {
|
||||
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
|
||||
const u32 src1 = getBits<12, 7>(instruction);
|
||||
const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
|
||||
const u32 idx = getBits<19, 2>(instruction);
|
||||
const u32 dest = getBits<21, 5>(instruction);
|
||||
|
||||
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
|
||||
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
|
||||
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
|
||||
FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
|
||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
void ShaderEmitter::recRSQ(const PICAShader& shader, u32 instruction) {
|
||||
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
|
||||
const u32 src = getBits<12, 7>(instruction);
|
||||
const u32 idx = getBits<19, 2>(instruction);
|
||||
const u32 dest = getBits<21, 5>(instruction);
|
||||
const u32 writeMask = operandDescriptor & 0xf;
|
||||
constexpr bool useAccurateRSQ = false;
|
||||
|
||||
loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor); // Load source 1 into scratch1
|
||||
|
||||
// Compute reciprocal square root approximation
|
||||
// TODO: Should this use frsqte or fsqrt+div? The former is faster but less accurate
|
||||
// PICA RSQ uses f24 precision though, so it'll be inherently innacurate, and it's likely using an inaccurate approximation too, seeing as
|
||||
// It doesn't have regular sqrt/div instructions.
|
||||
// For now, we default to inaccurate inverse square root
|
||||
if constexpr (useAccurateRSQ) {
|
||||
FMOV(scratch1.S4(), FImm8(0x70)); // scratch1 = vec4(1.0f)
|
||||
FSQRT(src1_vec.toS(), src1_vec.toS()); // src1 = sqrt(src1), scalar
|
||||
FDIV(src1_vec.toS(), scratch1.toS(), src1_vec.toS()); // Now invert src1
|
||||
} else {
|
||||
FRSQRTE(src1_vec.toS(), src1_vec.toS()); // Much nicer
|
||||
}
|
||||
|
||||
// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
|
||||
// Otherwise we do
|
||||
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
|
||||
DUP(src1_vec.S4(), src1_vec.Selem()[0]); // src1_vec = src1_vec.xxxx
|
||||
}
|
||||
|
||||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) {
|
||||
const bool isMADI = getBit<29>(instruction) == 0;
|
||||
|
||||
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x1f];
|
||||
const u32 src1 = getBits<17, 5>(instruction);
|
||||
const u32 src2 = isMADI ? getBits<12, 5>(instruction) : getBits<10, 7>(instruction);
|
||||
const u32 src3 = isMADI ? getBits<5, 7>(instruction) : getBits<5, 5>(instruction);
|
||||
const u32 idx = getBits<22, 2>(instruction);
|
||||
const u32 dest = getBits<24, 5>(instruction);
|
||||
|
||||
loadRegister<1>(src1_vec, shader, src1, 0, operandDescriptor);
|
||||
loadRegister<2>(src2_vec, shader, src2, isMADI ? 0 : idx, operandDescriptor);
|
||||
loadRegister<3>(src3_vec, shader, src3, isMADI ? idx : 0, operandDescriptor);
|
||||
|
||||
// TODO: Safe PICA multiplication
|
||||
FMLA(src3_vec.S4(), src1_vec.S4(), src2_vec.S4());
|
||||
storeRegister(src3_vec, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) {
|
||||
|
@ -236,4 +425,6 @@ void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) {
|
|||
storeRegister(src1_vec, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
// Emits code for the PICA END instruction, which compiles to a single RET.
// Neither the shader nor the instruction word is inspected.
void ShaderEmitter::recEND(const PICAShader& shader, u32 instruction) {
	RET();
}
|
||||
|
||||
#endif
|
Loading…
Add table
Reference in a new issue