mirror of
https://github.com/wheremyfoodat/Panda3DS.git
synced 2025-04-08 07:05:40 +12:00
[Shader JIT] Get first JIT trangle
This commit is contained in:
parent
9bb1f31fc9
commit
fd411245fa
3 changed files with 82 additions and 18 deletions
|
@ -51,6 +51,9 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
|
|||
const vec4f& getDestRef(const PICAShader& shader, u32 dest);
|
||||
|
||||
// Instruction recompilation functions
|
||||
void recADD(const PICAShader& shader, u32 instruction);
|
||||
void recDP4(const PICAShader& shader, u32 instruction);
|
||||
void recEND(const PICAShader& shader, u32 instruction);
|
||||
void recMOV(const PICAShader& shader, u32 instruction);
|
||||
|
||||
public:
|
||||
|
@ -64,6 +67,9 @@ public:
|
|||
const auto cpu = Xbyak::util::Cpu();
|
||||
|
||||
haveSSE4_1 = cpu.has(Xbyak::util::Cpu::tSSE41);
|
||||
if (!cpu.has(Xbyak::util::Cpu::tSSE3)) {
|
||||
Helpers::panic("This CPU does not support SSE3. Please use the shader interpreter instead");
|
||||
}
|
||||
}
|
||||
|
||||
void compile(const PICAShader& shaderUnit);
|
||||
|
|
|
@ -73,6 +73,18 @@ class PICAShader {
|
|||
|
||||
std::array<u32, 4> floatUniformBuffer; // Buffer for temporarily caching float uniform data
|
||||
|
||||
public:
|
||||
// These are placed close to the temp registers and co because it helps the JIT generate better code
|
||||
u32 entrypoint = 0; // Initial shader PC
|
||||
u32 boolUniform;
|
||||
std::array<OpenGL::Vector<u8, 4>, 4> intUniforms;
|
||||
alignas(16) std::array<vec4f, 96> floatUniforms;
|
||||
|
||||
alignas(16) std::array<vec4f, 16> fixedAttributes; // Fixed vertex attributes
|
||||
alignas(16) std::array<vec4f, 16> inputs; // Attributes passed to the shader
|
||||
alignas(16) std::array<vec4f, 16> outputs;
|
||||
alignas(16) vec4f dummy = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() }); // Dummy register used by the JIT
|
||||
|
||||
protected:
|
||||
std::array<u32, 128> operandDescriptors;
|
||||
alignas(16) std::array<vec4f, 16> tempRegisters; // General purpose registers the shader can use for temp values
|
||||
|
@ -191,15 +203,6 @@ public:
|
|||
std::array<u32, maxInstructionCount> loadedShader; // Currently loaded & active shader
|
||||
std::array<u32, maxInstructionCount> bufferedShader; // Shader to be transferred when the SH_CODETRANSFER_END reg gets written to
|
||||
|
||||
u32 entrypoint = 0; // Initial shader PC
|
||||
u32 boolUniform;
|
||||
std::array<OpenGL::Vector<u8, 4>, 4> intUniforms;
|
||||
alignas(16) std::array<vec4f, 96> floatUniforms;
|
||||
|
||||
alignas(16) std::array<vec4f, 16> fixedAttributes; // Fixed vertex attributes
|
||||
alignas(16) std::array<vec4f, 16> inputs; // Attributes passed to the shader
|
||||
alignas(16) std::array<vec4f, 16> outputs;
|
||||
|
||||
PICAShader(ShaderType type) : type(type) {}
|
||||
|
||||
// Theese functions are in the header to be inlined more easily, though with LTO I hope I'll be able to move them
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
#include "PICA/dynapica/shader_rec_emitter_x64.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
#include <bit>
|
||||
#include <cstddef>
|
||||
|
||||
using namespace Xbyak;
|
||||
|
@ -11,7 +12,9 @@ using namespace Xbyak::util;
|
|||
static constexpr Reg64 statePointer = rbp;
|
||||
static constexpr Xmm scratch1 = xmm0;
|
||||
static constexpr Xmm scratch2 = xmm1;
|
||||
static constexpr Xmm scratch3 = xmm2;
|
||||
static constexpr Xmm src1_xmm = xmm2;
|
||||
static constexpr Xmm src2_xmm = xmm3;
|
||||
static constexpr Xmm src3_xmm = xmm4;
|
||||
|
||||
void ShaderEmitter::compile(const PICAShader& shaderUnit) {
|
||||
// Emit prologue first
|
||||
|
@ -71,15 +74,17 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
|
|||
const u32 opcode = instruction >> 26;
|
||||
|
||||
switch (opcode) {
|
||||
case ShaderOpcodes::ADD: recADD(shaderUnit, instruction); break;
|
||||
case ShaderOpcodes::DP4: recDP4(shaderUnit, instruction); break;
|
||||
case ShaderOpcodes::END: recEND(shaderUnit, instruction); break;
|
||||
case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break;
|
||||
case ShaderOpcodes::NOP: break;
|
||||
default:
|
||||
Helpers::panic("Shader JIT: Unimplemented PICA opcode %X", opcode);
|
||||
}
|
||||
}
|
||||
|
||||
const ShaderEmitter::vec4f& ShaderEmitter::getSourceRef(const PICAShader& shader, u32 src) {
|
||||
alignas(16) static vec4f dummy = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() });
|
||||
|
||||
if (src < 0x10)
|
||||
return shader.inputs[src];
|
||||
else if (src < 0x20)
|
||||
|
@ -88,7 +93,7 @@ const ShaderEmitter::vec4f& ShaderEmitter::getSourceRef(const PICAShader& shader
|
|||
return shader.floatUniforms[src - 0x20];
|
||||
else {
|
||||
Helpers::warn("[Shader JIT] Unimplemented source value: %X\n", src);
|
||||
return dummy;
|
||||
return shader.dummy;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -132,7 +137,7 @@ void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 src, u3
|
|||
movaps(dest, xword[statePointer + offset]);
|
||||
else // Swizzle is not trivial so we need to emit a shuffle instruction
|
||||
pshufd(dest, xword[statePointer + offset], convertedSwizzle);
|
||||
return;
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
|
@ -142,8 +147,6 @@ void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 src, u3
|
|||
if (negate) {
|
||||
Helpers::panic("[ShaderJIT] Unimplemented register negation");
|
||||
}
|
||||
|
||||
Helpers::panic("Reached unreachable path in PICAShader::getIndexedSource");
|
||||
}
|
||||
|
||||
void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest, u32 operandDescriptor) {
|
||||
|
@ -151,9 +154,22 @@ void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest
|
|||
const uintptr_t offset = uintptr_t(&destRef) - uintptr_t(&shader); // Calculate offset of register from start of the state struct
|
||||
|
||||
// Mask of which lanes to write
|
||||
// TODO: If only 1 lane is being written to, use movss
|
||||
u32 writeMask = operandDescriptor & 0xf;
|
||||
if (writeMask == 0xf) { // No lanes are masked, just movaps
|
||||
movaps(xword[statePointer + offset], source);
|
||||
} else if (std::popcount(writeMask) == 1) { // Only 1 register needs to be written back. This can be done with a simple shift right + movss
|
||||
int bit = std::countr_zero(writeMask); // Get which PICA register needs to be written to (0 = w, 1 = z, etc)
|
||||
size_t index = 3 - bit;
|
||||
const uintptr_t lane_offset = offset + index * sizeof(float);
|
||||
|
||||
if (index == 0) { // Bottom lane, no need to shift
|
||||
movss(dword[statePointer + lane_offset], source);
|
||||
} else { // Shift right by 32 * index, then write bottom lane
|
||||
movaps(scratch1, source);
|
||||
psrldq(scratch1, index * sizeof(float));
|
||||
movss(dword[statePointer + lane_offset], scratch1);
|
||||
}
|
||||
} else if (haveSSE4_1) {
|
||||
// Bit reverse the write mask because that is what blendps expects
|
||||
u32 adjustedMask = ((writeMask >> 3) & 0b1) | ((writeMask >> 1) & 0b10) | ((writeMask << 1) & 0b100) | ((writeMask << 3) & 0b1000);
|
||||
|
@ -176,14 +192,53 @@ void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest
|
|||
}
|
||||
}
|
||||
|
||||
void ShaderEmitter::recEND(const PICAShader& shader, u32 instruction) {
|
||||
// Undo anything the prologue did and return
|
||||
// Dellocate shadow stack on Windows
|
||||
if constexpr (isWindows()) {
|
||||
add(rsp, 32);
|
||||
}
|
||||
|
||||
// Restore registers
|
||||
pop(statePointer);
|
||||
ret();
|
||||
}
|
||||
|
||||
void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) {
|
||||
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
|
||||
u32 src = (instruction >> 12) & 0x7f;
|
||||
const u32 idx = (instruction >> 19) & 3;
|
||||
const u32 dest = (instruction >> 21) & 0x1f;
|
||||
|
||||
loadRegister<1>(scratch1, shader, src, idx, operandDescriptor); // Load source 1 into scratch1
|
||||
storeRegister(scratch1, shader, dest, operandDescriptor);
|
||||
loadRegister<1>(src1_xmm, shader, src, idx, operandDescriptor); // Load source 1 into scratch1
|
||||
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
void ShaderEmitter::recADD(const PICAShader& shader, u32 instruction) {
|
||||
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
|
||||
u32 src1 = (instruction >> 12) & 0x7f;
|
||||
const u32 src2 = (instruction >> 7) & 0x1f; // src2 coming first because PICA moment
|
||||
const u32 idx = (instruction >> 19) & 3;
|
||||
const u32 dest = (instruction >> 21) & 0x1f;
|
||||
|
||||
loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
|
||||
loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
|
||||
addps(src1_xmm, src2_xmm); // Dot product between the 2 register
|
||||
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
|
||||
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
|
||||
u32 src1 = (instruction >> 12) & 0x7f;
|
||||
const u32 src2 = (instruction >> 7) & 0x1f; // src2 coming first because PICA moment
|
||||
const u32 idx = (instruction >> 19) & 3;
|
||||
const u32 dest = (instruction >> 21) & 0x1f;
|
||||
|
||||
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
|
||||
loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
|
||||
loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
|
||||
dpps(src1_xmm, src2_xmm, 0b11111111); // Dot product between the 2 register, store the result in all lanes of scratch1 similarly to PICA
|
||||
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
#endif
|
Loading…
Add table
Reference in a new issue