[Shader JIT] Get first JIT triangle

This commit is contained in:
wheremyfoodat 2023-06-09 02:28:59 +03:00
parent 9bb1f31fc9
commit fd411245fa
3 changed files with 82 additions and 18 deletions

View file

@ -51,6 +51,9 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
const vec4f& getDestRef(const PICAShader& shader, u32 dest);
// Instruction recompilation functions
void recADD(const PICAShader& shader, u32 instruction);
void recDP4(const PICAShader& shader, u32 instruction);
void recEND(const PICAShader& shader, u32 instruction);
void recMOV(const PICAShader& shader, u32 instruction);
public:
@ -64,6 +67,9 @@ public:
const auto cpu = Xbyak::util::Cpu();
haveSSE4_1 = cpu.has(Xbyak::util::Cpu::tSSE41);
if (!cpu.has(Xbyak::util::Cpu::tSSE3)) {
Helpers::panic("This CPU does not support SSE3. Please use the shader interpreter instead");
}
}
void compile(const PICAShader& shaderUnit);

View file

@ -73,6 +73,18 @@ class PICAShader {
std::array<u32, 4> floatUniformBuffer; // Buffer for temporarily caching float uniform data
public:
// These are placed close to the temp registers and co because it helps the JIT generate better code
u32 entrypoint = 0; // Initial shader PC
u32 boolUniform;
std::array<OpenGL::Vector<u8, 4>, 4> intUniforms;
alignas(16) std::array<vec4f, 96> floatUniforms;
alignas(16) std::array<vec4f, 16> fixedAttributes; // Fixed vertex attributes
alignas(16) std::array<vec4f, 16> inputs; // Attributes passed to the shader
alignas(16) std::array<vec4f, 16> outputs;
alignas(16) vec4f dummy = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() }); // Dummy register used by the JIT
protected:
std::array<u32, 128> operandDescriptors;
alignas(16) std::array<vec4f, 16> tempRegisters; // General purpose registers the shader can use for temp values
@ -191,15 +203,6 @@ public:
std::array<u32, maxInstructionCount> loadedShader; // Currently loaded & active shader
std::array<u32, maxInstructionCount> bufferedShader; // Shader to be transferred when the SH_CODETRANSFER_END reg gets written to
u32 entrypoint = 0; // Initial shader PC
u32 boolUniform;
std::array<OpenGL::Vector<u8, 4>, 4> intUniforms;
alignas(16) std::array<vec4f, 96> floatUniforms;
alignas(16) std::array<vec4f, 16> fixedAttributes; // Fixed vertex attributes
alignas(16) std::array<vec4f, 16> inputs; // Attributes passed to the shader
alignas(16) std::array<vec4f, 16> outputs;
PICAShader(ShaderType type) : type(type) {}
// These functions are in the header to be inlined more easily, though with LTO I hope I'll be able to move them

View file

@ -2,6 +2,7 @@
#include "PICA/dynapica/shader_rec_emitter_x64.hpp"
#include <algorithm>
#include <bit>
#include <cstddef>
using namespace Xbyak;
@ -11,7 +12,9 @@ using namespace Xbyak::util;
// Fixed host register assignments used by the recompiler.
// statePointer is the base register for every [statePointer + offset] access to shader state in emitted code
static constexpr Reg64 statePointer = rbp;
// Temporaries for the emitter's own use
static constexpr Xmm scratch1 = xmm0;
static constexpr Xmm scratch2 = xmm1;
// NOTE(review): scratch3 and src1_xmm both name xmm2 - confirm this aliasing is intended
static constexpr Xmm scratch3 = xmm2;
// Registers that instruction operands are loaded into by the rec* functions
static constexpr Xmm src1_xmm = xmm2;
static constexpr Xmm src2_xmm = xmm3;
static constexpr Xmm src3_xmm = xmm4;
void ShaderEmitter::compile(const PICAShader& shaderUnit) {
// Emit prologue first
@ -71,15 +74,17 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
const u32 opcode = instruction >> 26;
switch (opcode) {
case ShaderOpcodes::ADD: recADD(shaderUnit, instruction); break;
case ShaderOpcodes::DP4: recDP4(shaderUnit, instruction); break;
case ShaderOpcodes::END: recEND(shaderUnit, instruction); break;
case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break;
case ShaderOpcodes::NOP: break;
default:
Helpers::panic("Shader JIT: Unimplemented PICA opcode %X", opcode);
}
}
const ShaderEmitter::vec4f& ShaderEmitter::getSourceRef(const PICAShader& shader, u32 src) {
alignas(16) static vec4f dummy = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() });
if (src < 0x10)
return shader.inputs[src];
else if (src < 0x20)
@ -88,7 +93,7 @@ const ShaderEmitter::vec4f& ShaderEmitter::getSourceRef(const PICAShader& shader
return shader.floatUniforms[src - 0x20];
else {
Helpers::warn("[Shader JIT] Unimplemented source value: %X\n", src);
return dummy;
return shader.dummy;
}
}
@ -132,7 +137,7 @@ void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 src, u3
movaps(dest, xword[statePointer + offset]);
else // Swizzle is not trivial so we need to emit a shuffle instruction
pshufd(dest, xword[statePointer + offset], convertedSwizzle);
return;
break;
}
default:
@ -142,8 +147,6 @@ void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 src, u3
if (negate) {
Helpers::panic("[ShaderJIT] Unimplemented register negation");
}
Helpers::panic("Reached unreachable path in PICAShader::getIndexedSource");
}
void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest, u32 operandDescriptor) {
@ -151,9 +154,22 @@ void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest
const uintptr_t offset = uintptr_t(&destRef) - uintptr_t(&shader); // Calculate offset of register from start of the state struct
// Mask of which lanes to write
// TODO: If only 1 lane is being written to, use movss
u32 writeMask = operandDescriptor & 0xf;
if (writeMask == 0xf) { // No lanes are masked, just movaps
movaps(xword[statePointer + offset], source);
} else if (std::popcount(writeMask) == 1) { // Only 1 register needs to be written back. This can be done with a simple shift right + movss
int bit = std::countr_zero(writeMask); // Get which PICA register needs to be written to (0 = w, 1 = z, etc)
size_t index = 3 - bit;
const uintptr_t lane_offset = offset + index * sizeof(float);
if (index == 0) { // Bottom lane, no need to shift
movss(dword[statePointer + lane_offset], source);
} else { // Shift right by 32 * index, then write bottom lane
movaps(scratch1, source);
psrldq(scratch1, index * sizeof(float));
movss(dword[statePointer + lane_offset], scratch1);
}
} else if (haveSSE4_1) {
// Bit reverse the write mask because that is what blendps expects
u32 adjustedMask = ((writeMask >> 3) & 0b1) | ((writeMask >> 1) & 0b10) | ((writeMask << 1) & 0b100) | ((writeMask << 3) & 0b1000);
@ -176,14 +192,53 @@ void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest
}
}
// Recompiles the END opcode: emits the epilogue of the generated function.
// Neither parameter is read; they are present so the signature matches the other rec* functions.
void ShaderEmitter::recEND(const PICAShader& shader, u32 instruction) {
// Undo anything the prologue did and return
// Deallocate the 32 bytes of shadow stack space on Windows
if constexpr (isWindows()) {
add(rsp, 32);
}
// Restore the register used as the state pointer, then return from the emitted code
pop(statePointer);
ret();
}
// Recompiles the MOV opcode: loads the source operand and writes it to the destination register.
// shader: shader state used to look up the operand descriptor and to resolve register locations
// instruction: the raw 32-bit instruction word being recompiled
void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) {
	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
	const u32 src = (instruction >> 12) & 0x7f;
	const u32 idx = (instruction >> 19) & 3;
	const u32 dest = (instruction >> 21) & 0x1f;

	// Emit a single load/store pair through src1_xmm.
	// Emitting the same move a second time through scratch1 only produced redundant code
	loadRegister<1>(src1_xmm, shader, src, idx, operandDescriptor);  // Load source 1 into src1_xmm
	storeRegister(src1_xmm, shader, dest, operandDescriptor);
}
void ShaderEmitter::recADD(const PICAShader& shader, u32 instruction) {
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
u32 src1 = (instruction >> 12) & 0x7f;
const u32 src2 = (instruction >> 7) & 0x1f; // src2 coming first because PICA moment
const u32 idx = (instruction >> 19) & 3;
const u32 dest = (instruction >> 21) & 0x1f;
loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
addps(src1_xmm, src2_xmm); // Dot product between the 2 register
storeRegister(src1_xmm, shader, dest, operandDescriptor);
}
void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
u32 src1 = (instruction >> 12) & 0x7f;
const u32 src2 = (instruction >> 7) & 0x1f; // src2 coming first because PICA moment
const u32 idx = (instruction >> 19) & 3;
const u32 dest = (instruction >> 21) & 0x1f;
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
dpps(src1_xmm, src2_xmm, 0b11111111); // Dot product between the 2 register, store the result in all lanes of scratch1 similarly to PICA
storeRegister(src1_xmm, shader, dest, operandDescriptor);
}
#endif