[ShaderJIT] Add MOV

This commit is contained in:
wheremyfoodat 2023-06-09 00:46:17 +03:00
parent fb11fd4440
commit 9bb1f31fc9
3 changed files with 131 additions and 20 deletions

View file

@ -5,6 +5,7 @@
#include "helpers.hpp"
#include "PICA/shader.hpp"
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"
#include "x64_regs.hpp"
#include <vector>
@ -14,13 +15,20 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
// Size of the code buffer: the executable memory budget plus one extra page (0x1000 bytes) of padding,
// kept as a safety margin in the extremely unlikely case generated code overflows the budget
static constexpr size_t allocSize = executableMemorySize + 0x1000;
// If the swizzle field is this value then the swizzle pattern is .xyzw so we don't need a shuffle
static constexpr uint noSwizzle = 0x1B;
using f24 = Floats::f24;
using vec4f = OpenGL::Vector<f24, 4>;
// An array of labels (incl pointers) to each compiled (to x64) PICA instruction
std::array<Xbyak::Label, PICAShader::maxInstructionCount> instructionLabels;
// A vector of PCs that can potentially return based on the state of the PICA callstack.
// Filled before compiling a shader by scanning the code for call instructions
std::vector<u32> returnPCs;
// PC the recompiler is currently recompiling @
// Declared exactly once, with an initializer so that it never holds an indeterminate value
u32 recompilerPC = 0;
bool haveSSE4_1 = false; // Shows if the CPU supports SSE4.1
// Compile all instructions from [current recompiler PC, end)
void compileUntil(const PICAShader& shaderUnit, u32 endPC);
@ -35,7 +43,12 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
void scanForCalls(const PICAShader& shaderUnit);
// Load register with number "src" indexed by index "idx" into the xmm register "dest"
// "sourceIndex" selects which operand slot (1 = SRC1, 2 = SRC2, 3 = SRC3) of "operandDescriptor" is decoded
// for the negate flag and the component swizzle
template <int sourceIndex>
void loadRegister(Xmm dest, const PICAShader& shader, u32 src, u32 idx, u32 operandDescriptor);
// Store the xmm register "source" to the register with number "dest", writing only the lanes enabled
// by the write mask in the low 4 bits of "operandDescriptor"
void storeRegister(Xmm source, const PICAShader& shader, u32 dest, u32 operandDescriptor);
// Map a PICA source/destination register number to the vec4 backing it inside "shader"
// The emitter only uses these references to compute the register's offset from the start of the shader state
const vec4f& getSourceRef(const PICAShader& shader, u32 src);
const vec4f& getDestRef(const PICAShader& shader, u32 dest);
// Instruction recompilation functions
void recMOV(const PICAShader& shader, u32 instruction);
@ -44,15 +57,22 @@ public:
using InstructionCallback = const void(*)(PICAShader& shaderUnit); // Callback type used for instructions
// Callback type used for the JIT prologue. This is what the caller will call
using PrologueCallback = const void(*)(PICAShader& shaderUnit, InstructionCallback cb);
// Declared exactly once and null-initialized, so that it never holds an indeterminate pointer value
PrologueCallback prologueCb = nullptr;
// Initialize our emitter with "allocSize" bytes of RWX memory
// Also queries the host CPU once, so that the recompiler knows whether it may emit SSE4.1 instructions
ShaderEmitter() : Xbyak::CodeGenerator(allocSize) {
	const auto cpu = Xbyak::util::Cpu();
	haveSSE4_1 = cpu.has(Xbyak::util::Cpu::tSSE41);
}
void compile(const PICAShader& shaderUnit);
// PC must be a valid entrypoint here. It doesn't have that much overhead in this case, so we use std::array<>::at() to assert it does
// (at() throws std::out_of_range when pc is not a valid index into instructionLabels)
// Returns the address of the code emitted for the PICA instruction at "pc", as a callable function pointer
InstructionCallback getInstructionCallback(u32 pc) {
	// Cast away the constness because casting to a function pointer is hard otherwise. Legal as long as we don't write to *ptr
	uint8_t* ptr = const_cast<uint8_t*>(instructionLabels.at(pc).getAddress());
	return reinterpret_cast<InstructionCallback>(ptr);
}
PrologueCallback getPrologueCallback() {

View file

@ -42,6 +42,7 @@ namespace ShaderOpcodes {
};
}
// Note: All PICA f24 vec4 registers must have the alignas(16) specifier to make them easier to access in SSE/NEON code in the JIT
class PICAShader {
using f24 = Floats::f24;
using vec4f = OpenGL::Vector<f24, 4>;
@ -74,7 +75,7 @@ class PICAShader {
protected:
std::array<u32, 128> operandDescriptors;
// General purpose registers the shader can use for temp values
// Declared exactly once, 16-byte aligned so that the JIT can access it with aligned SSE loads/stores (movaps)
alignas(16) std::array<vec4f, 16> tempRegisters;
OpenGL::Vector<s32, 2> addrRegister; // Address register
bool cmpRegister[2]; // Comparison registers where the result of CMP is stored in
u32 loopCounter;
@ -104,10 +105,10 @@ protected:
// Friend declarations: these classes may access the protected/private members of PICAShader
// (ShaderEmitter reads the register arrays directly in order to compute their offsets)
friend class ShaderJIT;
friend class ShaderEmitter;
private:
// Register accessors taking a PICA source/destination register number
vec4f getSource(u32 source);
vec4f& getDest(u32 dest);
// NOTE(review): this access specifier repeats the "private:" already in effect above, so it is redundant as written.
// Presumably only one of the two is intended - confirm which
private:
// Interpreter functions for the various shader functions
void add(u32 instruction);
void call(u32 instruction);
@ -193,11 +194,11 @@ public:
u32 entrypoint = 0; // Initial shader PC
u32 boolUniform;
std::array<OpenGL::Vector<u8, 4>, 4> intUniforms;
// Every f24 vec4 register array below is declared exactly once and is 16-byte aligned,
// so that the JIT can access the registers with aligned SSE loads/stores (movaps)
alignas(16) std::array<vec4f, 96> floatUniforms;
alignas(16) std::array<vec4f, 16> fixedAttributes; // Fixed vertex attributes
alignas(16) std::array<vec4f, 16> inputs; // Attributes passed to the shader
alignas(16) std::array<vec4f, 16> outputs;
PICAShader(ShaderType type) : type(type) {}

View file

@ -73,27 +73,117 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
switch (opcode) {
case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break;
default:
Helpers::panic("ShaderJIT: Unimplemented PICA opcode %X", opcode);
Helpers::panic("Shader JIT: Unimplemented PICA opcode %X", opcode);
}
}
void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 srcReg, u32 index) {
const ShaderEmitter::vec4f& ShaderEmitter::getSourceRef(const PICAShader& shader, u32 src) {
alignas(16) static vec4f dummy = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() });
if (src < 0x10)
return shader.inputs[src];
else if (src < 0x20)
return shader.tempRegisters[src - 0x10];
else if (src <= 0x7f)
return shader.floatUniforms[src - 0x20];
else {
Helpers::warn("[Shader JIT] Unimplemented source value: %X\n", src);
return dummy;
}
}
// Maps a PICA destination register number to the vec4 that backs it inside "shader":
//   0x00-0x0F -> output registers
//   0x10-0x1F -> temporary registers
// Any other value is reported through Helpers::panic
const ShaderEmitter::vec4f& ShaderEmitter::getDestRef(const PICAShader& shader, u32 dest) {
	if (dest >= 0x20) {
		Helpers::panic("[Shader JIT] Unimplemented dest: %X", dest);
	}

	const bool isOutput = dest < 0x10;
	return isOutput ? shader.outputs[dest] : shader.tempRegisters[dest - 0x10];
}
// See shader.hpp header for docs on how the swizzle and negate works
// Emits code that loads PICA source register "src" into the xmm register "dest", applying the component swizzle
// that "operandDescriptor" holds for operand slot "sourceIndex" (1 = SRC1, 2 = SRC2, 3 = SRC3)
// Only index == 0 (src used as is, without an offset) is implemented; any other index panics
// Negated operands are not implemented either and panic, instead of being silently loaded without the negation
template <int sourceIndex>
void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 src, u32 index, u32 operandDescriptor) {
	static_assert(sourceIndex >= 1 && sourceIndex <= 3, "sourceIndex must be 1 (SRC1), 2 (SRC2) or 3 (SRC3)");

	// Every operand slot is a negate bit followed by an 8-bit swizzle, and successive slots are 9 bits apart:
	// SRC1 -> negate bit 4, swizzle bits 5-12. SRC2 -> negate bit 13, swizzle bits 14-21. SRC3 -> negate bit 22, swizzle bits 23-30
	constexpr u32 negateBit = 4 + 9 * (sourceIndex - 1);
	const bool negate = ((operandDescriptor >> negateBit) & 1) != 0; // If true, negate all lanes of the register
	const u32 compSwizzle = (operandDescriptor >> (negateBit + 1)) & 0xff; // Component swizzle pattern for the register

	// PICA has the swizzle descriptor inverted in comparison to x86. For the PICA, the descriptor is (lowest to highest bits) wzyx while it's xyzw for x86
	// So we reverse the order of the four 2-bit selectors
	const u32 convertedSwizzle = ((compSwizzle >> 6) & 0b11) | (((compSwizzle >> 4) & 0b11) << 2) | (((compSwizzle >> 2) & 0b11) << 4) | ((compSwizzle & 0b11) << 6);

	switch (index) {
		case 0: [[likely]] { // Keep src as is, no need to offset it
			// This check has to happen before the early return below, otherwise the negate flag would be ignored
			if (negate) {
				Helpers::panic("[ShaderJIT] Unimplemented register negation");
			}

			const vec4f& srcRef = getSourceRef(shader, src);
			const uintptr_t offset = uintptr_t(&srcRef) - uintptr_t(&shader); // Calculate offset of register from start of the state struct

			if (compSwizzle == noSwizzle) // Avoid emitting swizzle if not necessary
				movaps(dest, xword[statePointer + offset]);
			else // Swizzle is not trivial so we need to emit a shuffle instruction
				pshufd(dest, xword[statePointer + offset], convertedSwizzle);
			return;
		}

		default:
			Helpers::panic("[ShaderJIT]: Unimplemented source index type");
	}

	// Only reachable if the panic above returns
	Helpers::panic("Reached unreachable path in ShaderEmitter::loadRegister");
}
// Emits code that stores the xmm register "source" to the PICA destination register "dest"
// Only the lanes enabled in the write mask (low 4 bits of "operandDescriptor") are written: bit 3 enables x, bit 2 y, bit 1 z, bit 0 w
// "source" is allowed to be scratch1 or scratch2 (recMOV passes scratch1); the temporaries are then picked so that
// "source" is never overwritten before it has been read. Both scratch registers may be clobbered by the emitted code
void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest, u32 operandDescriptor) {
	const vec4f& destRef = getDestRef(shader, dest);
	const uintptr_t offset = uintptr_t(&destRef) - uintptr_t(&shader); // Calculate offset of register from start of the state struct

	// Mask of which lanes to write
	const u32 writeMask = operandDescriptor & 0xf;
	if (writeMask == 0xf) { // No lanes are masked, just movaps
		movaps(xword[statePointer + offset], source);
		return;
	}

	const bool sourceIsScratch1 = source.getIdx() == scratch1.getIdx();
	const bool sourceIsScratch2 = source.getIdx() == scratch2.getIdx();
	// A scratch register that is guaranteed to be a different register from "source"
	const Xmm temp = sourceIsScratch1 ? scratch2 : scratch1;

	if (haveSSE4_1) {
		// Bit reverse the write mask because that is what blendps expects (bit 0 of its immediate selects lane x)
		const u32 adjustedMask = ((writeMask >> 3) & 0b1) | ((writeMask >> 1) & 0b10) | ((writeMask << 1) & 0b100) | ((writeMask << 3) & 0b1000);
		movaps(temp, xword[statePointer + offset]); // Read current value of dest
		blendps(temp, source, adjustedMask); // Blend with source
		movaps(xword[statePointer + offset], temp); // Write back
	} else if (!sourceIsScratch1 && !sourceIsScratch2) {
		// Blend algo referenced from Citra. It needs both scratch registers in addition to the source
		const u8 selector = (((writeMask & 0b1000) ? 1 : 0) << 0) |
			(((writeMask & 0b0100) ? 3 : 2) << 2) |
			(((writeMask & 0b0010) ? 0 : 1) << 4) |
			(((writeMask & 0b0001) ? 2 : 3) << 6);

		movaps(scratch1, xword[statePointer + offset]);
		movaps(scratch2, source);
		unpckhps(scratch2, scratch1); // scratch2 = { src.z, dest.z, src.w, dest.w }
		unpcklps(scratch1, source); // scratch1 = { dest.x, src.x, dest.y, src.y }
		shufps(scratch1, scratch2, selector); // "merge-shuffle" dest and source using selector
		movaps(xword[statePointer + offset], scratch1); // Write back
	} else {
		// The source lives in one of the scratch registers, so the algorithm above would destroy it before reading it
		// Instead, write each enabled lane individually, using the one scratch register the source does not occupy
		for (u32 lane = 0; lane < 4; lane++) {
			if ((writeMask & (0b1000u >> lane)) == 0) {
				continue; // Lane is masked out, leave the value in memory untouched
			}

			const uintptr_t laneOffset = offset + lane * 4; // Each lane is a 32-bit float
			if (lane == 0) {
				movss(dword[statePointer + laneOffset], source); // movss stores lane 0 directly
			} else {
				pshufd(temp, source, lane); // Move the wanted lane of source to lane 0 of temp
				movss(dword[statePointer + laneOffset], temp);
			}
		}
	}
}
void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) {
/*
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
u32 src = (instruction >> 12) & 0x7f;
const u32 idx = (instruction >> 19) & 3;
const u32 dest = (instruction >> 21) & 0x1f;
src = getIndexedSource(src, idx);
vec4f srcVector = getSourceSwizzled<1>(src, operandDescriptor);
vec4f& destVector = getDest(dest);
u32 componentMask = operandDescriptor & 0xf;
*/
loadRegister<1>(scratch1, shader, src, idx, operandDescriptor); // Load source 1 into scratch1
storeRegister(scratch1, shader, dest, operandDescriptor);
}
#endif