mirror of
https://github.com/wheremyfoodat/Panda3DS.git
synced 2025-04-08 07:05:40 +12:00
[ShaderJIT] Add MOV
This commit is contained in:
parent
fb11fd4440
commit
9bb1f31fc9
3 changed files with 131 additions and 20 deletions
|
@ -5,6 +5,7 @@
|
|||
#include "helpers.hpp"
|
||||
#include "PICA/shader.hpp"
|
||||
#include "xbyak/xbyak.h"
|
||||
#include "xbyak/xbyak_util.h"
|
||||
#include "x64_regs.hpp"
|
||||
|
||||
#include <vector>
|
||||
|
@ -14,13 +15,20 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
|
|||
// Allocate some extra space as padding for security purposes in the extremely unlikely occasion we manage to overflow the above size
|
||||
static constexpr size_t allocSize = executableMemorySize + 0x1000;
|
||||
|
||||
// If the swizzle field is this value then the swizzle pattern is .xyzw so we don't need a shuffle
|
||||
static constexpr uint noSwizzle = 0x1B;
|
||||
|
||||
using f24 = Floats::f24;
|
||||
using vec4f = OpenGL::Vector<f24, 4>;
|
||||
|
||||
// An array of labels (incl pointers) to each compiled (to x64) PICA instruction
|
||||
std::array<Xbyak::Label, PICAShader::maxInstructionCount> instructionLabels;
|
||||
// A vector of PCs that can potentially return based on the state of the PICA callstack.
|
||||
// Filled before compiling a shader by scanning the code for call instructions
|
||||
std::vector<u32> returnPCs;
|
||||
|
||||
u32 recompilerPC; // PC the recompiler is currently recompiling @
|
||||
u32 recompilerPC = 0; // PC the recompiler is currently recompiling @
|
||||
bool haveSSE4_1 = false; // Shows if the CPU supports SSE4.1
|
||||
|
||||
// Compile all instructions from [current recompiler PC, end)
|
||||
void compileUntil(const PICAShader& shaderUnit, u32 endPC);
|
||||
|
@ -35,7 +43,12 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
|
|||
void scanForCalls(const PICAShader& shaderUnit);
|
||||
|
||||
// Load register with number "srcReg" indexed by index "idx" into the xmm register "reg"
|
||||
void loadRegister(Xmm dest, const PICAShader& shader, u32 srcReg, u32 idx);
|
||||
template <int sourceIndex>
|
||||
void loadRegister(Xmm dest, const PICAShader& shader, u32 src, u32 idx, u32 operandDescriptor);
|
||||
void storeRegister(Xmm source, const PICAShader& shader, u32 dest, u32 operandDescriptor);
|
||||
|
||||
const vec4f& getSourceRef(const PICAShader& shader, u32 src);
|
||||
const vec4f& getDestRef(const PICAShader& shader, u32 dest);
|
||||
|
||||
// Instruction recompilation functions
|
||||
void recMOV(const PICAShader& shader, u32 instruction);
|
||||
|
@ -44,15 +57,22 @@ public:
|
|||
using InstructionCallback = const void(*)(PICAShader& shaderUnit); // Callback type used for instructions
|
||||
// Callback type used for the JIT prologue. This is what the caller will call
|
||||
using PrologueCallback = const void(*)(PICAShader& shaderUnit, InstructionCallback cb);
|
||||
PrologueCallback prologueCb;
|
||||
PrologueCallback prologueCb = nullptr;
|
||||
|
||||
// Initialize our emitter with "allocSize" bytes of RWX memory
|
||||
ShaderEmitter() : Xbyak::CodeGenerator(allocSize) {
	// Query the host CPU once at construction so the recompiler can choose between
	// SSE4.1 instructions and plain SSE fallbacks (see haveSSE4_1) while emitting code.
	const auto cpu = Xbyak::util::Cpu();
	haveSSE4_1 = cpu.has(Xbyak::util::Cpu::tSSE41);
}
|
||||
|
||||
void compile(const PICAShader& shaderUnit);
|
||||
|
||||
// PC must be a valid entrypoint here. Bounds checking is cheap in this case, so we use std::array<>::at() to assert that pc is in range
|
||||
// Returns the host-code address of the compiled PICA instruction at "pc" as a callable pointer.
// instructionLabels.at() throws std::out_of_range if pc is not a valid index into the label array.
InstructionCallback getInstructionCallback(u32 pc) {
	// Cast away the constness because casting to a function pointer is hard otherwise. Legal as long as we don't write to *ptr
	uint8_t* ptr = const_cast<uint8_t*>(instructionLabels.at(pc).getAddress());
	return reinterpret_cast<InstructionCallback>(ptr);
}
|
||||
|
||||
PrologueCallback getPrologueCallback() {
|
||||
|
|
|
@ -42,6 +42,7 @@ namespace ShaderOpcodes {
|
|||
};
|
||||
}
|
||||
|
||||
// Note: All PICA f24 vec4 registers must have the alignas(16) specifier to make them easier to access in SSE/NEON code in the JIT
|
||||
class PICAShader {
|
||||
using f24 = Floats::f24;
|
||||
using vec4f = OpenGL::Vector<f24, 4>;
|
||||
|
@ -74,7 +75,7 @@ class PICAShader {
|
|||
|
||||
protected:
|
||||
std::array<u32, 128> operandDescriptors;
|
||||
std::array<vec4f, 16> tempRegisters; // General purpose registers the shader can use for temp values
|
||||
alignas(16) std::array<vec4f, 16> tempRegisters; // General purpose registers the shader can use for temp values
|
||||
OpenGL::Vector<s32, 2> addrRegister; // Address register
|
||||
bool cmpRegister[2]; // Comparison registers where the result of CMP is stored in
|
||||
u32 loopCounter;
|
||||
|
@ -104,10 +105,10 @@ protected:
|
|||
friend class ShaderJIT;
|
||||
friend class ShaderEmitter;
|
||||
|
||||
private:
|
||||
vec4f getSource(u32 source);
|
||||
vec4f& getDest(u32 dest);
|
||||
|
||||
private:
|
||||
// Interpreter functions for the various shader functions
|
||||
void add(u32 instruction);
|
||||
void call(u32 instruction);
|
||||
|
@ -193,11 +194,11 @@ public:
|
|||
u32 entrypoint = 0; // Initial shader PC
|
||||
u32 boolUniform;
|
||||
std::array<OpenGL::Vector<u8, 4>, 4> intUniforms;
|
||||
std::array<vec4f, 96> floatUniforms;
|
||||
alignas(16) std::array<vec4f, 96> floatUniforms;
|
||||
|
||||
std::array<vec4f, 16> fixedAttributes; // Fixed vertex attributes
|
||||
std::array<vec4f, 16> inputs; // Attributes passed to the shader
|
||||
std::array<vec4f, 16> outputs;
|
||||
alignas(16) std::array<vec4f, 16> fixedAttributes; // Fixed vertex attributes
|
||||
alignas(16) std::array<vec4f, 16> inputs; // Attributes passed to the shader
|
||||
alignas(16) std::array<vec4f, 16> outputs;
|
||||
|
||||
PICAShader(ShaderType type) : type(type) {}
|
||||
|
||||
|
|
|
@ -73,27 +73,117 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
|
|||
switch (opcode) {
|
||||
case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break;
|
||||
default:
|
||||
Helpers::panic("ShaderJIT: Unimplemented PICA opcode %X", opcode);
|
||||
Helpers::panic("Shader JIT: Unimplemented PICA opcode %X", opcode);
|
||||
}
|
||||
}
|
||||
|
||||
void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 srcReg, u32 index) {
|
||||
// Maps a PICA source register number to the vec4 inside "shader" that backs it:
//   0x00-0x0F -> shader.inputs
//   0x10-0x1F -> shader.tempRegisters
//   0x20-0x7F -> shader.floatUniforms
// Any other value logs a warning and yields a zero-filled placeholder register.
// NOTE(review): the placeholder does not live inside "shader", so callers that turn the
// returned reference into an offset from &shader get a meaningless offset for it - confirm
// this path can't be reached in practice.
const ShaderEmitter::vec4f& ShaderEmitter::getSourceRef(const PICAShader& shader, u32 src) {
	alignas(16) static vec4f unmappedRegister = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() });

	if (src < 0x10) {
		return shader.inputs[src];
	}

	if (src < 0x20) {
		return shader.tempRegisters[src - 0x10];
	}

	if (src <= 0x7f) {
		return shader.floatUniforms[src - 0x20];
	}

	Helpers::warn("[Shader JIT] Unimplemented source value: %X\n", src);
	return unmappedRegister;
}
|
||||
|
||||
// Maps a PICA destination register number to the vec4 inside "shader" that backs it:
//   0x00-0x0F -> shader.outputs
//   0x10-0x1F -> shader.tempRegisters
// Anything else is reported through Helpers::panic.
// NOTE(review): there is no return after the panic, so this relies on Helpers::panic not returning - confirm.
const ShaderEmitter::vec4f& ShaderEmitter::getDestRef(const PICAShader& shader, u32 dest) {
	// The upper bits of the register number select the register bank
	switch (dest >> 4) {
		case 0: return shader.outputs[dest];
		case 1: return shader.tempRegisters[dest - 0x10];
		default: break;
	}

	Helpers::panic("[Shader JIT] Unimplemented dest: %X", dest);
}
|
||||
|
||||
// See shader.hpp header for docs on how the swizzle and negate works
|
||||
template <int sourceIndex>
|
||||
void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 src, u32 index, u32 operandDescriptor) {
|
||||
u32 compSwizzle; // Component swizzle pattern for the register
|
||||
bool negate; // If true, negate all lanes of the register
|
||||
|
||||
if constexpr (sourceIndex == 1) { // SRC1
|
||||
negate = ((operandDescriptor >> 4) & 1) != 0;
|
||||
compSwizzle = (operandDescriptor >> 5) & 0xff;
|
||||
}
|
||||
else if constexpr (sourceIndex == 2) { // SRC2
|
||||
negate = ((operandDescriptor >> 13) & 1) != 0;
|
||||
compSwizzle = (operandDescriptor >> 14) & 0xff;
|
||||
}
|
||||
else if constexpr (sourceIndex == 3) { // SRC3
|
||||
negate = ((operandDescriptor >> 22) & 1) != 0;
|
||||
compSwizzle = (operandDescriptor >> 23) & 0xff;
|
||||
}
|
||||
|
||||
// PICA has the swizzle descriptor inverted in comparison to x86. For the PICA, the descriptor is (lowest to highest bits) wzyx while it's xyzw for x86
|
||||
u32 convertedSwizzle = ((compSwizzle >> 6) & 0b11) | (((compSwizzle >> 4) & 0b11) << 2) | (((compSwizzle >> 2) & 0b11) << 4) | ((compSwizzle & 0b11) << 6);
|
||||
|
||||
switch (index) {
|
||||
case 0: [[likely]] { // Keep src as is, no need to offset it
|
||||
const vec4f& srcRef = getSourceRef(shader, src);
|
||||
const uintptr_t offset = uintptr_t(&srcRef) - uintptr_t(&shader); // Calculate offset of register from start of the state struct
|
||||
|
||||
if (compSwizzle == noSwizzle) // Avoid emitting swizzle if not necessary
|
||||
movaps(dest, xword[statePointer + offset]);
|
||||
else // Swizzle is not trivial so we need to emit a shuffle instruction
|
||||
pshufd(dest, xword[statePointer + offset], convertedSwizzle);
|
||||
return;
|
||||
}
|
||||
|
||||
default:
|
||||
Helpers::panic("[ShaderJIT]: Unimplemented source index type");
|
||||
}
|
||||
|
||||
if (negate) {
|
||||
Helpers::panic("[ShaderJIT] Unimplemented register negation");
|
||||
}
|
||||
|
||||
Helpers::panic("Reached unreachable path in PICAShader::getIndexedSource");
|
||||
}
|
||||
|
||||
// Emits code that writes the xmm register "source" to PICA destination register "dest", honouring the
// 4-bit write mask in the low bits of "operandDescriptor" (bit 3 = x ... bit 0 = w).
//
// source            - host register holding the value to store
// shader            - shader state; only used at compile time to compute the register's offset from statePointer
// dest              - PICA destination register number (see getDestRef for the mapping)
// operandDescriptor - operand descriptor word; only the write mask is read here
//
// "source" may be scratch1 or scratch2. The partial-write paths use the scratch registers as
// temporaries, and previously loaded the destination into scratch1 before reading "source". That destroyed
// the value whenever the caller passed scratch1 (as recMOV does), so a masked write stored back the old destination.
// When "source" aliases a scratch register its contents are not preserved by the SSE fallback path.
void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest, u32 operandDescriptor) {
	const vec4f& destRef = getDestRef(shader, dest);
	const uintptr_t offset = uintptr_t(&destRef) - uintptr_t(&shader); // Calculate offset of register from start of the state struct

	// Mask of which lanes to write
	const u32 writeMask = operandDescriptor & 0xf;
	if (writeMask == 0xf) { // No lanes are masked, just movaps
		movaps(xword[statePointer + offset], source);
		return;
	}

	const bool sourceIsScratch1 = source.getIdx() == scratch1.getIdx();
	const bool sourceIsScratch2 = source.getIdx() == scratch2.getIdx();

	if (haveSSE4_1) {
		// Bit reverse the write mask because that is what blendps expects
		const u32 adjustedMask = ((writeMask >> 3) & 0b1) | ((writeMask >> 1) & 0b10) | ((writeMask << 1) & 0b100) | ((writeMask << 3) & 0b1000);
		// Pick a temporary that does not overlap with the source register
		const Xmm temp = sourceIsScratch1 ? scratch2 : scratch1;

		movaps(temp, xword[statePointer + offset]); // Read current value of dest
		blendps(temp, source, adjustedMask);        // Blend with source
		movaps(xword[statePointer + offset], temp); // Write back
	} else if (!sourceIsScratch1 && !sourceIsScratch2) {
		// Blend algo referenced from Citra
		const u8 selector = (((writeMask & 0b1000) ? 1 : 0) << 0) |
			(((writeMask & 0b0100) ? 3 : 2) << 2) |
			(((writeMask & 0b0010) ? 0 : 1) << 4) |
			(((writeMask & 0b0001) ? 2 : 3) << 6);

		movaps(scratch1, xword[statePointer + offset]);
		movaps(scratch2, source);
		unpckhps(scratch2, scratch1);         // scratch2 = { src.z, dst.z, src.w, dst.w }
		unpcklps(scratch1, source);           // scratch1 = { dst.x, src.x, dst.y, src.y }
		shufps(scratch1, scratch2, selector); // "merge-shuffle" dest and source using selector
		movaps(xword[statePointer + offset], scratch1); // Write back
	} else {
		// Source lives in one of the scratch registers: interleave straight from memory and use the
		// other scratch register as the only temporary. The low lanes end up in { src, dst } order
		// instead of { dst, src }, so the selector for x/y is flipped relative to the path above.
		const Xmm other = sourceIsScratch1 ? scratch2 : scratch1;
		const u8 selector = (((writeMask & 0b1000) ? 0 : 1) << 0) |
			(((writeMask & 0b0100) ? 2 : 3) << 2) |
			(((writeMask & 0b0010) ? 0 : 1) << 4) |
			(((writeMask & 0b0001) ? 2 : 3) << 6);

		movaps(other, source);
		unpckhps(other, xword[statePointer + offset]);  // other  = { src.z, dst.z, src.w, dst.w }
		unpcklps(source, xword[statePointer + offset]); // source = { src.x, dst.x, src.y, dst.y }
		shufps(source, other, selector);                // "merge-shuffle" source and dest using selector
		movaps(xword[statePointer + offset], source);   // Write back
	}
}
|
||||
|
||||
void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) {
|
||||
/*
|
||||
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
|
||||
u32 src = (instruction >> 12) & 0x7f;
|
||||
const u32 idx = (instruction >> 19) & 3;
|
||||
const u32 dest = (instruction >> 21) & 0x1f;
|
||||
|
||||
src = getIndexedSource(src, idx);
|
||||
vec4f srcVector = getSourceSwizzled<1>(src, operandDescriptor);
|
||||
vec4f& destVector = getDest(dest);
|
||||
|
||||
u32 componentMask = operandDescriptor & 0xf;
|
||||
*/
|
||||
loadRegister<1>(scratch1, shader, src, idx, operandDescriptor); // Load source 1 into scratch1
|
||||
storeRegister(scratch1, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
#endif
|
Loading…
Add table
Reference in a new issue