mirror of
https://github.com/wheremyfoodat/Panda3DS.git
synced 2025-04-24 22:35:51 +12:00
[Shader JIT] Add prologue & some more compilation stuffs
This commit is contained in:
parent
415e276ef9
commit
77cba3110d
5 changed files with 116 additions and 7 deletions
|
@ -15,10 +15,10 @@ class ShaderJIT {
|
||||||
#ifdef PANDA3DS_SHADER_JIT_SUPPORTED
|
#ifdef PANDA3DS_SHADER_JIT_SUPPORTED
|
||||||
using Hash = PICAShader::Hash;
|
using Hash = PICAShader::Hash;
|
||||||
using ShaderCache = std::unordered_map<Hash, std::unique_ptr<ShaderEmitter>>;
|
using ShaderCache = std::unordered_map<Hash, std::unique_ptr<ShaderEmitter>>;
|
||||||
ShaderEmitter::Callback activeShaderCallback;
|
ShaderEmitter::PrologueCallback prologueCallback;
|
||||||
|
ShaderEmitter::InstructionCallback entrypointCallback;
|
||||||
|
|
||||||
ShaderCache cache;
|
ShaderCache cache;
|
||||||
void compileShader(PICAShader& shaderUnit);
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
@ -26,8 +26,12 @@ public:
|
||||||
// Call this before starting to process a batch of vertices
|
// Call this before starting to process a batch of vertices
|
||||||
// This will read the PICA config (uploaded shader and shader operand descriptors) and search if we've already compiled this shader
|
// This will read the PICA config (uploaded shader and shader operand descriptors) and search if we've already compiled this shader
|
||||||
// If yes, it sets it as the active shader. If not, it compiles it, adds it to the cache, and sets it as active.
|
// If yes, it sets it as the active shader. If not, it compiles it, adds it to the cache, and sets it as active.
|
||||||
|
// The caller must make sure the entrypoint has been properly set beforehand
|
||||||
void prepare(PICAShader& shaderUnit);
|
void prepare(PICAShader& shaderUnit);
|
||||||
void reset();
|
void reset();
|
||||||
|
// Runs the currently prepared shader on "shaderUnit" by entering the JIT prologue, which is handed the compiled entrypoint to jump to
// prepare() is what sets both callbacks, so it must have been called beforehand
void run(PICAShader& shaderUnit) {
	(*prologueCallback)(shaderUnit, entrypointCallback);
}
|
||||||
|
|
||||||
static constexpr bool isAvailable() { return true; }
|
static constexpr bool isAvailable() { return true; }
|
||||||
#else
|
#else
|
||||||
|
@ -42,6 +46,4 @@ public:
|
||||||
void reset() {}
|
void reset() {}
|
||||||
static constexpr bool isAvailable() { return false; }
|
static constexpr bool isAvailable() { return false; }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
auto getCallback() { return activeShaderCallback; }
|
|
||||||
};
|
};
|
|
@ -7,17 +7,51 @@
|
||||||
#include "xbyak/xbyak.h"
|
#include "xbyak/xbyak.h"
|
||||||
#include "x64_regs.hpp"
|
#include "x64_regs.hpp"
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
class ShaderEmitter : public Xbyak::CodeGenerator {
|
class ShaderEmitter : public Xbyak::CodeGenerator {
|
||||||
static constexpr size_t executableMemorySize = PICAShader::maxInstructionCount * 96; // How much executable memory to alloc for each shader
|
static constexpr size_t executableMemorySize = PICAShader::maxInstructionCount * 96; // How much executable memory to alloc for each shader
|
||||||
// Allocate some extra space as padding for security purposes in the extremely unlikely occasion we manage to overflow the above size
|
// Allocate some extra space as padding for security purposes in the extremely unlikely occasion we manage to overflow the above size
|
||||||
static constexpr size_t allocSize = executableMemorySize + 0x1000;
|
static constexpr size_t allocSize = executableMemorySize + 0x1000;
|
||||||
|
|
||||||
|
// An array of labels (incl pointers) to each compiled (to x64) PICA instruction
|
||||||
|
std::array<Xbyak::Label, PICAShader::maxInstructionCount> instructionLabels;
|
||||||
|
// A vector of PCs that can potentially return based on the state of the PICA callstack.
|
||||||
|
// Filled before compiling a shader by scanning the code for call instructions
|
||||||
|
std::vector<u32> returnPCs;
|
||||||
|
|
||||||
|
u32 recompilerPC; // PC the recompiler is currently recompiling @
|
||||||
|
|
||||||
|
// Compile all instructions from [current recompiler PC, end)
|
||||||
|
void compileUntil(const PICAShader& shaderUnit, u32 endPC);
|
||||||
|
// Compile instruction "instr"
|
||||||
|
void compileInstruction(const PICAShader& shaderUnit);
|
||||||
|
|
||||||
|
// Returns true if "instruction" is one of the PICA call instructions (CALL, CALLC or CALLU)
// The opcode is taken from the bits above bit 25 of the instruction word
bool isCall(u32 instruction) {
	switch (instruction >> 26) {
		case ShaderOpcodes::CALL:
		case ShaderOpcodes::CALLC:
		case ShaderOpcodes::CALLU: return true;

		default: return false;
	}
}
|
||||||
|
// Scan the shader code for call instructions to fill up the returnPCs vector before starting compilation
|
||||||
|
void scanForCalls(const PICAShader& shader);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
using Callback = void(*)(const PICAShader& shaderUnit);
|
using InstructionCallback = void(*)(PICAShader& shaderUnit); // Callback type used for instructions
|
||||||
|
// Callback type used for the JIT prologue. This is what the caller will call
|
||||||
|
using PrologueCallback = void(*)(PICAShader& shaderUnit, InstructionCallback cb);
|
||||||
|
PrologueCallback prologueCb;
|
||||||
|
|
||||||
// Initialize our emitter with "allocSize" bytes of RWX memory
|
// Initialize our emitter with "allocSize" bytes of RWX memory
|
||||||
ShaderEmitter() : Xbyak::CodeGenerator(allocSize) {}
|
ShaderEmitter() : Xbyak::CodeGenerator(allocSize) {}
|
||||||
void compile(const PICAShader& shaderUnit);
|
void compile(const PICAShader& shaderUnit);
|
||||||
|
|
||||||
|
// PC must be a valid entrypoint here. It doesn't have that much overhead in this case, so we use std::array<>::at() to assert it does
|
||||||
|
InstructionCallback getInstructionCallback(u32 pc) {
|
||||||
|
return reinterpret_cast<InstructionCallback>(instructionLabels.at(pc).getAddress());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the pointer to the emitted JIT prologue. compile() is what sets it, so it is only meaningful after a shader has been compiled
PrologueCallback getPrologueCallback() {
	return this->prologueCb;
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // x64 recompiler check
|
#endif // x64 recompiler check
|
|
@ -17,9 +17,15 @@ void ShaderJIT::prepare(PICAShader& shaderUnit) {
|
||||||
if (it == cache.end()) { // Block has not been compiled yet
|
if (it == cache.end()) { // Block has not been compiled yet
|
||||||
auto emitter = std::make_unique<ShaderEmitter>();
|
auto emitter = std::make_unique<ShaderEmitter>();
|
||||||
emitter->compile(shaderUnit);
|
emitter->compile(shaderUnit);
|
||||||
|
// Get pointer to callbacks
|
||||||
|
entrypointCallback = emitter->getInstructionCallback(shaderUnit.entrypoint);
|
||||||
|
prologueCallback = emitter->getPrologueCallback();
|
||||||
|
|
||||||
cache.emplace_hint(it, hash, std::move(emitter));
|
cache.emplace_hint(it, hash, std::move(emitter));
|
||||||
} else { // Block has been compiled and found, use it
|
} else { // Block has been compiled and found, use it
|
||||||
auto emitter = it->second.get();
|
auto emitter = it->second.get();
|
||||||
|
entrypointCallback = emitter->getInstructionCallback(shaderUnit.entrypoint);
|
||||||
|
prologueCallback = emitter->getPrologueCallback();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif // PANDA3DS_SHADER_JIT_SUPPORTED
|
#endif // PANDA3DS_SHADER_JIT_SUPPORTED
|
|
@ -4,8 +4,70 @@
|
||||||
using namespace Xbyak;
|
using namespace Xbyak;
|
||||||
using namespace Xbyak::util;
|
using namespace Xbyak::util;
|
||||||
|
|
||||||
void ShaderEmitter::compile(const PICAShader& shaderUnit) {
|
// Register that points to PICA state
|
||||||
|
static constexpr Reg64 statePointer = rbp;
|
||||||
|
|
||||||
|
void ShaderEmitter::compile(const PICAShader& shaderUnit) {
|
||||||
|
// Emit prologue first
|
||||||
|
align(16);
|
||||||
|
prologueCb = getCurr<PrologueCallback>();
|
||||||
|
|
||||||
|
// We assume arg1 contains the pointer to the PICA state and arg2 a pointer to the code for the entrypoint
|
||||||
|
push(statePointer); // Back up state pointer to stack. This also aligns rsp to 16 bytes for calls
|
||||||
|
mov(statePointer, (uintptr_t)&shaderUnit); // Set state pointer to the proper pointer
|
||||||
|
|
||||||
|
// If we add integer register allocations they should be pushed here, and the rsp should be properly fixed up
|
||||||
|
// However most of the PICA is floating point so yeah
|
||||||
|
|
||||||
|
// Allocate shadow stack on Windows
|
||||||
|
if constexpr (isWindows()) {
|
||||||
|
sub(rsp, 32);
|
||||||
|
}
|
||||||
|
// Tail call to shader code entrypoint
|
||||||
|
jmp(arg2);
|
||||||
|
align(16);
|
||||||
|
// Scan the shader code for call instructions and add them to the list of possible return PCs. We need to do this because the PICA callstack works
|
||||||
|
// Pretty weirdly
|
||||||
|
scanForCalls(shaderUnit);
|
||||||
|
|
||||||
|
// Compile every instruction in the shader
|
||||||
|
// This sounds horrible but the PICA instruction memory is tiny, and most of the time it's padded wtih nops that compile to nothing
|
||||||
|
recompilerPC = 0;
|
||||||
|
compileUntil(shaderUnit, PICAShader::maxInstructionCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ShaderEmitter::scanForCalls(const PICAShader& shaderUnit) {
|
||||||
|
returnPCs.clear();
|
||||||
|
|
||||||
|
for (u32 i = 0; i < PICAShader::maxInstructionCount; i++) {
|
||||||
|
const u32 instruction = shaderUnit.loadedShader[i];
|
||||||
|
if (isCall(instruction)) {
|
||||||
|
const u32 num = instruction & 0xff; // Num of instructions to execute
|
||||||
|
const u32 dest = (instruction >> 10) & 0xfff; // Starting subroutine address
|
||||||
|
const u32 returnPC = num + dest; // Add them to get the return PC
|
||||||
|
|
||||||
|
returnPCs.push_back(returnPC);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void ShaderEmitter::compileUntil(const PICAShader& shaderUnit, u32 end) {
|
||||||
|
while (recompilerPC < end) {
|
||||||
|
compileInstruction(shaderUnit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
|
||||||
|
// Write current location to label for this instruction
|
||||||
|
L(instructionLabels[recompilerPC]);
|
||||||
|
// Fetch instruction and inc PC
|
||||||
|
const u32 instruction = shaderUnit.loadedShader[recompilerPC++];
|
||||||
|
const u32 opcode = instruction >> 26;
|
||||||
|
|
||||||
|
switch (opcode) {
|
||||||
|
default:
|
||||||
|
Helpers::panic("ShaderJIT: Unimplemented PICA opcode %X", opcode);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
|
@ -203,7 +203,12 @@ void GPU::drawArrays() {
|
||||||
std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
|
std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
|
||||||
}
|
}
|
||||||
|
|
||||||
shaderUnit.vs.run();
|
if constexpr (useShaderJIT) {
|
||||||
|
shaderJIT.run(shaderUnit.vs);
|
||||||
|
} else {
|
||||||
|
shaderUnit.vs.run();
|
||||||
|
}
|
||||||
|
|
||||||
std::memcpy(&vertices[i].position, &shaderUnit.vs.outputs[0], sizeof(vec4f));
|
std::memcpy(&vertices[i].position, &shaderUnit.vs.outputs[0], sizeof(vec4f));
|
||||||
std::memcpy(&vertices[i].colour, &shaderUnit.vs.outputs[1], sizeof(vec4f));
|
std::memcpy(&vertices[i].colour, &shaderUnit.vs.outputs[1], sizeof(vec4f));
|
||||||
std::memcpy(&vertices[i].UVs, &shaderUnit.vs.outputs[2], 2 * sizeof(f24));
|
std::memcpy(&vertices[i].UVs, &shaderUnit.vs.outputs[2], 2 * sizeof(f24));
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue