[Shader JIT] Add prologue & some more compilation stuffs

2025-07-16 04:07:08 +12:00 · 2023-06-08 22:22:28 +03:00 · 2023-06-08 22:22:28 +03:00 · 77cba3110d
commit 77cba3110d
parent 415e276ef9
5 changed files with 116 additions and 7 deletions
--- a/include/PICA/dynapica/shader_rec.hpp
+++ b/include/PICA/dynapica/shader_rec.hpp
@ -15,10 +15,10 @@ class ShaderJIT {
 #ifdef PANDA3DS_SHADER_JIT_SUPPORTED
 	using Hash = PICAShader::Hash;
 	using ShaderCache = std::unordered_map<Hash, std::unique_ptr<ShaderEmitter>>;
-	ShaderEmitter::Callback activeShaderCallback;
+	ShaderEmitter::PrologueCallback prologueCallback;
+	ShaderEmitter::InstructionCallback entrypointCallback;

 	ShaderCache cache;
-	void compileShader(PICAShader& shaderUnit);
 #endif

 public:
@ -26,8 +26,12 @@ public:
 	// Call this before starting to process a batch of vertices
 	// This will read the PICA config (uploaded shader and shader operand descriptors) and search if we've already compiled this shader
 	// If yes, it sets it as the active shader. If not, it compiles it, adds it to the cache, and sets it as active.
+	// The caller must make sure the entrypoint has been properly set beforehand
 	void prepare(PICAShader& shaderUnit);
 	void reset();
+	void run(PICAShader& shaderUnit) { // Runs the compiled shader selected by the last prepare() call; both callbacks are only assigned there
+		prologueCallback(shaderUnit, entrypointCallback); // Enter through the JIT prologue, handing it the shader state and the code address of the entrypoint
+	}

 	static constexpr bool isAvailable() { return true; }
 #else
@ -42,6 +46,4 @@ public:
 	void reset() {}
 	static constexpr bool isAvailable() { return false; }
 #endif
-
-	auto getCallback() { return activeShaderCallback; }
 };
--- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp
+++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp
@ -7,17 +7,51 @@
 #include "xbyak/xbyak.h"
 #include "x64_regs.hpp"

+#include <vector>
+
 class ShaderEmitter : public Xbyak::CodeGenerator { // Xbyak-based code generator that recompiles a PICA shader program to x64
 	static constexpr size_t executableMemorySize = PICAShader::maxInstructionCount * 96; // Executable memory budget per shader: 96 bytes for every PICA instruction slot
 	// Allocate 0x1000 extra bytes as safety padding, for the extremely unlikely case where the generated code overflows the budget above
 	static constexpr size_t allocSize = executableMemorySize + 0x1000;
 
+	// One label per PICA instruction slot. Each label is bound to the start of the x64 code emitted for that PC
+	std::array<Xbyak::Label, PICAShader::maxInstructionCount> instructionLabels;
+	// A vector of PCs at which a subroutine can potentially return, depending on the state of the PICA callstack.
+	// Filled by scanForCalls() before compiling a shader, by scanning the code for call instructions
+	std::vector<u32> returnPCs;
+
+	u32 recompilerPC; // PC of the PICA instruction the recompiler is currently compiling
+
+	// Compile all instructions in the range [current recompiler PC, endPC)
+	void compileUntil(const PICAShader& shaderUnit, u32 endPC);
+	// Compile the instruction at the current recompiler PC and advance the recompiler PC past it
+	void compileInstruction(const PICAShader& shaderUnit);
+
+	bool isCall(u32 instruction) { // True if the opcode of the instruction is CALL, CALLC or CALLU
+		const u32 opcode = instruction >> 26; // The opcode is the top 6 bits of the 32-bit instruction word
+		return (opcode == ShaderOpcodes::CALL) || (opcode == ShaderOpcodes::CALLC) || (opcode == ShaderOpcodes::CALLU);
+	}
+	// Scan the shader code for call instructions to fill up the returnPCs vector before starting compilation
+	void scanForCalls(const PICAShader& shader);
+
 public:
-	using Callback = void(*)(const PICAShader& shaderUnit);
+	using InstructionCallback = void(*)(PICAShader& shaderUnit); // Type of a pointer to the generated code of a single PICA instruction
+	// Type of the JIT prologue. This is what the caller invokes: it receives the shader state and the instruction callback to start executing from
+	using PrologueCallback = void(*)(PICAShader& shaderUnit, InstructionCallback cb);
+	PrologueCallback prologueCb; // Address of the emitted prologue. Only assigned by compile()
 
 	// Initialize our emitter with "allocSize" bytes of RWX memory
 	ShaderEmitter() : Xbyak::CodeGenerator(allocSize) {}
 	void compile(const PICAShader& shaderUnit); // Emits the prologue, then recompiles the whole instruction memory of shaderUnit
+
+	// pc must be a valid index into the instruction memory. The bounds check is cheap in this context, so std::array<>::at() is used to enforce it
+	InstructionCallback getInstructionCallback(u32 pc) {
+		return reinterpret_cast<InstructionCallback>(instructionLabels.at(pc).getAddress()); // Address the label for this PC was bound to
+	}
+
+	PrologueCallback getPrologueCallback() { // Returns the prologue address recorded by compile()
+		return prologueCb;
+	}
 };

 #endif // x64 recompiler check
--- a/src/core/PICA/dynapica/shader_rec.cpp
+++ b/src/core/PICA/dynapica/shader_rec.cpp
@ -17,9 +17,15 @@ void ShaderJIT::prepare(PICAShader& shaderUnit) {
 	if (it == cache.end()) { // Block has not been compiled yet
 		auto emitter = std::make_unique<ShaderEmitter>();
 		emitter->compile(shaderUnit);
+		// Get pointer to callbacks
+		entrypointCallback = emitter->getInstructionCallback(shaderUnit.entrypoint);
+		prologueCallback = emitter->getPrologueCallback();
+
 		cache.emplace_hint(it, hash, std::move(emitter));
 	} else { // Block has been compiled and found, use it
 		auto emitter = it->second.get();
+		entrypointCallback = emitter->getInstructionCallback(shaderUnit.entrypoint);
+		prologueCallback = emitter->getPrologueCallback();
 	}
 }
 #endif // PANDA3DS_SHADER_JIT_SUPPORTED
--- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
+++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
@ -4,8 +4,70 @@
 using namespace Xbyak;
 using namespace Xbyak::util;

-void ShaderEmitter::compile(const PICAShader& shaderUnit) {
+// Register that points to PICA state
+static constexpr Reg64 statePointer = rbp;

+// Recompiles the shader program loaded in shaderUnit to x64.
+// Emits the JIT prologue first (retrievable afterwards through getPrologueCallback()), then the code for every
+// instruction slot of the PICA instruction memory, binding one label per slot so any PC can be used as an entrypoint.
+void ShaderEmitter::compile(const PICAShader& shaderUnit) {
+	// Emit prologue first
+	align(16);
+	prologueCb = getCurr<PrologueCallback>();
+
+	// The prologue is invoked as PrologueCallback(shaderUnit, entrypoint), so arg1 contains the pointer to the PICA state
+	// and arg2 a pointer to the code for the entrypoint
+	push(statePointer); // Back up state pointer to stack. This also aligns rsp to 16 bytes for calls
+	// Load the state pointer from arg1 instead of baking &shaderUnit into the generated code as an immediate.
+	// Compiled shaders are cached by hash, so a baked-in address would go stale if the shader unit moved or
+	// if the same code were run against a different shader unit
+	mov(statePointer, arg1.cvt64());
+
+	// If we add integer register allocations they should be pushed here, and the rsp should be properly fixed up
+	// However most of the PICA is floating point so yeah
+
+	// Allocate shadow stack on Windows
+	if constexpr (isWindows()) {
+		sub(rsp, 32);
+	}
+	// Tail call to shader code entrypoint
+	jmp(arg2);
+	align(16);
+	// Scan the shader code for call instructions and add them to the list of possible return PCs.
+	// We need to do this because the PICA callstack works pretty weirdly
+	scanForCalls(shaderUnit);
+
+	// Compile every instruction in the shader
+	// This sounds horrible but the PICA instruction memory is tiny, and most of the time it's padded with nops that compile to nothing
+	recompilerPC = 0;
+	compileUntil(shaderUnit, PICAShader::maxInstructionCount);
+}
+
+// Rebuilds returnPCs from scratch: walks the whole instruction memory and records, for every call instruction,
+// the PC at which the called subroutine ends
+void ShaderEmitter::scanForCalls(const PICAShader& shaderUnit) {
+	returnPCs.clear();
+
+	for (u32 pc = 0; pc < PICAShader::maxInstructionCount; pc++) {
+		const u32 word = shaderUnit.loadedShader[pc];
+		if (!isCall(word)) {
+			continue;
+		}
+
+		const u32 count = word & 0xff;            // Number of instructions the call executes
+		const u32 target = (word >> 10) & 0xfff;  // Address the subroutine starts at
+		// The subroutine returns once it has run "count" instructions starting from "target"
+		returnPCs.push_back(count + target);
+	}
+}
+
+// Compiles instructions one by one, starting at the current recompiler PC, until the PC reaches "end"
+void ShaderEmitter::compileUntil(const PICAShader& shaderUnit, u32 end) {
+	// No increment clause: compileInstruction itself moves recompilerPC forward
+	for (; recompilerPC < end;) {
+		compileInstruction(shaderUnit);
+	}
+}
+
+// Compiles the instruction at the current recompiler PC and moves the PC to the next instruction
+void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
+	const u32 pc = recompilerPC;
+	// Bind this instruction's label to the current position in the code buffer
+	L(instructionLabels[pc]);
+
+	// Fetch the instruction, then step the PC past it
+	const u32 instruction = shaderUnit.loadedShader[pc];
+	recompilerPC = pc + 1;
+
+	// Dispatch on the opcode
+	const u32 opcode = instruction >> 26;
+	switch (opcode) {
+		default:
+			Helpers::panic("ShaderJIT: Unimplemented PICA opcode %X", opcode);
+	}
 }

 #endif
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@ -203,7 +203,12 @@ void GPU::drawArrays() {
 			std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
 		}
 		
-		shaderUnit.vs.run();
+		if constexpr (useShaderJIT) {
+			shaderJIT.run(shaderUnit.vs);
+		} else {
+			shaderUnit.vs.run();
+		}
+
 		std::memcpy(&vertices[i].position, &shaderUnit.vs.outputs[0], sizeof(vec4f));
 		std::memcpy(&vertices[i].colour, &shaderUnit.vs.outputs[1], sizeof(vec4f));
 		std::memcpy(&vertices[i].UVs, &shaderUnit.vs.outputs[2], 2 * sizeof(f24));