[Shader JIT] Get first JIT triangle

2025-07-11 09:38:41 +12:00 · 2023-06-09 02:28:59 +03:00 · 2023-06-09 02:28:59 +03:00 · fd411245fa
commit fd411245fa
parent 9bb1f31fc9
3 changed files with 82 additions and 18 deletions
--- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp
+++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp
@ -51,6 +51,9 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 	const vec4f& getDestRef(const PICAShader& shader, u32 dest);

 	// Instruction recompilation functions
+	void recADD(const PICAShader& shader, u32 instruction);
+	void recDP4(const PICAShader& shader, u32 instruction);
+	void recEND(const PICAShader& shader, u32 instruction);
 	void recMOV(const PICAShader& shader, u32 instruction);

 public:
@ -64,6 +67,9 @@ public:
 		const auto cpu = Xbyak::util::Cpu();

 		haveSSE4_1 = cpu.has(Xbyak::util::Cpu::tSSE41);
+		if (!cpu.has(Xbyak::util::Cpu::tSSE3)) {
+			Helpers::panic("This CPU does not support SSE3. Please use the shader interpreter instead");
+		}
 	}
 	
 	void compile(const PICAShader& shaderUnit);
--- a/include/PICA/shader.hpp
+++ b/include/PICA/shader.hpp
@ -73,6 +73,18 @@ class PICAShader {

 	std::array<u32, 4> floatUniformBuffer; // Buffer for temporarily caching float uniform data

+public:
+	// These are placed close to the temp registers and co because it helps the JIT generate better code
+	u32 entrypoint = 0; // Initial shader PC
+	u32 boolUniform;
+	std::array<OpenGL::Vector<u8, 4>, 4> intUniforms;
+	alignas(16) std::array<vec4f, 96> floatUniforms;
+
+	alignas(16) std::array<vec4f, 16> fixedAttributes; // Fixed vertex attributes
+	alignas(16) std::array<vec4f, 16> inputs; // Attributes passed to the shader
+	alignas(16) std::array<vec4f, 16> outputs;
+	alignas(16) vec4f dummy = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() }); // Dummy register used by the JIT
+
 protected:
 	std::array<u32, 128> operandDescriptors;
 	alignas(16) std::array<vec4f, 16> tempRegisters; // General purpose registers the shader can use for temp values
@ -191,15 +203,6 @@ public:
 	std::array<u32, maxInstructionCount> loadedShader; // Currently loaded & active shader
 	std::array<u32, maxInstructionCount> bufferedShader; // Shader to be transferred when the SH_CODETRANSFER_END reg gets written to

-	u32 entrypoint = 0; // Initial shader PC
-	u32 boolUniform;
-	std::array<OpenGL::Vector<u8, 4>, 4> intUniforms;
-	alignas(16) std::array<vec4f, 96> floatUniforms;
-
-	alignas(16) std::array<vec4f, 16> fixedAttributes; // Fixed vertex attributes
-	alignas(16) std::array<vec4f, 16> inputs; // Attributes passed to the shader
-	alignas(16) std::array<vec4f, 16> outputs;
-
 	PICAShader(ShaderType type) : type(type) {}

 	// These functions are in the header to be inlined more easily, though with LTO I hope I'll be able to move them
--- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
+++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
@ -2,6 +2,7 @@
 #include "PICA/dynapica/shader_rec_emitter_x64.hpp"

 #include <algorithm>
+#include <bit>
 #include <cstddef>

 using namespace Xbyak;
@ -11,7 +12,9 @@ using namespace Xbyak::util;
 static constexpr Reg64 statePointer = rbp;
 static constexpr Xmm scratch1 = xmm0;
 static constexpr Xmm scratch2 = xmm1;
-static constexpr Xmm scratch3 = xmm2;
+static constexpr Xmm src1_xmm = xmm2;
+static constexpr Xmm src2_xmm = xmm3;
+static constexpr Xmm src3_xmm = xmm4;

 void ShaderEmitter::compile(const PICAShader& shaderUnit) {
 	// Emit prologue first
@ -71,15 +74,17 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
 	const u32 opcode = instruction >> 26;

 	switch (opcode) {
+		case ShaderOpcodes::ADD: recADD(shaderUnit, instruction); break;
+		case ShaderOpcodes::DP4: recDP4(shaderUnit, instruction); break;
+		case ShaderOpcodes::END: recEND(shaderUnit, instruction); break;
 		case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break;
+		case ShaderOpcodes::NOP: break;
 		default:
 			Helpers::panic("Shader JIT: Unimplemented PICA opcode %X", opcode);
 	}
 }

 const ShaderEmitter::vec4f& ShaderEmitter::getSourceRef(const PICAShader& shader, u32 src) {
-	alignas(16) static vec4f dummy = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() });
-
 	if (src < 0x10)
 		return shader.inputs[src];
 	else if (src < 0x20)
@ -88,7 +93,7 @@ const ShaderEmitter::vec4f& ShaderEmitter::getSourceRef(const PICAShader& shader
 		return shader.floatUniforms[src - 0x20];
 	else {
 		Helpers::warn("[Shader JIT] Unimplemented source value: %X\n", src);
-		return dummy;
+		return shader.dummy;
 	}
 }

@ -132,7 +137,7 @@ void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 src, u3
 				movaps(dest, xword[statePointer + offset]);
 			else // Swizzle is not trivial so we need to emit a shuffle instruction
 				pshufd(dest, xword[statePointer + offset], convertedSwizzle);
-			return;
+			break;
 		}
 		
 		default:
@ -142,8 +147,6 @@ void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 src, u3
 	if (negate) {
 		Helpers::panic("[ShaderJIT] Unimplemented register negation");
 	}
-
-	Helpers::panic("Reached unreachable path in PICAShader::getIndexedSource");
 }

 void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest, u32 operandDescriptor) {
@ -151,9 +154,22 @@ void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest
 	const uintptr_t offset = uintptr_t(&destRef) - uintptr_t(&shader); // Calculate offset of register from start of the state struct

 	// Mask of which lanes to write
+	// TODO: If only 1 lane is being written to, use movss
 	u32 writeMask = operandDescriptor & 0xf;
 	if (writeMask == 0xf) { // No lanes are masked, just movaps
 		movaps(xword[statePointer + offset], source);
+	} else if (std::popcount(writeMask) == 1) { // Only 1 register needs to be written back. This can be done with a simple shift right + movss
+		int bit = std::countr_zero(writeMask); // Get which PICA register needs to be written to (0 = w, 1 = z, etc)
+		size_t index = 3 - bit;
+		const uintptr_t lane_offset = offset + index * sizeof(float);
+
+		if (index == 0) { // Bottom lane, no need to shift
+			movss(dword[statePointer + lane_offset], source);
+		} else { // Shift right by 32 * index, then write bottom lane
+			movaps(scratch1, source);
+			psrldq(scratch1, index * sizeof(float));
+			movss(dword[statePointer + lane_offset], scratch1);
+		}
 	} else if (haveSSE4_1) {
 		// Bit reverse the write mask because that is what blendps expects
 		u32 adjustedMask = ((writeMask >> 3) & 0b1) | ((writeMask >> 1) & 0b10) | ((writeMask << 1) & 0b100) | ((writeMask << 3) & 0b1000);
@ -176,14 +192,53 @@ void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest
 	}
 }

+void ShaderEmitter::recEND(const PICAShader& shader, u32 instruction) {
+	// Emit the epilogue for the END instruction: undo anything the prologue did and return
+	// Deallocate the 32-byte shadow stack on Windows
+	if constexpr (isWindows()) {
+		add(rsp, 32);
+	}
+
+	// Restore the saved state pointer register, then return from the emitted code
+	pop(statePointer);
+	ret();
+}
+
 void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) {
 	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
 	u32 src = (instruction >> 12) & 0x7f;
 	const u32 idx = (instruction >> 19) & 3;
 	const u32 dest = (instruction >> 21) & 0x1f;

-	loadRegister<1>(scratch1, shader, src, idx, operandDescriptor); // Load source 1 into scratch1
-	storeRegister(scratch1, shader, dest, operandDescriptor);
+	loadRegister<1>(src1_xmm, shader, src, idx, operandDescriptor); // Load source 1 into src1_xmm
+	storeRegister(src1_xmm, shader, dest, operandDescriptor);
+}
+
+void ShaderEmitter::recADD(const PICAShader& shader, u32 instruction) {
+	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
+	u32 src1 = (instruction >> 12) & 0x7f;
+	const u32 src2 = (instruction >> 7) & 0x1f; // src2 coming first because PICA moment
+	const u32 idx = (instruction >> 19) & 3;
+	const u32 dest = (instruction >> 21) & 0x1f;
+
+	loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
+	loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
+	addps(src1_xmm, src2_xmm); // Lane-wise addition of the 2 registers, result left in src1_xmm
+	storeRegister(src1_xmm, shader, dest, operandDescriptor);
+}
+
+// Emits code for the PICA DP4 instruction: a 4-component dot product of source 1 and source 2,
+// with the result present in every lane of src1_xmm before it is written to the destination.
+void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
+	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
+	u32 src1 = (instruction >> 12) & 0x7f;
+	const u32 src2 = (instruction >> 7) & 0x1f; // src2 coming first because PICA moment
+	const u32 idx = (instruction >> 19) & 3;
+	const u32 dest = (instruction >> 21) & 0x1f;
+
+	// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
+	loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
+	loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
+
+	if (haveSSE4_1) {
+		// dpps is an SSE4.1 instruction, so only emit it when the host CPU supports it
+		dpps(src1_xmm, src2_xmm, 0b11111111); // Dot product between the 2 registers, store the result in all lanes of src1_xmm similarly to PICA
+	} else {
+		// SSE3 fallback: multiply lane-wise, then 2 horizontal adds leave the full sum in every lane of src1_xmm
+		mulps(src1_xmm, src2_xmm);
+		haddps(src1_xmm, src1_xmm);
+		haddps(src1_xmm, src1_xmm);
+	}
+
+	storeRegister(src1_xmm, shader, dest, operandDescriptor);
 }

 #endif