WIP arm64 shader recompiler

wheremyfoodat 2024-01-03 00:39:36 +02:00
parent 281a7eefbf
commit c0621d0760
7 changed files with 383 additions and 4 deletions


@@ -0,0 +1,239 @@
#if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_ARM64_HOST)
#include "PICA/dynapica/shader_rec_emitter_arm64.hpp"
#include <bit>
using namespace Helpers;
using namespace oaknut;
using namespace oaknut::util;
// Similar to the x64 recompiler, we use an odd internal ABI, which abuses the fact that we'll very rarely be calling C++ functions
// So to avoid pushing and popping, we'll be making use of volatile registers as much as possible
static constexpr QReg scratch1 = Q0;
static constexpr QReg scratch2 = Q1;
static constexpr QReg src1_vec = Q2;
static constexpr QReg src2_vec = Q3;
static constexpr QReg src3_vec = Q4;
static constexpr XReg statePointer = X9;
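// Note: X9 is a caller-saved temporary register under AAPCS64, so the state pointer can live there for the whole shader without being saved or restored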
void ShaderEmitter::compile(const PICAShader& shaderUnit) {
// Scan the code for call, exp2, log2, etc instructions which need some special care
// After that, emit exp2 and log2 functions if the corresponding instructions are present
scanCode(shaderUnit);
if (codeHasExp2) Helpers::panic("arm64 shader JIT: Code has exp2");
if (codeHasLog2) Helpers::panic("arm64 shader JIT: Code has log2");
align(16);
// Compile every instruction in the shader
// This sounds horrible but the PICA instruction memory is tiny, and most of the time it's padded with nops that compile to nothing
recompilerPC = 0;
loopLevel = 0;
compileUntil(shaderUnit, PICAShader::maxInstructionCount);
}
void ShaderEmitter::scanCode(const PICAShader& shaderUnit) {
returnPCs.clear();
for (u32 i = 0; i < PICAShader::maxInstructionCount; i++) {
const u32 instruction = shaderUnit.loadedShader[i];
const u32 opcode = instruction >> 26;
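// The PICA opcode lives in the top 6 bits of each 32-bit instruction word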
if (isCall(instruction)) {
const u32 num = instruction & 0xff;
const u32 dest = getBits<10, 12>(instruction);
const u32 returnPC = num + dest; // Add them to get the return PC
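// This is the first PC past the called block, i.e. where the emitted code must hand control back to the caller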
returnPCs.push_back(returnPC);
} else if (opcode == ShaderOpcodes::EX2) {
codeHasExp2 = true;
} else if (opcode == ShaderOpcodes::LG2) {
codeHasLog2 = true;
}
}
// Sort return PCs so they can be binary searched
std::sort(returnPCs.begin(), returnPCs.end());
}
void ShaderEmitter::compileUntil(const PICAShader& shaderUnit, u32 end) {
while (recompilerPC < end) {
compileInstruction(shaderUnit);
}
}
void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
// Write current location to label for this instruction
l(instructionLabels[recompilerPC]);
// See if PC is a possible return PC and emit the proper code if so
if (std::binary_search(returnPCs.begin(), returnPCs.end(), recompilerPC)) {
Helpers::panic("Unimplemented return address for call instruction");
}
// Fetch instruction and inc PC
const u32 instruction = shaderUnit.loadedShader[recompilerPC++];
const u32 opcode = instruction >> 26;
switch (opcode) {
// case ShaderOpcodes::ADD: recADD(shaderUnit, instruction); break;
// case ShaderOpcodes::CALL: recCALL(shaderUnit, instruction); break;
// case ShaderOpcodes::CALLC: recCALLC(shaderUnit, instruction); break;
// case ShaderOpcodes::CALLU: recCALLU(shaderUnit, instruction); break;
// case ShaderOpcodes::CMP1:
// case ShaderOpcodes::CMP2: recCMP(shaderUnit, instruction); break;
// case ShaderOpcodes::DP3: recDP3(shaderUnit, instruction); break;
// case ShaderOpcodes::DP4: recDP4(shaderUnit, instruction); break;
// case ShaderOpcodes::DPH:
// case ShaderOpcodes::DPHI: recDPH(shaderUnit, instruction); break;
// case ShaderOpcodes::END: recEND(shaderUnit, instruction); break;
// case ShaderOpcodes::EX2: recEX2(shaderUnit, instruction); break;
// case ShaderOpcodes::FLR: recFLR(shaderUnit, instruction); break;
// case ShaderOpcodes::IFC: recIFC(shaderUnit, instruction); break;
// case ShaderOpcodes::IFU: recIFU(shaderUnit, instruction); break;
// case ShaderOpcodes::JMPC: recJMPC(shaderUnit, instruction); break;
// case ShaderOpcodes::JMPU: recJMPU(shaderUnit, instruction); break;
// case ShaderOpcodes::LG2: recLG2(shaderUnit, instruction); break;
// case ShaderOpcodes::LOOP: recLOOP(shaderUnit, instruction); break;
case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break;
// case ShaderOpcodes::MOVA: recMOVA(shaderUnit, instruction); break;
// case ShaderOpcodes::MAX: recMAX(shaderUnit, instruction); break;
// case ShaderOpcodes::MIN: recMIN(shaderUnit, instruction); break;
// case ShaderOpcodes::MUL: recMUL(shaderUnit, instruction); break;
case ShaderOpcodes::NOP:
break;
// case ShaderOpcodes::RCP: recRCP(shaderUnit, instruction); break;
// case ShaderOpcodes::RSQ: recRSQ(shaderUnit, instruction); break;
// Unimplemented opcodes that don't seem to actually be used but exist in the binary
// EMIT/SETEMIT are used in geometry shaders, however they are sometimes found in vertex shaders?
// case ShaderOpcodes::EMIT:
// case ShaderOpcodes::SETEMIT:
// log("[ShaderJIT] Unknown PICA opcode: %02X\n", opcode);
// emitPrintLog(shaderUnit);
// break;
// case ShaderOpcodes::BREAK:
// case ShaderOpcodes::BREAKC: Helpers::warn("[Shader JIT] Unimplemented BREAK(C) instruction!"); break;
// We consider both MAD and MADI to be the same instruction and decode which one we actually have in recMAD
// case 0x30:
// case 0x31:
// case 0x32:
// case 0x33:
// case 0x34:
// case 0x35:
// case 0x36:
// case 0x37:
// case 0x38:
// case 0x39:
// case 0x3A:
// case 0x3B:
// case 0x3C:
// case 0x3D:
// case 0x3E:
// case 0x3F: recMAD(shaderUnit, instruction); break;
// case ShaderOpcodes::SLT:
// case ShaderOpcodes::SLTI: recSLT(shaderUnit, instruction); break;
// case ShaderOpcodes::SGE:
// case ShaderOpcodes::SGEI: recSGE(shaderUnit, instruction); break;
default: Helpers::panic("Shader JIT: Unimplemented PICA opcode %X", opcode);
}
}
const ShaderEmitter::vec4f& ShaderEmitter::getSourceRef(const PICAShader& shader, u32 src) {
if (src < 0x10)
return shader.inputs[src];
else if (src < 0x20)
return shader.tempRegisters[src - 0x10];
else if (src <= 0x7f)
return shader.floatUniforms[src - 0x20];
else {
Helpers::warn("[Shader JIT] Unimplemented source value: %X\n", src);
return shader.dummy;
}
}
const ShaderEmitter::vec4f& ShaderEmitter::getDestRef(const PICAShader& shader, u32 dest) {
if (dest < 0x10) {
return shader.outputs[dest];
} else if (dest < 0x20) {
return shader.tempRegisters[dest - 0x10];
}
Helpers::panic("[Shader JIT] Unimplemented dest: %X", dest);
}
// See shader.hpp header for docs on how the swizzle and negate works
template <int sourceIndex>
void ShaderEmitter::loadRegister(QReg dest, const PICAShader& shader, u32 src, u32 index, u32 operandDescriptor) {
u32 compSwizzle; // Component swizzle pattern for the register
bool negate; // If true, negate all lanes of the register
if constexpr (sourceIndex == 1) { // SRC1
negate = (getBit<4>(operandDescriptor)) != 0;
compSwizzle = getBits<5, 8>(operandDescriptor);
} else if constexpr (sourceIndex == 2) { // SRC2
negate = (getBit<13>(operandDescriptor)) != 0;
compSwizzle = getBits<14, 8>(operandDescriptor);
} else if constexpr (sourceIndex == 3) { // SRC3
negate = (getBit<22>(operandDescriptor)) != 0;
compSwizzle = getBits<23, 8>(operandDescriptor);
}
switch (index) {
case 0:
[[likely]] { // Keep src as is, no need to offset it
const vec4f& srcRef = getSourceRef(shader, src);
const uintptr_t offset = uintptr_t(&srcRef) - uintptr_t(&shader); // Calculate offset of register from start of the state struct
LDR(dest, statePointer, offset);
switch (compSwizzle) {
case noSwizzle: break; // .xyzw
case 0x0: DUP(dest.S4(), dest.Selem()[0]); break; // .xxxx
case 0x55: DUP(dest.S4(), dest.Selem()[1]); break; // .yyyy
case 0xAA: DUP(dest.S4(), dest.Selem()[2]); break; // .zzzz
case 0xFF: DUP(dest.S4(), dest.Selem()[3]); break; // .wwww
default: Helpers::panic("Unimplemented swizzle pattern for loading");
}
// Negate the register if necessary
if (negate) {
FNEG(dest.S4(), dest.S4());
}
return; // Return. Rest of the function handles indexing which is not used if index == 0
}
default: Helpers::panic("[ShaderJIT]: Unimplemented source index type %d", index);
}
Helpers::panic("Unimplemented indexed register load");
}
void ShaderEmitter::storeRegister(QReg source, const PICAShader& shader, u32 dest, u32 operandDescriptor) {
const vec4f& destRef = getDestRef(shader, dest);
const uintptr_t offset = uintptr_t(&destRef) - uintptr_t(&shader); // Calculate offset of register from start of the state struct
// Mask of which lanes to write
u32 writeMask = operandDescriptor & 0xf;
if (writeMask == 0xf) { // No lanes are masked, just use STR
STR(source, statePointer, offset);
} else {
LDR(scratch1, statePointer, offset); // Load current value of the destination register
Helpers::panic("Unimplemented: Storing to register with blending");
}
}
void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) {
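// The low 7 bits of the instruction select an operand descriptor, which holds the source swizzle/negate fields and the destination write mask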
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
const u32 src = getBits<12, 7>(instruction);
const u32 idx = getBits<19, 2>(instruction);
const u32 dest = getBits<21, 5>(instruction);
loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor); // Load source 1 into src1_vec
storeRegister(src1_vec, shader, dest, operandDescriptor);
}
#endif
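
A side note on the swizzle handling in loadRegister above: only the identity pattern and the four broadcast patterns are recognized so far, and every other pattern panics. Judging from those cases, the 8-bit compSwizzle field packs four 2-bit component selectors with the selector feeding .x in the top two bits, so 0x1B (00 01 10 11) is the identity .xyzw pattern. A minimal decoding sketch under that assumption; decodeSwizzle is a hypothetical helper, not part of this commit:

#include <array>
#include <cstdint>

// Expands an 8-bit PICA swizzle field into four source-lane indices (0 = x ... 3 = w),
// assuming the selector for the destination's .x lane sits in the top two bits.
inline std::array<int, 4> decodeSwizzle(uint32_t compSwizzle) {
    std::array<int, 4> lanes{};
    for (int i = 0; i < 4; i++) {
        lanes[i] = (compSwizzle >> (6 - 2 * i)) & 3;  // lanes[0] feeds .x, lanes[3] feeds .w
    }
    return lanes;
}

// Sanity check against the cases above: 0x00 -> {0,0,0,0} (.xxxx), 0x55 -> {1,1,1,1} (.yyyy),
// 0xAA -> {2,2,2,2} (.zzzz), 0xFF -> {3,3,3,3} (.wwww), 0x1B -> {0,1,2,3} (.xyzw)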


@@ -342,10 +342,10 @@ void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest
} else if (std::popcount(writeMask) == 1) { // Only 1 register needs to be written back. This can be done with a simple shift right + movss
int bit = std::countr_zero(writeMask); // Get which PICA register needs to be written to (0 = w, 1 = z, etc)
size_t index = 3 - bit;
-const uintptr_t lane_offset = offset + index * sizeof(float);
+const uintptr_t laneOffset = offset + index * sizeof(float);
if (index == 0) { // Bottom lane, no need to shift
-movss(dword[statePointer + lane_offset], source);
+movss(dword[statePointer + laneOffset], source);
} else { // Shift right by 32 * index, then write bottom lane
if (haveAVX) {
vpsrldq(scratch1, source, index * sizeof(float));
@@ -353,7 +353,7 @@ void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest
movaps(scratch1, source);
psrldq(scratch1, index * sizeof(float));
}
-movss(dword[statePointer + lane_offset], scratch1);
+movss(dword[statePointer + laneOffset], scratch1);
}
} else if (haveSSE4_1) {
// Bit reverse the write mask because that is what blendps expects
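
For context on the write mask being blended in this storeRegister path (and which the new arm64 storeRegister above still panics on), the intended semantics are a per-lane select: a destination lane is overwritten only when its mask bit is set, with bit 3 covering x and bit 0 covering w, as the index = 3 - bit computation implies. A small scalar C++ sketch of those semantics, not emitter output; vec4f and storeWithWriteMask here are illustrative stand-ins:

#include <array>
#include <cstdint>

using vec4f = std::array<float, 4>;  // lane 0 = x ... lane 3 = w (stand-in for the real vec4f type)

// Applies the 4-bit PICA write mask: each destination lane is written only when its bit is set.
// Bit 3 corresponds to x and bit 0 to w, mirroring the index = 3 - bit logic above.
inline void storeWithWriteMask(vec4f& dest, const vec4f& source, uint32_t writeMask) {
    for (int lane = 0; lane < 4; lane++) {
        if (writeMask & (8u >> lane)) {
            dest[lane] = source[lane];
        }
    }
}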