Merge pull request #536 from wheremyfoodat/x64-non-ieee-pica-mul

x64 shader rec: Add support for PICA non-IEEE multiplication
2025-07-11 09:38:41 +12:00 · 2024-07-14 23:43:20 +00:00 · 2024-07-14 23:43:20 +00:00 · 61e2e71f68
commit 61e2e71f68
parent 8eab353491 133082c232
2 changed files with 142 additions and 23 deletions
--- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp
+++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp
@ -32,6 +32,8 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 	Label negateVector;
 	// Vector value of (1.0, 1.0, 1.0, 1.0) for SLT(i)/SGE(i)
 	Label onesVector;
 	// Vector value of (0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0) for setting the w component to 0 in DP3
 	Label dp3Vector;
 	u32 recompilerPC = 0;  // PC the recompiler is currently recompiling @
 	u32 loopLevel = 0;     // The current loop nesting level (0 = not in a loop)
@ -49,6 +51,9 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 	Xbyak::Label emitExp2Func();
 	Xbyak::util::Cpu cpuCaps;
 	// Emit a PICA200-compliant multiplication that handles "0 * inf = 0"
 	void emitSafeMUL(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch);
 	// Compile all instructions from [current recompiler PC, end)
 	void compileUntil(const PICAShader& shaderUnit, u32 endPC);
 	// Compile instruction "instr"
--- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
+++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
@ -12,6 +12,9 @@ using namespace Xbyak;
 using namespace Xbyak::util;
 using namespace Helpers;
 // TODO: Expose safe/unsafe optimizations to the user
 constexpr bool useSafeMUL = false;
 // The shader recompiler uses quite an odd internal ABI
 // We make use of the fact that in regular conditions, we should pretty much never be calling C++ code from recompiled shader code
 // This allows us to establish an ABI that's optimized for this sort of workflow, statically allocating volatile host registers
@ -45,6 +48,16 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) {
 	L(onesVector);
 	dd(0x3f800000); dd(0x3f800000); dd(0x3f800000); dd(0x3f800000); // 1.0 4 times
 	if (useSafeMUL) {
 		// When doing safe mul, we need a vector to set only the w component to 0 for DP3
 		L(dp3Vector);
 		dd(0xFFFFFFFF);
 		dd(0xFFFFFFFF);
 		dd(0xFFFFFFFF);
 		dd(0);
 	}
 	// Emit prologue first
 	align(16);
 	prologueCb = getCurr<PrologueCallback>();
@ -523,24 +536,60 @@ void ShaderEmitter::recDP3(const PICAShader& shader, u32 instruction) {
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);
 	// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
 	loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
 	loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
-	dpps(src1_xmm, src2_xmm, 0b01111111); // 3-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA 
+
 	if (!useSafeMUL) {
 		dpps(src1_xmm, src2_xmm, 0b01111111);
 	} else {
 		const u32 writeMask = operandDescriptor & 0xf;
 		// Set w component to 0 and do a DP4
 		andps(src1_xmm, xword[rip + dp3Vector]);
 		// Set src1 to src1 * src2, then get the dot product by doing 2 horizontal adds
 		emitSafeMUL(src1_xmm, src2_xmm, scratch1);
 		haddps(src1_xmm, src1_xmm);
 		haddps(src1_xmm, src1_xmm);
 		// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
 		// Otherwise we do
 		if (writeMask != 0x8) {             // Copy bottom lane to all lanes if we're not simply writing back x
 			shufps(src1_xmm, src1_xmm, 0);  // src1_xmm = src1_xmm.xxxx
 		}
 	}
 	storeRegister(src1_xmm, shader, dest, operandDescriptor);
 }
 // Recompile a DP4 instruction: loads both source operands, computes their 4-component dot product
 // and writes the result to the destination register through storeRegister.
 // When useSafeMUL is set, the products are computed with emitSafeMUL instead of dpps.
 void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
 	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
 	const u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);
 	// Multiplication is not IEEE compliant on the PICA; the useSafeMUL path below goes through emitSafeMUL to account for that
 	loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
 	loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
-	dpps(src1_xmm, src2_xmm, 0b11111111); // 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA 
+
 	if (!useSafeMUL) {
 		// 4-lane dot product between the 2 registers, store the result in all lanes of src1_xmm similarly to PICA
 		dpps(src1_xmm, src2_xmm, 0b11111111);
 	} else {
 		// Low 4 bits of the operand descriptor, used below to decide whether the result must be broadcast
 		const u32 writeMask = operandDescriptor & 0xf;
 		// Set src1 to src1 * src2, then get the dot product by doing 2 horizontal adds
 		emitSafeMUL(src1_xmm, src2_xmm, scratch1);
 		haddps(src1_xmm, src1_xmm);
 		haddps(src1_xmm, src1_xmm);
 		// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
 		// Otherwise we do
 		if (writeMask != 0x8) {             // Copy bottom lane to all lanes if we're not simply writing back x
 			shufps(src1_xmm, src1_xmm, 0);  // src1_xmm = src1_xmm.xxxx
 		}
 	}
 	storeRegister(src1_xmm, shader, dest, operandDescriptor);
 }
@ -553,7 +602,6 @@ void ShaderEmitter::recDPH(const PICAShader& shader, u32 instruction) {
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);
 	// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
 	loadRegister<1>(src1_xmm, shader, src1, isDPHI ? 0 : idx, operandDescriptor);
 	loadRegister<2>(src2_xmm, shader, src2, isDPHI ? idx : 0, operandDescriptor);
@ -566,7 +614,25 @@ void ShaderEmitter::recDPH(const PICAShader& shader, u32 instruction) {
 		unpcklpd(src1_xmm, scratch1);
 	}
-	dpps(src1_xmm, src2_xmm, 0b11111111);  // 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA
+    // Now perform a DP4
 	if (!useSafeMUL) {
 		// 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA
 		dpps(src1_xmm, src2_xmm, 0b11111111);
 	} else {
 		const u32 writeMask = operandDescriptor & 0xf;
 		// Set src1 to src1 * src2, then get the dot product by doing 2 horizontal adds
 		emitSafeMUL(src1_xmm, src2_xmm, scratch1);
 		haddps(src1_xmm, src1_xmm);
 		haddps(src1_xmm, src1_xmm);
 		// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
 		// Otherwise we do
 		if (writeMask != 0x8) {             // Copy bottom lane to all lanes if we're not simply writing back x
 			shufps(src1_xmm, src1_xmm, 0);  // src1_xmm = src1_xmm.xxxx
 		}
 	}
 	storeRegister(src1_xmm, shader, dest, operandDescriptor);
 }
@ -603,10 +669,15 @@ void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) {
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);
 	// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
 	loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
 	loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
-	mulps(src1_xmm, src2_xmm);
+
 	if (!useSafeMUL) {
 		mulps(src1_xmm, src2_xmm);
 	} else {
 		emitSafeMUL(src1_xmm, src2_xmm, scratch1);
 	}
 	storeRegister(src1_xmm, shader, dest, operandDescriptor);
 }
@ -662,23 +733,31 @@ void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) {
 	loadRegister<2>(src2_xmm, shader, src2, isMADI ? 0 : idx, operandDescriptor);
 	loadRegister<3>(src3_xmm, shader, src3, isMADI ? idx : 0, operandDescriptor);
 	// TODO: Implement safe PICA mul
 	// If we have FMA3, optimize MAD to use FMA
-	if (haveFMA3) {
+	if (!useSafeMUL) {
-		vfmadd213ps(src1_xmm, src2_xmm, src3_xmm);
+		if (haveFMA3) {
-		storeRegister(src1_xmm, shader, dest, operandDescriptor);
+			vfmadd213ps(src1_xmm, src2_xmm, src3_xmm);
-	}
+			storeRegister(src1_xmm, shader, dest, operandDescriptor);
 	// If we don't have FMA3, do a multiplication and addition
 	else {
 		// Multiply src1 * src2
 		if (haveAVX) {
 			vmulps(scratch1, src1_xmm, src2_xmm);
 		} else {
 			movaps(scratch1, src1_xmm);
 			mulps(scratch1, src2_xmm);
 		}
 		// If we don't have FMA3, do a multiplication and addition
 		else {
 			// Multiply src1 * src2
 			if (haveAVX) {
 				vmulps(scratch1, src1_xmm, src2_xmm);
 			} else {
 				movaps(scratch1, src1_xmm);
 				mulps(scratch1, src2_xmm);
 			}
 			// Add src3
 			addps(scratch1, src3_xmm);
 			storeRegister(scratch1, shader, dest, operandDescriptor);
 		}
 	} else {
 		movaps(scratch1, src1_xmm);
 		emitSafeMUL(scratch1, src2_xmm, src1_xmm);
 		// Add src3
 		addps(scratch1, src3_xmm);
 		storeRegister(scratch1, shader, dest, operandDescriptor);
@ -1115,6 +1194,41 @@ Xbyak::Label ShaderEmitter::emitLog2Func() {
 	return subroutine;
 }
 // Emit a PICA200-style multiplication: src1 = src1 * src2, where lanes that would become NaN
 // purely because of the multiplication (0 * inf or inf * 0) are forced to 0 instead.
 // Lanes where either input was already NaN keep their NaN result.
 //
 // src1:    first multiplicand, receives the result
 // src2:    second multiplicand; overwritten with a temporary mask on the non-AVX-512 path
 // scratch: temporary register, always clobbered
 // The AVX-512 path additionally clobbers opmask register k1.
 void ShaderEmitter::emitSafeMUL(Xmm src1, Xmm src2, Xmm scratch) {
 	// 0 * inf and inf * 0 in the PICA should return 0 instead of NaN
 	// This can be done by checking for NaNs before and after a multiplication
 	// To do this we can create a mask of which components of src1/src2 are NOT NaN using cmpordps (cmpps with imm = 7)
 	// Then we multiply src1 and src2 and create a mask of which components of the result ARE NaN using cmpunordps
 	// If the NaNs didn't exist before (ie they were created by 0 * inf) then we set them to 0 by XORing the 2 masks and ANDing the
 	// multiplication result with the xor result
 	// Based on Citra implementation, particularly the AVX-512 version

 	// Query each feature on its own so the check does not depend on how Cpu::has treats OR-ed feature flags
 	const bool haveAVX512 = cpuCaps.has(Cpu::tAVX512F) && cpuCaps.has(Cpu::tAVX512VL);

 	if (haveAVX512) {
 		const Xbyak::Opmask zeroMask = k1;

 		vmulps(scratch, src1, src2);
 		// Mask of any NaN values found in the result
 		vcmpunordps(zeroMask, scratch, scratch);
 		// Mask of any non-NaN inputs producing NaN results
 		vcmpordps(zeroMask | zeroMask, src1, src2);

 		// Invert the mask so that set bits mark the lanes to keep.
 		// knotw is used rather than knotb because knotb is an AVX512DQ instruction, which is not part of the feature check above.
 		// The extra inverted upper bits are harmless: the masked xmm move below only looks at the low 4 mask bits.
 		knotw(zeroMask, zeroMask);
 		// Zero-masking move: lanes whose mask bit is clear become 0
 		vmovaps(src1 | zeroMask | T_z, scratch);
 	} else {
 		// scratch = mask of lanes where both inputs are non-NaN
 		if (haveAVX) {
 			vcmpordps(scratch, src1, src2);
 		} else {
 			movaps(scratch, src1);
 			cmpordps(scratch, src2);
 		}

 		mulps(src1, src2);
 		// src2 = mask of lanes where src2 or the product is NaN
 		cmpunordps(src2, src1);
 		// After the XOR, a lane is all-zeroes only if its inputs were non-NaN but its product is NaN
 		xorps(src2, scratch);
 		andps(src1, src2);
 	}
 }
 Xbyak::Label ShaderEmitter::emitExp2Func() {
 	Xbyak::Label subroutine;