Merge pull request #536 from wheremyfoodat/x64-non-ieee-pica-mul

x64 shader rec: Add support for PICA non-IEEE multiplication
2025-07-16 12:17:08 +12:00 · 2024-07-14 23:43:20 +00:00 · 2024-07-14 23:43:20 +00:00 · 61e2e71f68
commit 61e2e71f68
parent 8eab353491 133082c232
2 changed files with 142 additions and 23 deletions
--- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp
+++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp
@ -32,6 +32,8 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 	Label negateVector;
 	// Vector value of (1.0, 1.0, 1.0, 1.0) for SLT(i)/SGE(i)
 	Label onesVector;
+	// Vector value of (0xFF, 0xFF, 0xFF, 0) for setting the w component to 0 in DP3
+	Label dp3Vector;

 	u32 recompilerPC = 0;  // PC the recompiler is currently recompiling @
 	u32 loopLevel = 0;     // The current loop nesting level (0 = not in a loop)
@ -49,6 +51,9 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 	Xbyak::Label emitExp2Func();
 	Xbyak::util::Cpu cpuCaps;

+	// Emit a PICA200-compliant multiplication that handles "0 * inf = 0"
+	void emitSafeMUL(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch);
+
 	// Compile all instructions from [current recompiler PC, end)
 	void compileUntil(const PICAShader& shaderUnit, u32 endPC);
 	// Compile instruction "instr"
--- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
+++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
@ -12,6 +12,9 @@ using namespace Xbyak;
 using namespace Xbyak::util;
 using namespace Helpers;

+// TODO: Expose safe/unsafe optimizations to the user
+constexpr bool useSafeMUL = false;
+
 // The shader recompiler uses quite an odd internal ABI
 // We make use of the fact that in regular conditions, we should pretty much never be calling C++ code from recompiled shader code
 // This allows us to establish an ABI that's optimized for this sort of workflow, statically allocating volatile host registers
@ -45,6 +48,16 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) {
 	L(onesVector);
 	dd(0x3f800000); dd(0x3f800000); dd(0x3f800000); dd(0x3f800000); // 1.0 4 times

+	if (useSafeMUL) {
+		// When doing safe mul, we need a vector to set only the w component to 0 for DP3
+		L(dp3Vector);
+
+		dd(0xFFFFFFFF);
+		dd(0xFFFFFFFF);
+		dd(0xFFFFFFFF);
+		dd(0);
+	}
+
 	// Emit prologue first
 	align(16);
 	prologueCb = getCurr<PrologueCallback>();
@ -523,24 +536,60 @@ void ShaderEmitter::recDP3(const PICAShader& shader, u32 instruction) {
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);

-	// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
 	loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
 	loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
-	dpps(src1_xmm, src2_xmm, 0b01111111); // 3-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA 
+
+	if (!useSafeMUL) {
+		dpps(src1_xmm, src2_xmm, 0b01111111);
+	} else {
+		const u32 writeMask = operandDescriptor & 0xf;
+
+		// Set w component to 0 and do a DP4
+		andps(src1_xmm, xword[rip + dp3Vector]);
+
+		// Set src1 to src1 * src2, then get the dot product by doing 2 horizontal adds
+		emitSafeMUL(src1_xmm, src2_xmm, scratch1);
+		haddps(src1_xmm, src1_xmm);
+		haddps(src1_xmm, src1_xmm);
+
+		// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
+		// Otherwise we do
+		if (writeMask != 0x8) {             // Copy bottom lane to all lanes if we're not simply writing back x
+			shufps(src1_xmm, src1_xmm, 0);  // src1_xmm = src1_xmm.xxxx
+		}
+	}
+
 	storeRegister(src1_xmm, shader, dest, operandDescriptor);
 }

 void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
 	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
 	const u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);

-	// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
 	loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
 	loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
-	dpps(src1_xmm, src2_xmm, 0b11111111); // 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA 
+
+	if (!useSafeMUL) {
+		// 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA
+		dpps(src1_xmm, src2_xmm, 0b11111111);
+	} else {
+		const u32 writeMask = operandDescriptor & 0xf;
+
+		// Set src1 to src1 * src2, then get the dot product by doing 2 horizontal adds
+		emitSafeMUL(src1_xmm, src2_xmm, scratch1);
+		haddps(src1_xmm, src1_xmm);
+		haddps(src1_xmm, src1_xmm);
+
+		// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
+		// Otherwise we do
+		if (writeMask != 0x8) {             // Copy bottom lane to all lanes if we're not simply writing back x
+			shufps(src1_xmm, src1_xmm, 0);  // src1_xmm = src1_xmm.xxxx
+		}
+	}
+
 	storeRegister(src1_xmm, shader, dest, operandDescriptor);
 }

@ -553,7 +602,6 @@ void ShaderEmitter::recDPH(const PICAShader& shader, u32 instruction) {
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);

-	// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
 	loadRegister<1>(src1_xmm, shader, src1, isDPHI ? 0 : idx, operandDescriptor);
 	loadRegister<2>(src2_xmm, shader, src2, isDPHI ? idx : 0, operandDescriptor);

@ -566,7 +614,25 @@ void ShaderEmitter::recDPH(const PICAShader& shader, u32 instruction) {
 		unpcklpd(src1_xmm, scratch1);
 	}

-	dpps(src1_xmm, src2_xmm, 0b11111111);  // 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA
+    // Now perform a DP4
+	if (!useSafeMUL) {
+		// 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA
+		dpps(src1_xmm, src2_xmm, 0b11111111);
+	} else {
+		const u32 writeMask = operandDescriptor & 0xf;
+
+		// Set src1 to src1 * src2, then get the dot product by doing 2 horizontal adds
+		emitSafeMUL(src1_xmm, src2_xmm, scratch1);
+		haddps(src1_xmm, src1_xmm);
+		haddps(src1_xmm, src1_xmm);
+
+		// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
+		// Otherwise we do
+		if (writeMask != 0x8) {             // Copy bottom lane to all lanes if we're not simply writing back x
+			shufps(src1_xmm, src1_xmm, 0);  // src1_xmm = src1_xmm.xxxx
+		}
+	}
+
 	storeRegister(src1_xmm, shader, dest, operandDescriptor);
 }

@ -603,10 +669,15 @@ void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) {
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);

-	// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
 	loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
 	loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
-	mulps(src1_xmm, src2_xmm);
+
+	if (!useSafeMUL) {
+		mulps(src1_xmm, src2_xmm);
+	} else {
+		emitSafeMUL(src1_xmm, src2_xmm, scratch1);
+	}
+
 	storeRegister(src1_xmm, shader, dest, operandDescriptor);
 }

@ -662,23 +733,31 @@ void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) {
 	loadRegister<2>(src2_xmm, shader, src2, isMADI ? 0 : idx, operandDescriptor);
 	loadRegister<3>(src3_xmm, shader, src3, isMADI ? idx : 0, operandDescriptor);

-	// TODO: Implement safe PICA mul
 	// If we have FMA3, optimize MAD to use FMA
-	if (haveFMA3) {
-		vfmadd213ps(src1_xmm, src2_xmm, src3_xmm);
-		storeRegister(src1_xmm, shader, dest, operandDescriptor);
-	}
-	
-	// If we don't have FMA3, do a multiplication and addition
-	else {
-		// Multiply src1 * src2
-		if (haveAVX) {
-			vmulps(scratch1, src1_xmm, src2_xmm);
-		} else {
-			movaps(scratch1, src1_xmm);
-			mulps(scratch1, src2_xmm);
+	if (!useSafeMUL) {
+		if (haveFMA3) {
+			vfmadd213ps(src1_xmm, src2_xmm, src3_xmm);
+			storeRegister(src1_xmm, shader, dest, operandDescriptor);
 		}

+		// If we don't have FMA3, do a multiplication and addition
+		else {
+			// Multiply src1 * src2
+			if (haveAVX) {
+				vmulps(scratch1, src1_xmm, src2_xmm);
+			} else {
+				movaps(scratch1, src1_xmm);
+				mulps(scratch1, src2_xmm);
+			}
+
+			// Add src3
+			addps(scratch1, src3_xmm);
+			storeRegister(scratch1, shader, dest, operandDescriptor);
+		}
+	} else {
+		movaps(scratch1, src1_xmm);
+		emitSafeMUL(scratch1, src2_xmm, src1_xmm);
+
 		// Add src3
 		addps(scratch1, src3_xmm);
 		storeRegister(scratch1, shader, dest, operandDescriptor);
@ -1115,6 +1194,41 @@ Xbyak::Label ShaderEmitter::emitLog2Func() {
 	return subroutine;
 }

+void ShaderEmitter::emitSafeMUL(Xmm src1, Xmm src2, Xmm scratch) {
+	// 0 * inf and inf * 0 in the PICA should return 0 instead of NaN
+	// This can be done by checking for NaNs before and after a multiplication
+	// To do this we can create a mask of which components of src1/src2 are NOT NaN using cmpordsps (cmpps with imm = 7)
+	// Then we multiply src1 and src2 and reate a mask of which components of the result ARE NaN using cmpunordps
+	// If the NaNs didn't exist (ie they were created by 0 * inf) before then we set them to 0 by XORing the 2 masks and ANDing the multiplication
+	// result with the xor result
+	// Based on Citra implementation, particularly the AVX-512 version
+
+	if (cpuCaps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) {
+		const Xbyak::Opmask zeroMask = k1;
+
+		vmulps(scratch, src1, src2);
+		// Mask of any NaN values found in the result
+		vcmpunordps(zeroMask, scratch, scratch);
+		// Mask of any non-NaN inputs producing NaN results
+		vcmpordps(zeroMask | zeroMask, src1, src2);
+
+		knotb(zeroMask, zeroMask);
+		vmovaps(src1 | zeroMask | T_z, scratch);
+	} else {
+		if (haveAVX) {
+			vcmpordps(scratch, src1, src2);
+		} else {
+			movaps(scratch, src1);
+			cmpordps(scratch, src2);
+		}
+
+		mulps(src1, src2);
+		cmpunordps(src2, src1);
+		xorps(src2, scratch);
+		andps(src1, src2);
+	}
+}
+
 Xbyak::Label ShaderEmitter::emitExp2Func() {
 	Xbyak::Label subroutine;