From f43c252e55ec4d5ffff8175c57b309f973ec0337 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 16 Jul 2023 01:34:59 +0300
Subject: [PATCH] Switch LOG2/EX2 from x87 to SSE (adjusted from Citra)

Co-Authored-By: merry <8682882+merryhime@users.noreply.github.com>
Co-Authored-By: Wunk <wunkolo@gmail.com>
---
 .../PICA/dynapica/shader_rec_emitter_x64.hpp  |  23 +-
 .../PICA/dynapica/shader_rec_emitter_x64.cpp  | 279 +++++++++++++++---
 2 files changed, 252 insertions(+), 50 deletions(-)
diff --git a/include/PICA/dynapica/shader_rec_emitter_x64.hpp b/include/PICA/dynapica/shader_rec_emitter_x64.hpp
index ba37595a..c0665d19 100644
--- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp
+++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp
@@ -40,6 +40,15 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 	bool haveAVX = false;     // Shows if the CPU supports AVX (NOT AVX2, NOT AVX512. Regular AVX)
 	bool haveFMA3 = false;    // Shows if the CPU supports FMA3
 
+	// Show whether the loaded shader contains any log2 or exp2 instructions, respectively
+	bool codeHasLog2 = false;
+	bool codeHasExp2 = false;
+	
+	Xbyak::Label log2Func, exp2Func;
+	Xbyak::Label emitLog2Func();
+	Xbyak::Label emitExp2Func();
+	Xbyak::util::Cpu cpuCaps;
+
 	// Compile all instructions from [current recompiler PC, end)
 	void compileUntil(const PICAShader& shaderUnit, u32 endPC);
 	// Compile instruction "instr"
@@ -49,8 +58,10 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 		const u32 opcode = instruction >> 26;
 		return (opcode == ShaderOpcodes::CALL) || (opcode == ShaderOpcodes::CALLC) || (opcode == ShaderOpcodes::CALLU);
 	}
+
 	// Scan the shader code for call instructions to fill up the returnPCs vector before starting compilation
-	void scanForCalls(const PICAShader& shaderUnit);
+	// We also scan for log2/exp2 instructions to see whether to emit the relevant functions
+	void scanCode(const PICAShader& shaderUnit);
 
 	// Load register with number "srcReg" indexed by index "idx" into the xmm register "reg"
 	template <int sourceIndex>
@@ -113,13 +124,13 @@ public:
 
 	// Initialize our emitter with "allocSize" bytes of RWX memory
 	ShaderEmitter() : Xbyak::CodeGenerator(allocSize) {
-		const auto cpu = Xbyak::util::Cpu();
+		cpuCaps = Xbyak::util::Cpu();
 
-		haveSSE4_1 = cpu.has(Xbyak::util::Cpu::tSSE41);
-		haveAVX = cpu.has(Xbyak::util::Cpu::tAVX);
-		haveFMA3 = cpu.has(Xbyak::util::Cpu::tFMA);
+		haveSSE4_1 = cpuCaps.has(Xbyak::util::Cpu::tSSE41);
+		haveAVX = cpuCaps.has(Xbyak::util::Cpu::tAVX);
+		haveFMA3 = cpuCaps.has(Xbyak::util::Cpu::tFMA);
 
-		if (!cpu.has(Xbyak::util::Cpu::tSSE3)) {
+		if (!cpuCaps.has(Xbyak::util::Cpu::tSSE3)) {
 			Helpers::panic("This CPU does not support SSE3. Please use the shader interpreter instead");
 		}
 	}
diff --git a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
index 06247950..7f4eb00c 100644
--- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
+++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
@@ -61,11 +61,14 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) {
 
 	// Tail call to shader code entrypoint
 	jmp(arg2);
-	align(16);
-	// Scan the shader code for call instructions and add them to the list of possible return PCs. We need to do this because the PICA callstack works
-	// Pretty weirdly
-	scanForCalls(shaderUnit);
 
+	// Scan the code for call, exp2, log2, etc instructions which need some special care
+	// After that, emit exp2 and log2 functions if the corresponding instructions are present
+	scanCode(shaderUnit);
+	if (codeHasExp2) exp2Func = emitExp2Func();
+	if (codeHasLog2) log2Func = emitLog2Func();
+
+	align(16);
 	// Compile every instruction in the shader
 	// This sounds horrible but the PICA instruction memory is tiny, and most of the time it's padded wtih nops that compile to nothing
 	recompilerPC = 0;
@@ -73,17 +76,23 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) {
 	compileUntil(shaderUnit, PICAShader::maxInstructionCount);
 }
 
-void ShaderEmitter::scanForCalls(const PICAShader& shaderUnit) {
+void ShaderEmitter::scanCode(const PICAShader& shaderUnit) {
 	returnPCs.clear();
 
 	for (u32 i = 0; i < PICAShader::maxInstructionCount; i++) {
 		const u32 instruction = shaderUnit.loadedShader[i];
+		const u32 opcode = instruction >> 26;
+
 		if (isCall(instruction)) {
 			const u32 num = instruction & 0xff;
 			const u32 dest = getBits<10, 12>(instruction);
 			const u32 returnPC = num + dest; // Add them to get the return PC
 
 			returnPCs.push_back(returnPC);
+		} else if (opcode == ShaderOpcodes::EX2) {
+			codeHasExp2 = true;
+		} else if (opcode == ShaderOpcodes::LG2) {
+			codeHasLog2 = true;
 		}
 	}
 
@@ -877,7 +886,6 @@ void ShaderEmitter::recLOOP(const PICAShader& shader, u32 instruction) {
 	loopLevel--;
 }
 
-// SSE does not have a log2 instruction so we temporarily emulate this using x87 FPU
 void ShaderEmitter::recLG2(const PICAShader& shader, u32 instruction) {
 	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
 	const u32 src = getBits<12, 7>(instruction);
@@ -885,30 +893,16 @@ void ShaderEmitter::recLG2(const PICAShader& shader, u32 instruction) {
 	const u32 dest = getBits<21, 5>(instruction);
 	const u32 writeMask = getBits<0, 4>(operandDescriptor);
 
-	// Load swizzled source, push 1.0 to the x87 stack
 	loadRegister<1>(src1_xmm, shader, src, idx, operandDescriptor);
-	fld1();
-
-	// Push source to the x87 stack
-	movd(eax, src1_xmm);
-	push(rax);
-	fld(dword[rsp]);
-
-	// Perform log2, load result to src1_xmm, write it back and undo the previous push rax
-	fyl2x();
-	fstp(dword[rsp]);
-	movss(src1_xmm, dword[rsp]);
-	add(rsp, 8);
-
-	// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
-	// Otherwise we do
+	call(log2Func); // Result is output in src1_xmm
+	
 	if (writeMask != 0x8) {             // Copy bottom lane to all lanes if we're not simply writing back x
 		shufps(src1_xmm, src1_xmm, 0);  // src1_xmm = src1_xmm.xxxx
 	}
+
 	storeRegister(src1_xmm, shader, dest, operandDescriptor);
 }
 
-// SSE does not have an exp2 instruction so we temporarily emulate this using x87 FPU
 void ShaderEmitter::recEX2(const PICAShader& shader, u32 instruction) {
 	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
 	const u32 src = getBits<12, 7>(instruction);
@@ -917,31 +911,12 @@ void ShaderEmitter::recEX2(const PICAShader& shader, u32 instruction) {
 	const u32 writeMask = getBits<0, 4>(operandDescriptor);
 
 	loadRegister<1>(src1_xmm, shader, src, idx, operandDescriptor);
+	call(exp2Func);  // Result is output in src1_xmm
 
-	// Push source to the x87 stack, then do some insane compiler-generated x87 math
-	movd(eax, src1_xmm);
-	push(rax);
-	fld(dword[rsp]);
-
-	fld(st0);
-	frndint();
-	fsub(st1, st0);
-	fxch(st1);
-	f2xm1();
-	fadd(dword[rip + onesVector]);
-	fscale();
-
-	// Load result to src1_xmm, write it back and undo the previous push rax
-	fstp(st1);
-	fstp(dword[rsp]);
-	movss(src1_xmm, dword[rsp]);
-	add(rsp, 8);
-
-	// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
-	// Otherwise we do
 	if (writeMask != 0x8) {             // Copy bottom lane to all lanes if we're not simply writing back x
 		shufps(src1_xmm, src1_xmm, 0);  // src1_xmm = src1_xmm.xxxx
 	}
+
 	storeRegister(src1_xmm, shader, dest, operandDescriptor);
 }
 
@@ -962,6 +937,222 @@ void ShaderEmitter::printLog(const PICAShader& shaderUnit) {
 	printf("cmp: (%d, %d)\n", shaderUnit.cmpRegister[0], shaderUnit.cmpRegister[1]);
 }
 
+// For EXP2/LOG2, we have permission from the original authors to adjust and relicense Citra's SSE implementation for this project
+// So we do it since EXP2/LOG2 are pretty terrible to implement.
+// ABI: Input is in the bottom lane of src1_xmm, same for output. If the result needs swizzling, the caller must handle it
+// Assume src1, src2, scratch1, scratch2, eax, edx are all clobbered
+
+Xbyak::Label ShaderEmitter::emitLog2Func() {
+	Xbyak::Label subroutine;
+
+	// SSE does not have a log instruction, thus we must approximate.
+	// We perform this approximation by first performing a range reduction into the range [1.0, 2.0).
+	// A minimax polynomial which was fit for the function log2(x) / (x - 1) is then evaluated.
+	// We multiply the result by (x - 1) then restore the result into the appropriate range.
+
+	// Coefficients for the minimax polynomial.
+	// f(x) computes approximately log2(x) / (x - 1).
+	// f(x) = c4 + x * (c3 + x * (c2 + x * (c1 + x * c0))).
+	align(64);
+	const void* c0 = getCurr();
+	dd(0x3d74552f);
+	const void* c1 = getCurr();
+	dd(0xbeee7397);
+	const void* c2 = getCurr();
+	dd(0x3fbd96dd);
+	const void* c3 = getCurr();
+	dd(0xc02153f6);
+	const void* c4 = getCurr();
+	dd(0x4038d96c);
+
+	align(16);
+	const void* negative_infinity_vector = getCurr();
+	dd(0xff800000);
+	dd(0xff800000);
+	dd(0xff800000);
+	dd(0xff800000);
+	const void* default_qnan_vector = getCurr();
+	dd(0x7fc00000);
+	dd(0x7fc00000);
+	dd(0x7fc00000);
+	dd(0x7fc00000);
+
+	Xbyak::Label inputIsNan, inputIsZero, inputOutOfRange;
+
+	align(16);
+	L(inputOutOfRange);
+	je(inputIsZero);
+	movaps(src1_xmm, xword[rip + default_qnan_vector]);
+	ret();
+	L(inputIsZero);
+	movaps(src1_xmm, xword[rip + negative_infinity_vector]);
+	ret();
+
+	align(16);
+	L(subroutine);
+
+	// Here we handle edge cases: input in {NaN, 0, -Inf, Negative}.
+	xorps(scratch1, scratch1);
+	ucomiss(scratch1, src1_xmm);
+	jp(inputIsNan);
+	jae(inputOutOfRange);
+
+	// Split input: SRC1=MANT[1,2) SCRATCH2=Exponent
+	if (cpuCaps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) {
+		vgetexpss(scratch2, src1_xmm, src1_xmm);
+		vgetmantss(src1_xmm, src1_xmm, src1_xmm, 0);
+	} else {
+		movd(eax, src1_xmm);
+		mov(edx, eax);
+		and_(eax, 0x7f800000);
+		and_(edx, 0x007fffff);
+		or_(edx, 0x3f800000);
+		movd(src1_xmm, edx);
+		// SRC1 now contains the mantissa of the input.
+		shr(eax, 23);
+		sub(eax, 0x7f);
+		cvtsi2ss(scratch2, eax);
+		// scratch2 now contains the exponent of the input.
+	}
+
+	movss(scratch1, xword[rip + c0]);
+
+	// Complete computation of polynomial
+	if (haveFMA3) {
+		vfmadd213ss(scratch1, src1_xmm, xword[rip + c1]);
+		vfmadd213ss(scratch1, src1_xmm, xword[rip + c2]);
+		vfmadd213ss(scratch1, src1_xmm, xword[rip + c3]);
+		vfmadd213ss(scratch1, src1_xmm, xword[rip + c4]);
+		subss(src1_xmm, dword[rip + onesVector]);
+		vfmadd231ss(scratch2, scratch1, src1_xmm);
+	} else {
+		mulss(scratch1, src1_xmm);
+		addss(scratch1, xword[rip + c1]);
+		mulss(scratch1, src1_xmm);
+		addss(scratch1, xword[rip + c2]);
+		mulss(scratch1, src1_xmm);
+		addss(scratch1, xword[rip + c3]);
+		mulss(scratch1, src1_xmm);
+		subss(src1_xmm, dword[rip + onesVector]);
+		addss(scratch1, xword[rip + c4]);
+		mulss(scratch1, src1_xmm);
+		addss(scratch2, scratch1);
+	}
+
+	xorps(src1_xmm, src1_xmm);  // break dependency chain
+	movss(src1_xmm, scratch2);
+	L(inputIsNan);
+
+	ret();
+	return subroutine;
+}
+
+Xbyak::Label ShaderEmitter::emitExp2Func() {
+	Xbyak::Label subroutine;
+
+	// SSE does not have an exp instruction, thus we must approximate.
+	// We perform this approximation by first performing a range reduction into the range [-0.5, 0.5).
+	// A minimax polynomial which was fit for the function exp2(x) is then evaluated.
+	// We then restore the result into the appropriate range.
+
+	align(64);
+	const void* input_max = getCurr();
+	dd(0x43010000);
+	const void* input_min = getCurr();
+	dd(0xc2fdffff);
+	const void* c0 = getCurr();
+	dd(0x3c5dbe69);
+	const void* half = getCurr();
+	dd(0x3f000000);
+	const void* c1 = getCurr();
+	dd(0x3d5509f9);
+	const void* c2 = getCurr();
+	dd(0x3e773cc5);
+	const void* c3 = getCurr();
+	dd(0x3f3168b3);
+	const void* c4 = getCurr();
+	dd(0x3f800016);
+
+	Xbyak::Label retLabel;
+
+	align(16);
+	L(subroutine);
+
+	// Handle edge cases
+	ucomiss(src1_xmm, src1_xmm);
+	jp(retLabel);
+
+	// Decompose input:
+	// SCRATCH=2^round(input)
+	// SRC1=input-round(input) [-0.5, 0.5)
+	if (cpuCaps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) {
+		// Cheat a bit and store ones in src2 since the register is unused
+		vmovaps(src2_xmm, xword[rip + onesVector]);
+		// input - 0.5
+		vsubss(scratch1, src1_xmm, xword[rip + half]);
+
+		// trunc(input - 0.5)
+		vrndscaless(scratch2, scratch1, scratch1, _MM_FROUND_TRUNC);
+
+		// SCRATCH = 1 * 2^(trunc(input - 0.5))
+		vscalefss(scratch1, src2_xmm, scratch2);
+
+		// SRC1 = input-trunc(input - 0.5)
+		vsubss(src1_xmm, src1_xmm, scratch2);
+	} else {
+		// Clamp to maximum range since we shift the value directly into the exponent.
+		minss(src1_xmm, xword[rip + input_max]);
+		maxss(src1_xmm, xword[rip + input_min]);
+
+		if (cpuCaps.has(Cpu::tAVX)) {
+			vsubss(scratch1, src1_xmm, xword[rip + half]);
+		} else {
+			movss(scratch1, src1_xmm);
+			subss(scratch1, xword[rip + half]);
+		}
+
+		if (cpuCaps.has(Cpu::tSSE41)) {
+			roundss(scratch1, scratch1, _MM_FROUND_TRUNC);
+			cvtss2si(eax, scratch1);
+		} else {
+			cvtss2si(eax, scratch1);
+			cvtsi2ss(scratch1, eax);
+		}
+		// SCRATCH now contains input rounded to the nearest integer.
+		add(eax, 0x7f);
+		subss(src1_xmm, scratch1);
+		// SRC1 contains input - round(input), which is in [-0.5, 0.5).
+		shl(eax, 23);
+		movd(scratch1, eax);
+		// SCRATCH contains 2^(round(input)).
+	}
+
+	// Complete computation of polynomial.
+	movss(scratch2, xword[rip + c0]);
+
+	if (haveFMA3) {
+		vfmadd213ss(scratch2, src1_xmm, xword[rip + c1]);
+		vfmadd213ss(scratch2, src1_xmm, xword[rip + c2]);
+		vfmadd213ss(scratch2, src1_xmm, xword[rip + c3]);
+		vfmadd213ss(src1_xmm, scratch2, xword[rip + c4]);
+	} else {
+		mulss(scratch2, src1_xmm);
+		addss(scratch2, xword[rip + c1]);
+		mulss(scratch2, src1_xmm);
+		addss(scratch2, xword[rip + c2]);
+		mulss(scratch2, src1_xmm);
+		addss(scratch2, xword[rip + c3]);
+		mulss(src1_xmm, scratch2);
+		addss(src1_xmm, xword[rip + c4]);
+	}
+
+	mulss(src1_xmm, scratch1);
+	L(retLabel);
+
+	ret();
+	return subroutine;
+}
+
 // As we mentioned above, this function is uber slow because we don't expect the shader JIT to call HLL functions in real scenarios
 // Aside from debugging code. So we don't care for this function to be performant or anything of the like.  It is quick and dirty
 // And mostly meant to be used for generating logs to diff the JIT and interpreter