Merge pull request #536 from wheremyfoodat/x64-non-ieee-pica-mul

x64 shader rec: Add support for PICA non-IEEE multiplication
This commit is contained in:
wheremyfoodat 2024-07-14 23:43:20 +00:00 committed by GitHub
commit 61e2e71f68
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 142 additions and 23 deletions

View file

@ -32,6 +32,8 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
Label negateVector;
// Vector value of (1.0, 1.0, 1.0, 1.0) for SLT(i)/SGE(i)
Label onesVector;
// Vector value of (0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0) for setting the w component to 0 in DP3
Label dp3Vector;
u32 recompilerPC = 0; // PC the recompiler is currently recompiling @
u32 loopLevel = 0; // The current loop nesting level (0 = not in a loop)
@ -49,6 +51,9 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
Xbyak::Label emitExp2Func();
Xbyak::util::Cpu cpuCaps;
// Emit a PICA200-compliant multiplication that handles "0 * inf = 0"
void emitSafeMUL(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch);
// Compile all instructions from [current recompiler PC, end)
void compileUntil(const PICAShader& shaderUnit, u32 endPC);
// Compile instruction "instr"

View file

@ -12,6 +12,9 @@ using namespace Xbyak;
using namespace Xbyak::util;
using namespace Helpers;
// TODO: Expose safe/unsafe optimizations to the user
constexpr bool useSafeMUL = false;
// The shader recompiler uses quite an odd internal ABI
// We make use of the fact that in regular conditions, we should pretty much never be calling C++ code from recompiled shader code
// This allows us to establish an ABI that's optimized for this sort of workflow, statically allocating volatile host registers
@ -45,6 +48,16 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) {
L(onesVector);
dd(0x3f800000); dd(0x3f800000); dd(0x3f800000); dd(0x3f800000); // 1.0 4 times
if (useSafeMUL) {
// When doing safe mul, we need a vector to set only the w component to 0 for DP3
L(dp3Vector);
dd(0xFFFFFFFF);
dd(0xFFFFFFFF);
dd(0xFFFFFFFF);
dd(0);
}
// Emit prologue first
align(16);
prologueCb = getCurr<PrologueCallback>();
@ -523,24 +536,60 @@ void ShaderEmitter::recDP3(const PICAShader& shader, u32 instruction) {
const u32 idx = getBits<19, 2>(instruction);
const u32 dest = getBits<21, 5>(instruction);
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
dpps(src1_xmm, src2_xmm, 0b01111111); // 3-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA
if (!useSafeMUL) {
dpps(src1_xmm, src2_xmm, 0b01111111);
} else {
const u32 writeMask = operandDescriptor & 0xf;
// Set w component to 0 and do a DP4
andps(src1_xmm, xword[rip + dp3Vector]);
// Set src1 to src1 * src2, then get the dot product by doing 2 horizontal adds
emitSafeMUL(src1_xmm, src2_xmm, scratch1);
haddps(src1_xmm, src1_xmm);
haddps(src1_xmm, src1_xmm);
// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
// Otherwise we do
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
shufps(src1_xmm, src1_xmm, 0); // src1_xmm = src1_xmm.xxxx
}
}
storeRegister(src1_xmm, shader, dest, operandDescriptor);
}
// Recompiles a PICA DP4 instruction: computes the 4-component dot product of src1 and src2
// and stores it to the destination register through storeRegister (which receives the operand descriptor).
// shader: Shader unit whose operand descriptor table is consulted
// instruction: Raw 32-bit instruction word to decode
void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
	const u32 src1 = getBits<12, 7>(instruction);
	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
	const u32 idx = getBits<19, 2>(instruction);
	const u32 dest = getBits<21, 5>(instruction);

	loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
	loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);

	if (!useSafeMUL) {
		// Fast path: IEEE 4-lane dot product between the 2 registers
		// The result is broadcast to all lanes of src1_xmm, similarly to PICA
		dpps(src1_xmm, src2_xmm, 0b11111111);
	} else {
		const u32 writeMask = operandDescriptor & 0xf;

		// Set src1 to src1 * src2 using the PICA "0 * inf = 0" multiplication,
		// then get the dot product by doing 2 horizontal adds
		emitSafeMUL(src1_xmm, src2_xmm, scratch1);
		haddps(src1_xmm, src1_xmm);
		haddps(src1_xmm, src1_xmm);

		// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
		// Otherwise we do
		// NOTE(review): after two haddps of a register with itself, every lane should already hold the full sum,
		// so this broadcast looks redundant. Kept to match the DP3/DPH safe paths.
		if (writeMask != 0x8) {
			shufps(src1_xmm, src1_xmm, 0);  // src1_xmm = src1_xmm.xxxx
		}
	}

	storeRegister(src1_xmm, shader, dest, operandDescriptor);
}
@ -553,7 +602,6 @@ void ShaderEmitter::recDPH(const PICAShader& shader, u32 instruction) {
const u32 idx = getBits<19, 2>(instruction);
const u32 dest = getBits<21, 5>(instruction);
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
loadRegister<1>(src1_xmm, shader, src1, isDPHI ? 0 : idx, operandDescriptor);
loadRegister<2>(src2_xmm, shader, src2, isDPHI ? idx : 0, operandDescriptor);
@ -566,7 +614,25 @@ void ShaderEmitter::recDPH(const PICAShader& shader, u32 instruction) {
unpcklpd(src1_xmm, scratch1);
}
dpps(src1_xmm, src2_xmm, 0b11111111); // 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA
// Now perform a DP4
if (!useSafeMUL) {
// 4-lane dot product between the 2 registers, store the result in all lanes of src1_xmm similarly to PICA
dpps(src1_xmm, src2_xmm, 0b11111111);
} else {
const u32 writeMask = operandDescriptor & 0xf;
// Set src1 to src1 * src2, then get the dot product by doing 2 horizontal adds
emitSafeMUL(src1_xmm, src2_xmm, scratch1);
haddps(src1_xmm, src1_xmm);
haddps(src1_xmm, src1_xmm);
// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
// Otherwise we do
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
shufps(src1_xmm, src1_xmm, 0); // src1_xmm = src1_xmm.xxxx
}
}
storeRegister(src1_xmm, shader, dest, operandDescriptor);
}
@ -603,10 +669,15 @@ void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) {
const u32 idx = getBits<19, 2>(instruction);
const u32 dest = getBits<21, 5>(instruction);
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
mulps(src1_xmm, src2_xmm);
if (!useSafeMUL) {
mulps(src1_xmm, src2_xmm);
} else {
emitSafeMUL(src1_xmm, src2_xmm, scratch1);
}
storeRegister(src1_xmm, shader, dest, operandDescriptor);
}
@ -662,23 +733,31 @@ void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) {
loadRegister<2>(src2_xmm, shader, src2, isMADI ? 0 : idx, operandDescriptor);
loadRegister<3>(src3_xmm, shader, src3, isMADI ? idx : 0, operandDescriptor);
// TODO: Implement safe PICA mul
// If we have FMA3, optimize MAD to use FMA
if (haveFMA3) {
vfmadd213ps(src1_xmm, src2_xmm, src3_xmm);
storeRegister(src1_xmm, shader, dest, operandDescriptor);
}
// If we don't have FMA3, do a multiplication and addition
else {
// Multiply src1 * src2
if (haveAVX) {
vmulps(scratch1, src1_xmm, src2_xmm);
} else {
movaps(scratch1, src1_xmm);
mulps(scratch1, src2_xmm);
if (!useSafeMUL) {
if (haveFMA3) {
vfmadd213ps(src1_xmm, src2_xmm, src3_xmm);
storeRegister(src1_xmm, shader, dest, operandDescriptor);
}
// If we don't have FMA3, do a multiplication and addition
else {
// Multiply src1 * src2
if (haveAVX) {
vmulps(scratch1, src1_xmm, src2_xmm);
} else {
movaps(scratch1, src1_xmm);
mulps(scratch1, src2_xmm);
}
// Add src3
addps(scratch1, src3_xmm);
storeRegister(scratch1, shader, dest, operandDescriptor);
}
} else {
movaps(scratch1, src1_xmm);
emitSafeMUL(scratch1, src2_xmm, src1_xmm);
// Add src3
addps(scratch1, src3_xmm);
storeRegister(scratch1, shader, dest, operandDescriptor);
@ -1115,6 +1194,41 @@ Xbyak::Label ShaderEmitter::emitLog2Func() {
return subroutine;
}
// Emits a PICA200-compliant multiplication: src1 = src1 * src2, except that lanes where the product is NaN
// even though neither input was NaN (ie 0 * inf or inf * 0) are set to 0 instead.
// src1: First operand, receives the result
// src2: Second operand. Clobbered on the non-AVX-512 path (it is reused to hold a mask)
// scratch: Temporary register, always clobbered. Must be a different register from src1 and src2
// The AVX-512 path additionally clobbers opmask register k1
void ShaderEmitter::emitSafeMUL(Xmm src1, Xmm src2, Xmm scratch) {
	// 0 * inf and inf * 0 in the PICA should return 0 instead of NaN
	// This can be done by checking for NaNs before and after a multiplication
	// To do this we can create a mask of which components of src1/src2 are NOT NaN using cmpordps (cmpps with imm = 7)
	// Then we multiply src1 and src2 and create a mask of which components of the result ARE NaN using cmpunordps
	// If the NaNs didn't exist (ie they were created by 0 * inf) before then we set them to 0 by XORing the 2 masks and ANDing the multiplication
	// result with the xor result
	// Based on Citra implementation, particularly the AVX-512 version
	if (cpuCaps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) {
		const Xbyak::Opmask zeroMask = k1;

		vmulps(scratch, src1, src2);
		// Mask of any NaN values found in the result
		vcmpunordps(zeroMask, scratch, scratch);
		// Mask of any non-NaN inputs producing NaN results
		vcmpordps(zeroMask | zeroMask, src1, src2);

		// Invert the mask so that it selects the lanes to keep
		// knotw is used instead of knotb because knotb is an AVX512DQ instruction, which the check above does not cover,
		// while knotw only needs AVX512F. Only mask bits 0-3 are consumed by the 128-bit masked move below,
		// so inverting the wider mask gives the same result
		knotw(zeroMask, zeroMask);
		// Copy the product into src1, zeroing the lanes that are not selected by the mask
		vmovaps(src1 | zeroMask | T_z, scratch);
	} else {
		// scratch = all-ones in the lanes where both inputs are non-NaN
		if (haveAVX) {
			vcmpordps(scratch, src1, src2);
		} else {
			movaps(scratch, src1);
			cmpordps(scratch, src2);
		}

		mulps(src1, src2);
		// src2 = all-ones in the lanes where the product is NaN
		// (if src2 itself was NaN in a lane, the product is NaN in that lane as well)
		cmpunordps(src2, src1);
		// The XOR is 0 only in the lanes where a NaN was produced from non-NaN inputs, so the AND zeroes exactly those lanes
		xorps(src2, scratch);
		andps(src1, src2);
	}
}
Xbyak::Label ShaderEmitter::emitExp2Func() {
Xbyak::Label subroutine;