mirror of
https://github.com/wheremyfoodat/Panda3DS.git
synced 2025-04-07 22:55:40 +12:00
Merge pull request #536 from wheremyfoodat/x64-non-ieee-pica-mul
x64 shader rec: Add support for PICA non-IEEE multiplication
This commit is contained in:
commit
61e2e71f68
2 changed files with 142 additions and 23 deletions
|
@ -32,6 +32,8 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
|
||||||
Label negateVector;
|
Label negateVector;
|
||||||
// Vector value of (1.0, 1.0, 1.0, 1.0) for SLT(i)/SGE(i)
|
// Vector value of (1.0, 1.0, 1.0, 1.0) for SLT(i)/SGE(i)
|
||||||
Label onesVector;
|
Label onesVector;
|
||||||
|
// Vector mask of (0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0) for setting the w component to 0 in DP3
|
||||||
|
Label dp3Vector;
|
||||||
|
|
||||||
u32 recompilerPC = 0; // PC the recompiler is currently recompiling @
|
u32 recompilerPC = 0; // PC the recompiler is currently recompiling @
|
||||||
u32 loopLevel = 0; // The current loop nesting level (0 = not in a loop)
|
u32 loopLevel = 0; // The current loop nesting level (0 = not in a loop)
|
||||||
|
@ -49,6 +51,9 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
|
||||||
Xbyak::Label emitExp2Func();
|
Xbyak::Label emitExp2Func();
|
||||||
Xbyak::util::Cpu cpuCaps;
|
Xbyak::util::Cpu cpuCaps;
|
||||||
|
|
||||||
|
// Emit a PICA200-compliant multiplication that handles "0 * inf = 0"
|
||||||
|
void emitSafeMUL(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch);
|
||||||
|
|
||||||
// Compile all instructions from [current recompiler PC, end)
|
// Compile all instructions from [current recompiler PC, end)
|
||||||
void compileUntil(const PICAShader& shaderUnit, u32 endPC);
|
void compileUntil(const PICAShader& shaderUnit, u32 endPC);
|
||||||
// Compile instruction "instr"
|
// Compile instruction "instr"
|
||||||
|
|
|
@ -12,6 +12,9 @@ using namespace Xbyak;
|
||||||
using namespace Xbyak::util;
|
using namespace Xbyak::util;
|
||||||
using namespace Helpers;
|
using namespace Helpers;
|
||||||
|
|
||||||
|
// TODO: Expose safe/unsafe optimizations to the user
|
||||||
|
constexpr bool useSafeMUL = false;
|
||||||
|
|
||||||
// The shader recompiler uses quite an odd internal ABI
|
// The shader recompiler uses quite an odd internal ABI
|
||||||
// We make use of the fact that in regular conditions, we should pretty much never be calling C++ code from recompiled shader code
|
// We make use of the fact that in regular conditions, we should pretty much never be calling C++ code from recompiled shader code
|
||||||
// This allows us to establish an ABI that's optimized for this sort of workflow, statically allocating volatile host registers
|
// This allows us to establish an ABI that's optimized for this sort of workflow, statically allocating volatile host registers
|
||||||
|
@ -45,6 +48,16 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) {
|
||||||
L(onesVector);
|
L(onesVector);
|
||||||
dd(0x3f800000); dd(0x3f800000); dd(0x3f800000); dd(0x3f800000); // 1.0 4 times
|
dd(0x3f800000); dd(0x3f800000); dd(0x3f800000); dd(0x3f800000); // 1.0 4 times
|
||||||
|
|
||||||
|
if (useSafeMUL) {
|
||||||
|
// When doing safe mul, we need a vector to set only the w component to 0 for DP3
|
||||||
|
L(dp3Vector);
|
||||||
|
|
||||||
|
dd(0xFFFFFFFF);
|
||||||
|
dd(0xFFFFFFFF);
|
||||||
|
dd(0xFFFFFFFF);
|
||||||
|
dd(0);
|
||||||
|
}
|
||||||
|
|
||||||
// Emit prologue first
|
// Emit prologue first
|
||||||
align(16);
|
align(16);
|
||||||
prologueCb = getCurr<PrologueCallback>();
|
prologueCb = getCurr<PrologueCallback>();
|
||||||
|
@ -523,24 +536,60 @@ void ShaderEmitter::recDP3(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 idx = getBits<19, 2>(instruction);
|
const u32 idx = getBits<19, 2>(instruction);
|
||||||
const u32 dest = getBits<21, 5>(instruction);
|
const u32 dest = getBits<21, 5>(instruction);
|
||||||
|
|
||||||
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
|
|
||||||
loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
|
loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
|
||||||
loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
|
loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
|
||||||
dpps(src1_xmm, src2_xmm, 0b01111111); // 3-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA
|
|
||||||
|
if (!useSafeMUL) {
|
||||||
|
dpps(src1_xmm, src2_xmm, 0b01111111);
|
||||||
|
} else {
|
||||||
|
const u32 writeMask = operandDescriptor & 0xf;
|
||||||
|
|
||||||
|
// Set w component to 0 and do a DP4
|
||||||
|
andps(src1_xmm, xword[rip + dp3Vector]);
|
||||||
|
|
||||||
|
// Set src1 to src1 * src2, then get the dot product by doing 2 horizontal adds
|
||||||
|
emitSafeMUL(src1_xmm, src2_xmm, scratch1);
|
||||||
|
haddps(src1_xmm, src1_xmm);
|
||||||
|
haddps(src1_xmm, src1_xmm);
|
||||||
|
|
||||||
|
// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
|
||||||
|
// Otherwise we do
|
||||||
|
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
|
||||||
|
shufps(src1_xmm, src1_xmm, 0); // src1_xmm = src1_xmm.xxxx
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
|
void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
|
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
|
||||||
const u32 src1 = getBits<12, 7>(instruction);
|
const u32 src1 = getBits<12, 7>(instruction);
|
||||||
const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
|
const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
|
||||||
const u32 idx = getBits<19, 2>(instruction);
|
const u32 idx = getBits<19, 2>(instruction);
|
||||||
const u32 dest = getBits<21, 5>(instruction);
|
const u32 dest = getBits<21, 5>(instruction);
|
||||||
|
|
||||||
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
|
|
||||||
loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
|
loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
|
||||||
loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
|
loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
|
||||||
dpps(src1_xmm, src2_xmm, 0b11111111); // 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA
|
|
||||||
|
if (!useSafeMUL) {
|
||||||
|
// 4-lane dot product between the 2 registers, store the result in all lanes of src1_xmm similarly to PICA
|
||||||
|
dpps(src1_xmm, src2_xmm, 0b11111111);
|
||||||
|
} else {
|
||||||
|
const u32 writeMask = operandDescriptor & 0xf;
|
||||||
|
|
||||||
|
// Set src1 to src1 * src2, then get the dot product by doing 2 horizontal adds
|
||||||
|
emitSafeMUL(src1_xmm, src2_xmm, scratch1);
|
||||||
|
haddps(src1_xmm, src1_xmm);
|
||||||
|
haddps(src1_xmm, src1_xmm);
|
||||||
|
|
||||||
|
// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
|
||||||
|
// Otherwise we do
|
||||||
|
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
|
||||||
|
shufps(src1_xmm, src1_xmm, 0); // src1_xmm = src1_xmm.xxxx
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -553,7 +602,6 @@ void ShaderEmitter::recDPH(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 idx = getBits<19, 2>(instruction);
|
const u32 idx = getBits<19, 2>(instruction);
|
||||||
const u32 dest = getBits<21, 5>(instruction);
|
const u32 dest = getBits<21, 5>(instruction);
|
||||||
|
|
||||||
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
|
|
||||||
loadRegister<1>(src1_xmm, shader, src1, isDPHI ? 0 : idx, operandDescriptor);
|
loadRegister<1>(src1_xmm, shader, src1, isDPHI ? 0 : idx, operandDescriptor);
|
||||||
loadRegister<2>(src2_xmm, shader, src2, isDPHI ? idx : 0, operandDescriptor);
|
loadRegister<2>(src2_xmm, shader, src2, isDPHI ? idx : 0, operandDescriptor);
|
||||||
|
|
||||||
|
@ -566,7 +614,25 @@ void ShaderEmitter::recDPH(const PICAShader& shader, u32 instruction) {
|
||||||
unpcklpd(src1_xmm, scratch1);
|
unpcklpd(src1_xmm, scratch1);
|
||||||
}
|
}
|
||||||
|
|
||||||
dpps(src1_xmm, src2_xmm, 0b11111111); // 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA
|
// Now perform a DP4
|
||||||
|
if (!useSafeMUL) {
|
||||||
|
// 4-lane dot product between the 2 registers, store the result in all lanes of src1_xmm similarly to PICA
|
||||||
|
dpps(src1_xmm, src2_xmm, 0b11111111);
|
||||||
|
} else {
|
||||||
|
const u32 writeMask = operandDescriptor & 0xf;
|
||||||
|
|
||||||
|
// Set src1 to src1 * src2, then get the dot product by doing 2 horizontal adds
|
||||||
|
emitSafeMUL(src1_xmm, src2_xmm, scratch1);
|
||||||
|
haddps(src1_xmm, src1_xmm);
|
||||||
|
haddps(src1_xmm, src1_xmm);
|
||||||
|
|
||||||
|
// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
|
||||||
|
// Otherwise we do
|
||||||
|
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
|
||||||
|
shufps(src1_xmm, src1_xmm, 0); // src1_xmm = src1_xmm.xxxx
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -603,10 +669,15 @@ void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) {
|
||||||
const u32 idx = getBits<19, 2>(instruction);
|
const u32 idx = getBits<19, 2>(instruction);
|
||||||
const u32 dest = getBits<21, 5>(instruction);
|
const u32 dest = getBits<21, 5>(instruction);
|
||||||
|
|
||||||
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
|
|
||||||
loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
|
loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
|
||||||
loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
|
loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
|
||||||
mulps(src1_xmm, src2_xmm);
|
|
||||||
|
if (!useSafeMUL) {
|
||||||
|
mulps(src1_xmm, src2_xmm);
|
||||||
|
} else {
|
||||||
|
emitSafeMUL(src1_xmm, src2_xmm, scratch1);
|
||||||
|
}
|
||||||
|
|
||||||
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -662,23 +733,31 @@ void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) {
|
||||||
loadRegister<2>(src2_xmm, shader, src2, isMADI ? 0 : idx, operandDescriptor);
|
loadRegister<2>(src2_xmm, shader, src2, isMADI ? 0 : idx, operandDescriptor);
|
||||||
loadRegister<3>(src3_xmm, shader, src3, isMADI ? idx : 0, operandDescriptor);
|
loadRegister<3>(src3_xmm, shader, src3, isMADI ? idx : 0, operandDescriptor);
|
||||||
|
|
||||||
// TODO: Implement safe PICA mul
|
|
||||||
// If we have FMA3, optimize MAD to use FMA
|
// If we have FMA3, optimize MAD to use FMA
|
||||||
if (haveFMA3) {
|
if (!useSafeMUL) {
|
||||||
vfmadd213ps(src1_xmm, src2_xmm, src3_xmm);
|
if (haveFMA3) {
|
||||||
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
vfmadd213ps(src1_xmm, src2_xmm, src3_xmm);
|
||||||
}
|
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
||||||
|
|
||||||
// If we don't have FMA3, do a multiplication and addition
|
|
||||||
else {
|
|
||||||
// Multiply src1 * src2
|
|
||||||
if (haveAVX) {
|
|
||||||
vmulps(scratch1, src1_xmm, src2_xmm);
|
|
||||||
} else {
|
|
||||||
movaps(scratch1, src1_xmm);
|
|
||||||
mulps(scratch1, src2_xmm);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If we don't have FMA3, do a multiplication and addition
|
||||||
|
else {
|
||||||
|
// Multiply src1 * src2
|
||||||
|
if (haveAVX) {
|
||||||
|
vmulps(scratch1, src1_xmm, src2_xmm);
|
||||||
|
} else {
|
||||||
|
movaps(scratch1, src1_xmm);
|
||||||
|
mulps(scratch1, src2_xmm);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add src3
|
||||||
|
addps(scratch1, src3_xmm);
|
||||||
|
storeRegister(scratch1, shader, dest, operandDescriptor);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
movaps(scratch1, src1_xmm);
|
||||||
|
emitSafeMUL(scratch1, src2_xmm, src1_xmm);
|
||||||
|
|
||||||
// Add src3
|
// Add src3
|
||||||
addps(scratch1, src3_xmm);
|
addps(scratch1, src3_xmm);
|
||||||
storeRegister(scratch1, shader, dest, operandDescriptor);
|
storeRegister(scratch1, shader, dest, operandDescriptor);
|
||||||
|
@ -1115,6 +1194,41 @@ Xbyak::Label ShaderEmitter::emitLog2Func() {
|
||||||
return subroutine;
|
return subroutine;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emit a PICA200-style multiplication: src1 = src1 * src2, where 0 * inf and inf * 0 yield 0 instead of NaN.
//
// A lane of the product is forced to 0 when the product is NaN but neither input lane was NaN
// (ie the NaN was created by the multiplication itself). NaNs that were already present in an
// input are passed through unchanged.
//
// src1:    First multiplicand, receives the result
// src2:    Second multiplicand. Clobbered on the non-AVX-512 path, preserved on the AVX-512 path
// scratch: Temporary register, always clobbered
//
// The AVX-512 path additionally clobbers opmask register k1.
// NOTE(review): scratch is expected to be a different register from src1 and src2; the callers
// visible in this file always pass distinct registers - confirm before adding new call sites.
//
// Based on Citra implementation, particularly the AVX-512 version
void ShaderEmitter::emitSafeMUL(Xmm src1, Xmm src2, Xmm scratch) {
	// Query the two features separately so the check means "both are present" regardless of
	// how Cpu::has treats a combined feature mask
	const bool haveAVX512 = cpuCaps.has(Cpu::tAVX512F) && cpuCaps.has(Cpu::tAVX512VL);

	if (haveAVX512) {
		const Xbyak::Opmask zeroMask = k1;

		vmulps(scratch, src1, src2);
		// Mask of any NaN values found in the result
		vcmpunordps(zeroMask, scratch, scratch);
		// Mask of any non-NaN inputs producing NaN results
		vcmpordps(zeroMask | zeroMask, src1, src2);

		// Invert the mask so that it selects the lanes to KEEP.
		// KNOTW only needs AVX512F, whereas KNOTB needs AVX512DQ which is not checked above.
		// The 128-bit masked move below only reads the low 4 mask bits, so the wider NOT is equivalent.
		knotw(zeroMask, zeroMask);
		// Copy the product into src1, zeroing the lanes that are not selected by the mask
		vmovaps(src1 | zeroMask | T_z, scratch);
	} else {
		// scratch = mask of lanes where both src1 and src2 are NOT NaN (cmpps with imm = 7)
		if (haveAVX) {
			vcmpordps(scratch, src1, src2);
		} else {
			movaps(scratch, src1);
			cmpordps(scratch, src2);
		}

		mulps(src1, src2);
		// src2 = mask of lanes where the product IS NaN (a NaN in src2 always implies a NaN product)
		cmpunordps(src2, src1);
		// XORing the 2 masks leaves 0 only in lanes where non-NaN inputs produced a NaN result
		xorps(src2, scratch);
		// ANDing the product with the xor result sets exactly those lanes to 0
		andps(src1, src2);
	}
}
|
||||||
|
|
||||||
Xbyak::Label ShaderEmitter::emitExp2Func() {
|
Xbyak::Label ShaderEmitter::emitExp2Func() {
|
||||||
Xbyak::Label subroutine;
|
Xbyak::Label subroutine;
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue