mirror of
https://github.com/wheremyfoodat/Panda3DS.git
synced 2025-04-06 14:15:41 +12:00
Merge branch 'master' into specialized-shaders-2
This commit is contained in:
commit
fe4bbea2ef
2 changed files with 142 additions and 23 deletions
|
@ -32,6 +32,8 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
|
|||
Label negateVector;
|
||||
// Vector value of (1.0, 1.0, 1.0, 1.0) for SLT(i)/SGE(i)
|
||||
Label onesVector;
|
||||
// Vector value of (0xFF, 0xFF, 0xFF, 0) for setting the w component to 0 in DP3
|
||||
Label dp3Vector;
|
||||
|
||||
u32 recompilerPC = 0; // PC the recompiler is currently recompiling @
|
||||
u32 loopLevel = 0; // The current loop nesting level (0 = not in a loop)
|
||||
|
@ -49,6 +51,9 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
|
|||
Xbyak::Label emitExp2Func();
|
||||
Xbyak::util::Cpu cpuCaps;
|
||||
|
||||
// Emit a PICA200-compliant multiplication that handles "0 * inf = 0"
|
||||
void emitSafeMUL(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch);
|
||||
|
||||
// Compile all instructions from [current recompiler PC, end)
|
||||
void compileUntil(const PICAShader& shaderUnit, u32 endPC);
|
||||
// Compile instruction "instr"
|
||||
|
|
|
@ -12,6 +12,9 @@ using namespace Xbyak;
|
|||
using namespace Xbyak::util;
|
||||
using namespace Helpers;
|
||||
|
||||
// TODO: Expose safe/unsafe optimizations to the user
|
||||
constexpr bool useSafeMUL = false;
|
||||
|
||||
// The shader recompiler uses quite an odd internal ABI
|
||||
// We make use of the fact that in regular conditions, we should pretty much never be calling C++ code from recompiled shader code
|
||||
// This allows us to establish an ABI that's optimized for this sort of workflow, statically allocating volatile host registers
|
||||
|
@ -45,6 +48,16 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) {
|
|||
L(onesVector);
|
||||
dd(0x3f800000); dd(0x3f800000); dd(0x3f800000); dd(0x3f800000); // 1.0 4 times
|
||||
|
||||
if (useSafeMUL) {
|
||||
// When doing safe mul, we need a vector to set only the w component to 0 for DP3
|
||||
L(dp3Vector);
|
||||
|
||||
dd(0xFFFFFFFF);
|
||||
dd(0xFFFFFFFF);
|
||||
dd(0xFFFFFFFF);
|
||||
dd(0);
|
||||
}
|
||||
|
||||
// Emit prologue first
|
||||
align(16);
|
||||
prologueCb = getCurr<PrologueCallback>();
|
||||
|
@ -523,24 +536,60 @@ void ShaderEmitter::recDP3(const PICAShader& shader, u32 instruction) {
|
|||
const u32 idx = getBits<19, 2>(instruction);
|
||||
const u32 dest = getBits<21, 5>(instruction);
|
||||
|
||||
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
|
||||
loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
|
||||
loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
|
||||
dpps(src1_xmm, src2_xmm, 0b01111111); // 3-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA
|
||||
|
||||
if (!useSafeMUL) {
|
||||
dpps(src1_xmm, src2_xmm, 0b01111111);
|
||||
} else {
|
||||
const u32 writeMask = operandDescriptor & 0xf;
|
||||
|
||||
// Set w component to 0 and do a DP4
|
||||
andps(src1_xmm, xword[rip + dp3Vector]);
|
||||
|
||||
// Set src1 to src1 * src2, then get the dot product by doing 2 horizontal adds
|
||||
emitSafeMUL(src1_xmm, src2_xmm, scratch1);
|
||||
haddps(src1_xmm, src1_xmm);
|
||||
haddps(src1_xmm, src1_xmm);
|
||||
|
||||
// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
|
||||
// Otherwise we do
|
||||
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
|
||||
shufps(src1_xmm, src1_xmm, 0); // src1_xmm = src1_xmm.xxxx
|
||||
}
|
||||
}
|
||||
|
||||
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
|
||||
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
|
||||
const u32 src1 = getBits<12, 7>(instruction);
|
||||
const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
|
||||
const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
|
||||
const u32 idx = getBits<19, 2>(instruction);
|
||||
const u32 dest = getBits<21, 5>(instruction);
|
||||
|
||||
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
|
||||
loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
|
||||
loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
|
||||
dpps(src1_xmm, src2_xmm, 0b11111111); // 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA
|
||||
|
||||
if (!useSafeMUL) {
|
||||
// 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA
|
||||
dpps(src1_xmm, src2_xmm, 0b11111111);
|
||||
} else {
|
||||
const u32 writeMask = operandDescriptor & 0xf;
|
||||
|
||||
// Set src1 to src1 * src2, then get the dot product by doing 2 horizontal adds
|
||||
emitSafeMUL(src1_xmm, src2_xmm, scratch1);
|
||||
haddps(src1_xmm, src1_xmm);
|
||||
haddps(src1_xmm, src1_xmm);
|
||||
|
||||
// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
|
||||
// Otherwise we do
|
||||
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
|
||||
shufps(src1_xmm, src1_xmm, 0); // src1_xmm = src1_xmm.xxxx
|
||||
}
|
||||
}
|
||||
|
||||
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
|
@ -553,7 +602,6 @@ void ShaderEmitter::recDPH(const PICAShader& shader, u32 instruction) {
|
|||
const u32 idx = getBits<19, 2>(instruction);
|
||||
const u32 dest = getBits<21, 5>(instruction);
|
||||
|
||||
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
|
||||
loadRegister<1>(src1_xmm, shader, src1, isDPHI ? 0 : idx, operandDescriptor);
|
||||
loadRegister<2>(src2_xmm, shader, src2, isDPHI ? idx : 0, operandDescriptor);
|
||||
|
||||
|
@ -566,7 +614,25 @@ void ShaderEmitter::recDPH(const PICAShader& shader, u32 instruction) {
|
|||
unpcklpd(src1_xmm, scratch1);
|
||||
}
|
||||
|
||||
dpps(src1_xmm, src2_xmm, 0b11111111); // 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA
|
||||
// Now perform a DP4
|
||||
if (!useSafeMUL) {
|
||||
// 4-lane dot product between the 2 registers, store the result in all lanes of scratch1 similarly to PICA
|
||||
dpps(src1_xmm, src2_xmm, 0b11111111);
|
||||
} else {
|
||||
const u32 writeMask = operandDescriptor & 0xf;
|
||||
|
||||
// Set src1 to src1 * src2, then get the dot product by doing 2 horizontal adds
|
||||
emitSafeMUL(src1_xmm, src2_xmm, scratch1);
|
||||
haddps(src1_xmm, src1_xmm);
|
||||
haddps(src1_xmm, src1_xmm);
|
||||
|
||||
// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
|
||||
// Otherwise we do
|
||||
if (writeMask != 0x8) { // Copy bottom lane to all lanes if we're not simply writing back x
|
||||
shufps(src1_xmm, src1_xmm, 0); // src1_xmm = src1_xmm.xxxx
|
||||
}
|
||||
}
|
||||
|
||||
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
|
@ -603,10 +669,15 @@ void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) {
|
|||
const u32 idx = getBits<19, 2>(instruction);
|
||||
const u32 dest = getBits<21, 5>(instruction);
|
||||
|
||||
// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
|
||||
loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
|
||||
loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
|
||||
mulps(src1_xmm, src2_xmm);
|
||||
|
||||
if (!useSafeMUL) {
|
||||
mulps(src1_xmm, src2_xmm);
|
||||
} else {
|
||||
emitSafeMUL(src1_xmm, src2_xmm, scratch1);
|
||||
}
|
||||
|
||||
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
|
@ -662,23 +733,31 @@ void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) {
|
|||
loadRegister<2>(src2_xmm, shader, src2, isMADI ? 0 : idx, operandDescriptor);
|
||||
loadRegister<3>(src3_xmm, shader, src3, isMADI ? idx : 0, operandDescriptor);
|
||||
|
||||
// TODO: Implement safe PICA mul
|
||||
// If we have FMA3, optimize MAD to use FMA
|
||||
if (haveFMA3) {
|
||||
vfmadd213ps(src1_xmm, src2_xmm, src3_xmm);
|
||||
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
// If we don't have FMA3, do a multiplication and addition
|
||||
else {
|
||||
// Multiply src1 * src2
|
||||
if (haveAVX) {
|
||||
vmulps(scratch1, src1_xmm, src2_xmm);
|
||||
} else {
|
||||
movaps(scratch1, src1_xmm);
|
||||
mulps(scratch1, src2_xmm);
|
||||
if (!useSafeMUL) {
|
||||
if (haveFMA3) {
|
||||
vfmadd213ps(src1_xmm, src2_xmm, src3_xmm);
|
||||
storeRegister(src1_xmm, shader, dest, operandDescriptor);
|
||||
}
|
||||
|
||||
// If we don't have FMA3, do a multiplication and addition
|
||||
else {
|
||||
// Multiply src1 * src2
|
||||
if (haveAVX) {
|
||||
vmulps(scratch1, src1_xmm, src2_xmm);
|
||||
} else {
|
||||
movaps(scratch1, src1_xmm);
|
||||
mulps(scratch1, src2_xmm);
|
||||
}
|
||||
|
||||
// Add src3
|
||||
addps(scratch1, src3_xmm);
|
||||
storeRegister(scratch1, shader, dest, operandDescriptor);
|
||||
}
|
||||
} else {
|
||||
movaps(scratch1, src1_xmm);
|
||||
emitSafeMUL(scratch1, src2_xmm, src1_xmm);
|
||||
|
||||
// Add src3
|
||||
addps(scratch1, src3_xmm);
|
||||
storeRegister(scratch1, shader, dest, operandDescriptor);
|
||||
|
@ -1115,6 +1194,41 @@ Xbyak::Label ShaderEmitter::emitLog2Func() {
|
|||
return subroutine;
|
||||
}
|
||||
|
||||
void ShaderEmitter::emitSafeMUL(Xmm src1, Xmm src2, Xmm scratch) {
|
||||
// 0 * inf and inf * 0 in the PICA should return 0 instead of NaN
|
||||
// This can be done by checking for NaNs before and after a multiplication
|
||||
// To do this we can create a mask of which components of src1/src2 are NOT NaN using cmpordsps (cmpps with imm = 7)
|
||||
// Then we multiply src1 and src2 and reate a mask of which components of the result ARE NaN using cmpunordps
|
||||
// If the NaNs didn't exist (ie they were created by 0 * inf) before then we set them to 0 by XORing the 2 masks and ANDing the multiplication
|
||||
// result with the xor result
|
||||
// Based on Citra implementation, particularly the AVX-512 version
|
||||
|
||||
if (cpuCaps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) {
|
||||
const Xbyak::Opmask zeroMask = k1;
|
||||
|
||||
vmulps(scratch, src1, src2);
|
||||
// Mask of any NaN values found in the result
|
||||
vcmpunordps(zeroMask, scratch, scratch);
|
||||
// Mask of any non-NaN inputs producing NaN results
|
||||
vcmpordps(zeroMask | zeroMask, src1, src2);
|
||||
|
||||
knotb(zeroMask, zeroMask);
|
||||
vmovaps(src1 | zeroMask | T_z, scratch);
|
||||
} else {
|
||||
if (haveAVX) {
|
||||
vcmpordps(scratch, src1, src2);
|
||||
} else {
|
||||
movaps(scratch, src1);
|
||||
cmpordps(scratch, src2);
|
||||
}
|
||||
|
||||
mulps(src1, src2);
|
||||
cmpunordps(src2, src1);
|
||||
xorps(src2, scratch);
|
||||
andps(src1, src2);
|
||||
}
|
||||
}
|
||||
|
||||
Xbyak::Label ShaderEmitter::emitExp2Func() {
|
||||
Xbyak::Label subroutine;
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue