[Shader JIT] Use 128-bit AVX instruction forms in some places when available; fix CMP so cmp.y is taken from the y lane of the comparison result

This commit is contained in:
wheremyfoodat 2023-06-11 01:49:08 +03:00
parent 18df6f9531
commit 48b2af6a17
2 changed files with 29 additions and 10 deletions

View file

@ -32,6 +32,7 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
u32 recompilerPC = 0; // PC the recompiler is currently recompiling @ u32 recompilerPC = 0; // PC the recompiler is currently recompiling @
bool haveSSE4_1 = false; // Shows if the CPU supports SSE4.1 bool haveSSE4_1 = false; // Shows if the CPU supports SSE4.1
bool haveAVX = false; // Shows if the CPU supports AVX (NOT AVX2, NOT AVX512. Regular AVX)
// Compile all instructions from [current recompiler PC, end) // Compile all instructions from [current recompiler PC, end)
void compileUntil(const PICAShader& shaderUnit, u32 endPC); void compileUntil(const PICAShader& shaderUnit, u32 endPC);
@ -101,6 +102,8 @@ public:
const auto cpu = Xbyak::util::Cpu(); const auto cpu = Xbyak::util::Cpu();
haveSSE4_1 = cpu.has(Xbyak::util::Cpu::tSSE41); haveSSE4_1 = cpu.has(Xbyak::util::Cpu::tSSE41);
haveAVX = cpu.has(Xbyak::util::Cpu::tAVX);
if (!cpu.has(Xbyak::util::Cpu::tSSE3)) { if (!cpu.has(Xbyak::util::Cpu::tSSE3)) {
Helpers::panic("This CPU does not support SSE3. Please use the shader interpreter instead"); Helpers::panic("This CPU does not support SSE3. Please use the shader interpreter instead");
} }

View file

@ -269,8 +269,12 @@ void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest
if (index == 0) { // Bottom lane, no need to shift if (index == 0) { // Bottom lane, no need to shift
movss(dword[statePointer + lane_offset], source); movss(dword[statePointer + lane_offset], source);
} else { // Shift right by 32 * index, then write bottom lane } else { // Shift right by 32 * index, then write bottom lane
movaps(scratch1, source); if (haveAVX) {
psrldq(scratch1, index * sizeof(float)); vpsrldq(scratch1, source, index * sizeof(float));
} else {
movaps(scratch1, source);
psrldq(scratch1, index * sizeof(float));
}
movss(dword[statePointer + lane_offset], scratch1); movss(dword[statePointer + lane_offset], scratch1);
} }
} else if (haveSSE4_1) { } else if (haveSSE4_1) {
@ -505,9 +509,16 @@ void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) {
loadRegister<2>(src2_xmm, shader, src2, idx, operandDescriptor); loadRegister<2>(src2_xmm, shader, src2, idx, operandDescriptor);
loadRegister<3>(src3_xmm, shader, src3, 0, operandDescriptor); loadRegister<3>(src3_xmm, shader, src3, 0, operandDescriptor);
movaps(scratch1, src1_xmm);
// TODO: Implement safe PICA mul // TODO: Implement safe PICA mul
mulps(scratch1, src2_xmm); // Multiply src1 * src2
if (haveAVX) {
vmulps(scratch1, src1_xmm, src2_xmm);
} else {
movaps(scratch1, src1_xmm);
mulps(scratch1, src2_xmm);
}
// Add src3
addps(scratch1, src3_xmm); addps(scratch1, src3_xmm);
storeRegister(scratch1, shader, dest, operandDescriptor); storeRegister(scratch1, shader, dest, operandDescriptor);
} }
@ -564,18 +575,23 @@ void ShaderEmitter::recCMP(const PICAShader& shader, u32 instruction) {
shr(rax, 32); // Check top 32 bits (shr will set the zero flag properly) shr(rax, 32); // Check top 32 bits (shr will set the zero flag properly)
setne(byte[statePointer + cmpRegYOffset]); // set cmp.y setne(byte[statePointer + cmpRegYOffset]); // set cmp.y
} else { } else {
movaps(scratch1, lhs_x); // Copy the left hand operands to temp registers if (haveAVX) {
movaps(scratch2, lhs_y); vcmpps(scratch1, lhs_x, rhs_x, compareFuncX); // Perform comparison for X component and store result in scratch1
vcmpps(scratch2, lhs_y, rhs_y, compareFuncY); // Perform comparison for Y component and store result in scratch2
} else {
movaps(scratch1, lhs_x); // Copy the left hand operands to temp registers
movaps(scratch2, lhs_y);
cmpps(scratch1, rhs_x, compareFuncX); // Perform the compares cmpps(scratch1, rhs_x, compareFuncX); // Perform the compares
cmpps(scratch2, rhs_y, compareFuncY); cmpps(scratch2, rhs_y, compareFuncY);
}
movd(eax, scratch1); // Move results to eax for X and edx for Y movd(eax, scratch1); // Move results to eax for X and edx for Y
movd(edx, scratch2); movq(rdx, scratch2);
test(eax, eax); // Write back results with setne test(eax, eax); // Write back results with setne
setne(byte[statePointer + cmpRegXOffset]); setne(byte[statePointer + cmpRegXOffset]);
test(edx, edx); shr(rdx, 32); // We want the y component for the second comparison. This shift will set zero flag to 0 if the comparison is true
setne(byte[statePointer + cmpRegYOffset]); setne(byte[statePointer + cmpRegYOffset]);
} }
} }