From 8a13b8c878917f4d6f59ddf683c80718340e0656 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Fri, 30 Jun 2023 02:02:56 +0300 Subject: [PATCH] [Shader JIT] Compile MAD to FMA when possible --- .../PICA/dynapica/shader_rec_emitter_x64.hpp | 2 ++ .../PICA/dynapica/shader_rec_emitter_x64.cpp | 27 ++++++++++++------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/include/PICA/dynapica/shader_rec_emitter_x64.hpp b/include/PICA/dynapica/shader_rec_emitter_x64.hpp index 4ce2942a..3f33da06 100644 --- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp +++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp @@ -36,6 +36,7 @@ class ShaderEmitter : public Xbyak::CodeGenerator { bool haveSSE4_1 = false; // Shows if the CPU supports SSE4.1 bool haveAVX = false; // Shows if the CPU supports AVX (NOT AVX2, NOT AVX512. Regular AVX) + bool haveFMA3 = false; // Shows if the CPU supports FMA3 // Compile all instructions from [current recompiler PC, end) void compileUntil(const PICAShader& shaderUnit, u32 endPC); @@ -112,6 +113,7 @@ public: haveSSE4_1 = cpu.has(Xbyak::util::Cpu::tSSE41); haveAVX = cpu.has(Xbyak::util::Cpu::tAVX); + haveFMA3 = cpu.has(Xbyak::util::Cpu::tFMA); if (!cpu.has(Xbyak::util::Cpu::tSSE3)) { Helpers::panic("This CPU does not support SSE3. Please use the shader interpreter instead"); diff --git a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp index cecd740c..8ccc4838 100644 --- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp +++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp @@ -580,17 +580,26 @@ void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) { loadRegister<3>(src3_xmm, shader, src3, 0, operandDescriptor); // TODO: Implement safe PICA mul - // Multiply src1 * src2 - if (haveAVX) { - vmulps(scratch1, src1_xmm, src2_xmm); - } else { - movaps(scratch1, src1_xmm); - mulps(scratch1, src2_xmm); + // If we have FMA3, optimize MAD to use FMA + if (haveFMA3) { + vfmadd213ps(src1_xmm, src2_xmm, src3_xmm); + storeRegister(src1_xmm, shader, dest, operandDescriptor); } + + // If we don't have FMA3, do a multiplication and addition + else { + // Multiply src1 * src2 + if (haveAVX) { + vmulps(scratch1, src1_xmm, src2_xmm); + } else { + movaps(scratch1, src1_xmm); + mulps(scratch1, src2_xmm); + } - // Add src3 - addps(scratch1, src3_xmm); - storeRegister(scratch1, shader, dest, operandDescriptor); + // Add src3 + addps(scratch1, src3_xmm); + storeRegister(scratch1, shader, dest, operandDescriptor); + } } void ShaderEmitter::recCMP(const PICAShader& shader, u32 instruction) {