diff --git a/include/PICA/dynapica/shader_rec.hpp b/include/PICA/dynapica/shader_rec.hpp index 2dabc128..a242d02f 100644 --- a/include/PICA/dynapica/shader_rec.hpp +++ b/include/PICA/dynapica/shader_rec.hpp @@ -22,8 +22,11 @@ class ShaderJIT { ShaderCache cache; #endif + bool accurateMul = false; public: + void setAccurateMul(bool value) { accurateMul = value; } + #ifdef PANDA3DS_SHADER_JIT_SUPPORTED // Call this before starting to process a batch of vertices // This will read the PICA config (uploaded shader and shader operand descriptors) and search if we've already compiled this shader @@ -36,11 +39,11 @@ class ShaderJIT { static constexpr bool isAvailable() { return true; } #else void prepare(PICAShader& shaderUnit) { - Helpers::panic("Vertex Loader JIT: Tried to run ShaderJIT::Prepare on platform that does not support shader jit"); + Helpers::panic("Shader JIT: Tried to run ShaderJIT::Prepare on platform that does not support shader jit"); } void run(PICAShader& shaderUnit) { - Helpers::panic("Vertex Loader JIT: Tried to run ShaderJIT::Run on platform that does not support shader jit"); + Helpers::panic("Shader JIT: Tried to run ShaderJIT::Run on platform that does not support shader jit"); } // Define dummy callback. This should never be called if the shader JIT is not supported diff --git a/include/PICA/dynapica/shader_rec_emitter_arm64.hpp b/include/PICA/dynapica/shader_rec_emitter_arm64.hpp index 7411c430..9351f383 100644 --- a/include/PICA/dynapica/shader_rec_emitter_arm64.hpp +++ b/include/PICA/dynapica/shader_rec_emitter_arm64.hpp @@ -37,6 +37,8 @@ class ShaderEmitter : private oaknut::CodeBlock, public oaknut::CodeGenerator { // Shows whether the loaded shader has any log2 and exp2 instructions bool codeHasLog2 = false; bool codeHasExp2 = false; + // Whether to compile this shader using accurate, safe, non-IEEE multiplication (slow) or faster but less accurate mul + bool useSafeMUL = false; oaknut::Label log2Func, exp2Func; oaknut::Label emitLog2Func(); @@ -123,7 +125,7 @@ class ShaderEmitter : private oaknut::CodeBlock, public oaknut::CodeGenerator { PrologueCallback prologueCb = nullptr; // Initialize our emitter with "allocSize" bytes of memory allocated for the code buffer - ShaderEmitter() : oaknut::CodeBlock(allocSize), oaknut::CodeGenerator(oaknut::CodeBlock::ptr()) {} + ShaderEmitter(bool useSafeMUL) : oaknut::CodeBlock(allocSize), oaknut::CodeGenerator(oaknut::CodeBlock::ptr()), useSafeMUL(useSafeMUL) {} // PC must be a valid entrypoint here. It doesn't have that much overhead in this case, so we use std::array<>::at() to assert it does InstructionCallback getInstructionCallback(u32 pc) { return getLabelPointer(instructionLabels.at(pc)); } diff --git a/include/PICA/dynapica/shader_rec_emitter_x64.hpp b/include/PICA/dynapica/shader_rec_emitter_x64.hpp index 1052d6a0..a43bd2dc 100644 --- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp +++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp @@ -45,6 +45,8 @@ class ShaderEmitter : public Xbyak::CodeGenerator { // Shows whether the loaded shader has any log2 and exp2 instructions bool codeHasLog2 = false; bool codeHasExp2 = false; + // Whether to compile this shader using accurate, safe, non-IEEE multiplication (slow) or faster but less accurate mul + bool useSafeMUL = false; Xbyak::Label log2Func, exp2Func; Xbyak::Label emitLog2Func(); @@ -130,7 +132,7 @@ class ShaderEmitter : public Xbyak::CodeGenerator { PrologueCallback prologueCb = nullptr; // Initialize our emitter with "allocSize" bytes of RWX memory - ShaderEmitter() : Xbyak::CodeGenerator(allocSize) { + ShaderEmitter(bool useSafeMUL) : Xbyak::CodeGenerator(allocSize), useSafeMUL(useSafeMUL) { cpuCaps = Xbyak::util::Cpu(); haveSSE4_1 = cpuCaps.has(Xbyak::util::Cpu::tSSE41); diff --git a/include/config.hpp b/include/config.hpp index 339e651c..6dbae9e3 100644 --- a/include/config.hpp +++ b/include/config.hpp @@ -15,6 +15,7 @@ struct EmulatorConfig { bool shaderJitEnabled = shaderJitDefault; bool discordRpcEnabled = false; + bool accurateShaderMul = false; RendererType rendererType = RendererType::OpenGL; Audio::DSPCore::Type dspType = Audio::DSPCore::Type::Null; diff --git a/src/config.cpp b/src/config.cpp index 2f9b7e00..5af4d654 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -62,6 +62,7 @@ void EmulatorConfig::load() { shaderJitEnabled = toml::find_or(gpu, "EnableShaderJIT", shaderJitDefault); vsyncEnabled = toml::find_or(gpu, "EnableVSync", true); + accurateShaderMul = toml::find_or(gpu, "AccurateShaderMultiplication", false); } } @@ -125,6 +126,7 @@ void EmulatorConfig::save() { data["GPU"]["EnableShaderJIT"] = shaderJitEnabled; data["GPU"]["Renderer"] = std::string(Renderer::typeToString(rendererType)); data["GPU"]["EnableVSync"] = vsyncEnabled; + data["GPU"]["AccurateShaderMultiplication"] = accurateShaderMul; data["Audio"]["DSPEmulation"] = std::string(Audio::DSPCore::typeToString(dspType)); data["Audio"]["EnableAudio"] = audioEnabled; diff --git a/src/core/PICA/dynapica/shader_rec.cpp b/src/core/PICA/dynapica/shader_rec.cpp index 20e171d7..e3c13c1e 100644 --- a/src/core/PICA/dynapica/shader_rec.cpp +++ b/src/core/PICA/dynapica/shader_rec.cpp @@ -16,7 +16,7 @@ void ShaderJIT::prepare(PICAShader& shaderUnit) { auto it = cache.find(hash); if (it == cache.end()) { // Block has not been compiled yet - auto emitter = std::make_unique(); + auto emitter = std::make_unique(accurateMul); emitter->compile(shaderUnit); // Get pointer to callbacks entrypointCallback = emitter->getInstructionCallback(shaderUnit.entrypoint); diff --git a/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp index 15200e76..296ec932 100644 --- a/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp +++ b/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp @@ -7,9 +7,6 @@ using namespace Helpers; using namespace oaknut; using namespace oaknut::util; -// TODO: Expose safe/unsafe optimizations to the user -constexpr bool useSafeMUL = true; - // Similar to the x64 recompiler, we use an odd internal ABI, which abuses the fact that we'll very rarely be calling C++ functions // So to avoid pushing and popping, we'll be making use of volatile registers as much as possible static constexpr QReg src1Vec = Q1; @@ -491,7 +488,7 @@ void ShaderEmitter::recDP3(const PICAShader& shader, u32 instruction) { // Now do a full DP4 // Do a piecewise multiplication of the vectors first - if constexpr (useSafeMUL) { + if (useSafeMUL) { emitSafeMUL(src1Vec, src2Vec, scratch1Vec); } else { FMUL(src1Vec.S4(), src1Vec.S4(), src2Vec.S4()); @@ -518,7 +515,7 @@ void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) { loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor); // Do a piecewise multiplication of the vectors first - if constexpr (useSafeMUL) { + if (useSafeMUL) { emitSafeMUL(src1Vec, src2Vec, scratch1Vec); } else { FMUL(src1Vec.S4(), src1Vec.S4(), src2Vec.S4()); @@ -551,7 +548,7 @@ void ShaderEmitter::recDPH(const PICAShader& shader, u32 instruction) { // Now perform a DP4 // Do a piecewise multiplication of the vectors first - if constexpr (useSafeMUL) { + if (useSafeMUL) { emitSafeMUL(src1Vec, src2Vec, scratch1Vec); } else { FMUL(src1Vec.S4(), src1Vec.S4(), src2Vec.S4()); @@ -834,7 +831,7 @@ void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) { loadRegister<1>(src1Vec, shader, src1, idx, operandDescriptor); loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor); - if constexpr (useSafeMUL) { + if (useSafeMUL) { emitSafeMUL(src1Vec, src2Vec, scratch1Vec); } else { FMUL(src1Vec.S4(), src1Vec.S4(), src2Vec.S4()); @@ -907,7 +904,7 @@ void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) { loadRegister<2>(src2Vec, shader, src2, isMADI ? 0 : idx, operandDescriptor); loadRegister<3>(src3Vec, shader, src3, isMADI ? idx : 0, operandDescriptor); - if constexpr (useSafeMUL) { + if (useSafeMUL) { emitSafeMUL(src1Vec, src2Vec, scratch1Vec); FADD(src3Vec.S4(), src3Vec.S4(), src1Vec.S4()); } else { diff --git a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp index e7bafe9f..142ff8c8 100644 --- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp +++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp @@ -12,9 +12,6 @@ using namespace Xbyak; using namespace Xbyak::util; using namespace Helpers; -// TODO: Expose safe/unsafe optimizations to the user -constexpr bool useSafeMUL = false; - // The shader recompiler uses quite an odd internal ABI // We make use of the fact that in regular conditions, we should pretty much never be calling C++ code from recompiled shader code // This allows us to establish an ABI that's optimized for this sort of workflow, statically allocating volatile host registers diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index a777d0a3..ed0e5420 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -64,6 +64,8 @@ void GPU::reset() { regs.fill(0); shaderUnit.reset(); shaderJIT.reset(); + shaderJIT.setAccurateMul(config.accurateShaderMul); + std::memset(vram, 0, vramSize); lightingLUT.fill(0); lightingLUTDirty = true; diff --git a/src/libretro_core.cpp b/src/libretro_core.cpp index f9772b37..3825d3ed 100644 --- a/src/libretro_core.cpp +++ b/src/libretro_core.cpp @@ -146,6 +146,7 @@ static bool FetchVariableBool(std::string key, bool def) { static void configInit() { static const retro_variable values[] = { {"panda3ds_use_shader_jit", "Enable shader JIT; enabled|disabled"}, + {"panda3ds_accurate_shader_mul", "Enable accurate shader multiplication; disabled|enabled"}, {"panda3ds_use_vsync", "Enable VSync; enabled|disabled"}, {"panda3ds_dsp_emulation", "DSP emulation; Null|HLE|LLE"}, {"panda3ds_use_audio", "Enable audio; disabled|enabled"}, @@ -153,7 +154,7 @@ static void configInit() { {"panda3ds_write_protect_virtual_sd", "Write protect virtual SD card; disabled|enabled"}, {"panda3ds_battery_level", "Battery percentage; 5|10|20|30|50|70|90|100"}, {"panda3ds_use_charger", "Charger plugged; enabled|disabled"}, - {nullptr, nullptr} + {nullptr, nullptr}, }; envCallbacks(RETRO_ENVIRONMENT_SET_VARIABLES, (void*)values); @@ -171,6 +172,7 @@ static void configUpdate() { config.audioEnabled = FetchVariableBool("panda3ds_use_audio", false); config.sdCardInserted = FetchVariableBool("panda3ds_use_virtual_sd", true); config.sdWriteProtected = FetchVariableBool("panda3ds_write_protect_virtual_sd", false); + config.accurateShaderMul = FetchVariableBool("panda3ds_accurate_shader_mul", false); config.discordRpcEnabled = false; config.save();