Merge pull request #537 from wheremyfoodat/accurate-mul

Add accurate shader multiplication option
This commit is contained in:
wheremyfoodat 2024-07-16 19:46:12 +00:00 committed by GitHub
commit 71b8031d5b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 25 additions and 17 deletions

View file

@ -22,8 +22,11 @@ class ShaderJIT {
ShaderCache cache;
#endif
bool accurateMul = false;
public:
void setAccurateMul(bool value) { accurateMul = value; }
#ifdef PANDA3DS_SHADER_JIT_SUPPORTED
// Call this before starting to process a batch of vertices
// This will read the PICA config (uploaded shader and shader operand descriptors) and search if we've already compiled this shader
@ -36,11 +39,11 @@ class ShaderJIT {
static constexpr bool isAvailable() { return true; }
#else
void prepare(PICAShader& shaderUnit) {
Helpers::panic("Vertex Loader JIT: Tried to run ShaderJIT::Prepare on platform that does not support shader jit");
Helpers::panic("Shader JIT: Tried to run ShaderJIT::Prepare on platform that does not support shader jit");
}
void run(PICAShader& shaderUnit) {
Helpers::panic("Vertex Loader JIT: Tried to run ShaderJIT::Run on platform that does not support shader jit");
Helpers::panic("Shader JIT: Tried to run ShaderJIT::Run on platform that does not support shader jit");
}
// Define dummy callback. This should never be called if the shader JIT is not supported

View file

@ -37,6 +37,8 @@ class ShaderEmitter : private oaknut::CodeBlock, public oaknut::CodeGenerator {
// Shows whether the loaded shader has any log2 and exp2 instructions
bool codeHasLog2 = false;
bool codeHasExp2 = false;
// Whether to compile this shader using accurate, safe, non-IEEE multiplication (slow) or faster but less accurate mul
bool useSafeMUL = false;
oaknut::Label log2Func, exp2Func;
oaknut::Label emitLog2Func();
@ -123,7 +125,7 @@ class ShaderEmitter : private oaknut::CodeBlock, public oaknut::CodeGenerator {
PrologueCallback prologueCb = nullptr;
// Initialize our emitter with "allocSize" bytes of memory allocated for the code buffer
ShaderEmitter() : oaknut::CodeBlock(allocSize), oaknut::CodeGenerator(oaknut::CodeBlock::ptr()) {}
ShaderEmitter(bool useSafeMUL) : oaknut::CodeBlock(allocSize), oaknut::CodeGenerator(oaknut::CodeBlock::ptr()), useSafeMUL(useSafeMUL) {}
// PC must be a valid entrypoint here. It doesn't have that much overhead in this case, so we use std::array<>::at() to assert it does
InstructionCallback getInstructionCallback(u32 pc) { return getLabelPointer<InstructionCallback>(instructionLabels.at(pc)); }

View file

@ -45,6 +45,8 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
// Shows whether the loaded shader has any log2 and exp2 instructions
bool codeHasLog2 = false;
bool codeHasExp2 = false;
// Whether to compile this shader using accurate, safe, non-IEEE multiplication (slow) or faster but less accurate mul
bool useSafeMUL = false;
Xbyak::Label log2Func, exp2Func;
Xbyak::Label emitLog2Func();
@ -130,7 +132,7 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
PrologueCallback prologueCb = nullptr;
// Initialize our emitter with "allocSize" bytes of RWX memory
ShaderEmitter() : Xbyak::CodeGenerator(allocSize) {
ShaderEmitter(bool useSafeMUL) : Xbyak::CodeGenerator(allocSize), useSafeMUL(useSafeMUL) {
cpuCaps = Xbyak::util::Cpu();
haveSSE4_1 = cpuCaps.has(Xbyak::util::Cpu::tSSE41);

View file

@ -15,6 +15,7 @@ struct EmulatorConfig {
bool shaderJitEnabled = shaderJitDefault;
bool discordRpcEnabled = false;
bool accurateShaderMul = false;
RendererType rendererType = RendererType::OpenGL;
Audio::DSPCore::Type dspType = Audio::DSPCore::Type::Null;

View file

@ -62,6 +62,7 @@ void EmulatorConfig::load() {
shaderJitEnabled = toml::find_or<toml::boolean>(gpu, "EnableShaderJIT", shaderJitDefault);
vsyncEnabled = toml::find_or<toml::boolean>(gpu, "EnableVSync", true);
accurateShaderMul = toml::find_or<toml::boolean>(gpu, "AccurateShaderMultiplication", false);
}
}
@ -125,6 +126,7 @@ void EmulatorConfig::save() {
data["GPU"]["EnableShaderJIT"] = shaderJitEnabled;
data["GPU"]["Renderer"] = std::string(Renderer::typeToString(rendererType));
data["GPU"]["EnableVSync"] = vsyncEnabled;
data["GPU"]["AccurateShaderMultiplication"] = accurateShaderMul;
data["Audio"]["DSPEmulation"] = std::string(Audio::DSPCore::typeToString(dspType));
data["Audio"]["EnableAudio"] = audioEnabled;

View file

@ -16,7 +16,7 @@ void ShaderJIT::prepare(PICAShader& shaderUnit) {
auto it = cache.find(hash);
if (it == cache.end()) { // Block has not been compiled yet
auto emitter = std::make_unique<ShaderEmitter>();
auto emitter = std::make_unique<ShaderEmitter>(accurateMul);
emitter->compile(shaderUnit);
// Get pointer to callbacks
entrypointCallback = emitter->getInstructionCallback(shaderUnit.entrypoint);

View file

@ -7,9 +7,6 @@ using namespace Helpers;
using namespace oaknut;
using namespace oaknut::util;
// TODO: Expose safe/unsafe optimizations to the user
constexpr bool useSafeMUL = true;
// Similar to the x64 recompiler, we use an odd internal ABI, which abuses the fact that we'll very rarely be calling C++ functions
// So to avoid pushing and popping, we'll be making use of volatile registers as much as possible
static constexpr QReg src1Vec = Q1;
@ -491,7 +488,7 @@ void ShaderEmitter::recDP3(const PICAShader& shader, u32 instruction) {
// Now do a full DP4
// Do a piecewise multiplication of the vectors first
if constexpr (useSafeMUL) {
if (useSafeMUL) {
emitSafeMUL(src1Vec, src2Vec, scratch1Vec);
} else {
FMUL(src1Vec.S4(), src1Vec.S4(), src2Vec.S4());
@ -518,7 +515,7 @@ void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor);
// Do a piecewise multiplication of the vectors first
if constexpr (useSafeMUL) {
if (useSafeMUL) {
emitSafeMUL(src1Vec, src2Vec, scratch1Vec);
} else {
FMUL(src1Vec.S4(), src1Vec.S4(), src2Vec.S4());
@ -551,7 +548,7 @@ void ShaderEmitter::recDPH(const PICAShader& shader, u32 instruction) {
// Now perform a DP4
// Do a piecewise multiplication of the vectors first
if constexpr (useSafeMUL) {
if (useSafeMUL) {
emitSafeMUL(src1Vec, src2Vec, scratch1Vec);
} else {
FMUL(src1Vec.S4(), src1Vec.S4(), src2Vec.S4());
@ -834,7 +831,7 @@ void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) {
loadRegister<1>(src1Vec, shader, src1, idx, operandDescriptor);
loadRegister<2>(src2Vec, shader, src2, 0, operandDescriptor);
if constexpr (useSafeMUL) {
if (useSafeMUL) {
emitSafeMUL(src1Vec, src2Vec, scratch1Vec);
} else {
FMUL(src1Vec.S4(), src1Vec.S4(), src2Vec.S4());
@ -907,7 +904,7 @@ void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) {
loadRegister<2>(src2Vec, shader, src2, isMADI ? 0 : idx, operandDescriptor);
loadRegister<3>(src3Vec, shader, src3, isMADI ? idx : 0, operandDescriptor);
if constexpr (useSafeMUL) {
if (useSafeMUL) {
emitSafeMUL(src1Vec, src2Vec, scratch1Vec);
FADD(src3Vec.S4(), src3Vec.S4(), src1Vec.S4());
} else {

View file

@ -12,9 +12,6 @@ using namespace Xbyak;
using namespace Xbyak::util;
using namespace Helpers;
// TODO: Expose safe/unsafe optimizations to the user
constexpr bool useSafeMUL = false;
// The shader recompiler uses quite an odd internal ABI
// We make use of the fact that in regular conditions, we should pretty much never be calling C++ code from recompiled shader code
// This allows us to establish an ABI that's optimized for this sort of workflow, statically allocating volatile host registers

View file

@ -64,6 +64,8 @@ void GPU::reset() {
regs.fill(0);
shaderUnit.reset();
shaderJIT.reset();
shaderJIT.setAccurateMul(config.accurateShaderMul);
std::memset(vram, 0, vramSize);
lightingLUT.fill(0);
lightingLUTDirty = true;

View file

@ -146,6 +146,7 @@ static bool FetchVariableBool(std::string key, bool def) {
static void configInit() {
static const retro_variable values[] = {
{"panda3ds_use_shader_jit", "Enable shader JIT; enabled|disabled"},
{"panda3ds_accurate_shader_mul", "Enable accurate shader multiplication; disabled|enabled"},
{"panda3ds_use_vsync", "Enable VSync; enabled|disabled"},
{"panda3ds_dsp_emulation", "DSP emulation; Null|HLE|LLE"},
{"panda3ds_use_audio", "Enable audio; disabled|enabled"},
@ -153,7 +154,7 @@ static void configInit() {
{"panda3ds_write_protect_virtual_sd", "Write protect virtual SD card; disabled|enabled"},
{"panda3ds_battery_level", "Battery percentage; 5|10|20|30|50|70|90|100"},
{"panda3ds_use_charger", "Charger plugged; enabled|disabled"},
{nullptr, nullptr}
{nullptr, nullptr},
};
envCallbacks(RETRO_ENVIRONMENT_SET_VARIABLES, (void*)values);
@ -171,6 +172,7 @@ static void configUpdate() {
config.audioEnabled = FetchVariableBool("panda3ds_use_audio", false);
config.sdCardInserted = FetchVariableBool("panda3ds_use_virtual_sd", true);
config.sdWriteProtected = FetchVariableBool("panda3ds_write_protect_virtual_sd", false);
config.accurateShaderMul = FetchVariableBool("panda3ds_accurate_shader_mul", false);
config.discordRpcEnabled = false;
config.save();