diff --git a/include/PICA/shader_unit.hpp b/include/PICA/shader_unit.hpp index d8d93160..80e01346 100644 --- a/include/PICA/shader_unit.hpp +++ b/include/PICA/shader_unit.hpp @@ -2,10 +2,9 @@ #include "PICA/shader.hpp" class ShaderUnit { - -public: - PICAShader vs; // Vertex shader - PICAShader gs; // Geometry shader + public: + PICAShader vs; // Vertex shader + PICAShader gs; // Geometry shader ShaderUnit() : vs(ShaderType::Vertex), gs(ShaderType::Geometry) {} void reset(); diff --git a/include/renderer.hpp b/include/renderer.hpp index 569a730b..1d1fb682 100644 --- a/include/renderer.hpp +++ b/include/renderer.hpp @@ -21,9 +21,11 @@ enum class RendererType : s8 { }; struct EmulatorConfig; -class GPU; struct SDL_Window; +class GPU; +class ShaderUnit; + class Renderer { protected: GPU& gpu; @@ -77,7 +79,10 @@ class Renderer { virtual std::string getUbershader() { return ""; } virtual void setUbershader(const std::string& shader) {} - virtual void setUbershaderSetting(bool value) {} + // This function is called on every draw call before parsing vertex data. + // It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between + // ubershaders and shadergen, and so on. 
+ virtual void prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {} // Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window #ifdef PANDA3DS_FRONTEND_QT diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index f5a964a3..6c18a0c6 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -30,7 +30,6 @@ class RendererGL final : public Renderer { OpenGL::VertexArray vao; OpenGL::VertexBuffer vbo; - bool enableUbershader = true; // Data struct { @@ -110,8 +109,7 @@ class RendererGL final : public Renderer { virtual bool supportsShaderReload() override { return true; } virtual std::string getUbershader() override; virtual void setUbershader(const std::string& shader) override; - - virtual void setUbershaderSetting(bool value) override { enableUbershader = value; } + virtual void prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) override; std::optional getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true); diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index fe336edc..b6d903e4 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -117,13 +117,13 @@ void GPU::reset() { externalRegs[Framebuffer1Config] = static_cast(PICA::ColorFmt::RGB8); externalRegs[Framebuffer1Select] = 0; - renderer->setUbershaderSetting(config.useUbershaders); renderer->reset(); } // Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter) // And whether we are going to use the shader JIT (second template parameter) void GPU::drawArrays(bool indexed) { + renderer->prepareForDraw(shaderUnit, false); const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled; if (indexed) { diff --git a/src/core/PICA/regs.cpp b/src/core/PICA/regs.cpp index f805de60..c9412fc8 100644 --- a/src/core/PICA/regs.cpp +++ 
b/src/core/PICA/regs.cpp @@ -249,6 +249,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { // If we've reached 3 verts, issue a draw call // Handle rendering depending on the primitive type if (immediateModeVertIndex == 3) { + renderer->prepareForDraw(shaderUnit, true); renderer->drawVertices(PICA::PrimType::TriangleList, immediateModeVertices); switch (primType) { diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 482aa36c..ce7d9a33 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -72,19 +72,17 @@ const Function* ShaderDecompiler::findFunction(const AddressRange& range) { void ShaderDecompiler::writeAttributes() { decompiledShader += R"( - layout(location = 0) in vec4 inputs[8]; + layout(location = 0) in vec4 inputs[8]; + layout(std140) uniform PICAShaderUniforms { + vec4 uniform_float[96]; + uvec4 uniform_int; + uint uniform_bool; + }; - layout(std140) uniform PICAShaderUniforms { - vec4 uniform_float[96]; - uvec4 uniform_int; - uint uniform_bool; - }; - - vec4 temp_registers[16]; - vec4 dummy_vec = vec4(0.0); + vec4 temp_registers[16]; + vec4 output_registers[8]; + vec4 dummy_vec = vec4(0.0); )"; - - decompiledShader += "\n"; } std::string ShaderDecompiler::decompile() { diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 8b614d2d..90eccf47 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -4,11 +4,12 @@ #include -#include "config.hpp" #include "PICA/float_types.hpp" -#include "PICA/pica_frag_uniforms.hpp" #include "PICA/gpu.hpp" +#include "PICA/pica_frag_uniforms.hpp" #include "PICA/regs.hpp" +#include "PICA/shader_decompiler.hpp" +#include "config.hpp" #include "math_util.hpp" CMRC_DECLARE(RendererGL); @@ -409,25 +410,6 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v OpenGL::Triangle, }; - bool usingUbershader = enableUbershader; - if 
(usingUbershader) { - const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0; - const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1; - - // Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using moret han N lights via shadergen - // This way we generate fewer shaders overall than with full shadergen, but don't tank performance - if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) { - usingUbershader = false; - } - } - - if (usingUbershader) { - gl.useProgram(triangleProgram); - } else { - OpenGL::Program& program = getSpecializedShader(); - gl.useProgram(program); - } - const auto primitiveTopology = primTypes[static_cast(primType)]; gl.disableScissor(); gl.bindVBO(vbo); @@ -449,38 +431,9 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v const int depthFunc = getBits<4, 3>(depthControl); const int colourMask = getBits<8, 4>(depthControl); gl.setColourMask(colourMask & 1, colourMask & 2, colourMask & 4, colourMask & 8); - static constexpr std::array depthModes = {GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL}; - // Update ubershader uniforms - if (usingUbershader) { - const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32(); - const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32(); - const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1; - - if (oldDepthScale != depthScale) { - oldDepthScale = depthScale; - glUniform1f(ubershaderData.depthScaleLoc, depthScale); - } - - if (oldDepthOffset != depthOffset) { - oldDepthOffset = depthOffset; - glUniform1f(ubershaderData.depthOffsetLoc, depthOffset); - } - - if (oldDepthmapEnable != depthMapEnable) { - oldDepthmapEnable = depthMapEnable; - glUniform1i(ubershaderData.depthmapEnableLoc, depthMapEnable); - } - - // 
Upload PICA Registers as a single uniform. The shader needs access to the rasterizer registers (for depth, starting from index 0x48) - // The texturing and the fragment lighting registers. Therefore we upload them all in one go to avoid multiple slow uniform updates - glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]); - setupUbershaderTexEnv(); - } - bindTexturesToSlots(); - if (gpu.fogLUTDirty) { updateFogLUT(); } @@ -951,6 +904,62 @@ OpenGL::Program& RendererGL::getSpecializedShader() { return program; } +void RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { + std::string vertShaderSource = PICA::ShaderGen::decompileShader( + shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL + ); + + OpenGL::Shader vert({vertShaderSource.c_str(), vertShaderSource.size()}, OpenGL::Vertex); + //triangleProgram.create({vert, frag}); + std::cout << vertShaderSource << "\n"; + + bool usingUbershader = emulatorConfig->useUbershaders; + if (usingUbershader) { + const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0; + const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1; + + // Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using more than N lights via shadergen + // This way we generate fewer shaders overall than with full shadergen, but don't tank performance + if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) { + usingUbershader = false; + } + } + + if (usingUbershader) { + gl.useProgram(triangleProgram); + } else { + OpenGL::Program& program = getSpecializedShader(); + gl.useProgram(program); + } + + // Update ubershader uniforms + if (usingUbershader) { + const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32(); + const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] 
& 0xffffff).toFloat32(); + const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1; + + if (oldDepthScale != depthScale) { + oldDepthScale = depthScale; + glUniform1f(ubershaderData.depthScaleLoc, depthScale); + } + + if (oldDepthOffset != depthOffset) { + oldDepthOffset = depthOffset; + glUniform1f(ubershaderData.depthOffsetLoc, depthOffset); + } + + if (oldDepthmapEnable != depthMapEnable) { + oldDepthmapEnable = depthMapEnable; + glUniform1i(ubershaderData.depthmapEnableLoc, depthMapEnable); + } + + // Upload PICA Registers as a single uniform. The shader needs access to the rasterizer registers (for depth, starting from index 0x48) + // The texturing and the fragment lighting registers. Therefore we upload them all in one go to avoid multiple slow uniform updates + glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]); + setupUbershaderTexEnv(); + } +} + void RendererGL::screenshot(const std::string& name) { constexpr uint width = 400; constexpr uint height = 2 * 240;