diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c2ec9f1..a43b7f63 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -256,6 +256,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp include/audio/miniaudio_device.hpp include/ring_buffer.hpp include/bitfield.hpp include/audio/dsp_shared_mem.hpp include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp + include/PICA/pica_vert_config.hpp ) cmrc_add_resource_library( diff --git a/include/PICA/gpu.hpp b/include/PICA/gpu.hpp index ac2a49e6..1e1d3c4b 100644 --- a/include/PICA/gpu.hpp +++ b/include/PICA/gpu.hpp @@ -13,6 +13,12 @@ #include "memory.hpp" #include "renderer.hpp" +enum class ShaderExecMode { + Interpreter, // Interpret shaders on the CPU + JIT, // Recompile shaders to CPU machine code + Hardware, // Recompile shaders to host shaders and run them on the GPU +}; + class GPU { static constexpr u32 regNum = 0x300; static constexpr u32 extRegNum = 0x1000; @@ -45,7 +51,7 @@ class GPU { uint immediateModeVertIndex; uint immediateModeAttrIndex; // Index of the immediate mode attribute we're uploading - template + template void drawArrays(); // Silly method of avoiding linking problems. 
TODO: Change to something less silly diff --git a/include/PICA/pica_vert_config.hpp b/include/PICA/pica_vert_config.hpp new file mode 100644 index 00000000..ae774405 --- /dev/null +++ b/include/PICA/pica_vert_config.hpp @@ -0,0 +1,31 @@ +#pragma once +#include +#include +#include +#include + +#include "PICA/pica_hash.hpp" +#include "PICA/regs.hpp" +#include "bitfield.hpp" +#include "helpers.hpp" + +namespace PICA { + // Configuration struct used + struct VertConfig { + PICAHash::HashType shaderHash; + PICAHash::HashType opdescHash; + u32 entrypoint; + bool usingUbershader; + + bool operator==(const VertConfig& config) const { + // Hash function and equality operator required by std::unordered_map + return std::memcmp(this, &config, sizeof(VertConfig)) == 0; + } + }; +} // namespace PICA + +// Override std::hash for our vertex config class +template <> +struct std::hash { + std::size_t operator()(const PICA::VertConfig& config) const noexcept { return PICAHash::computeHash((const char*)&config, sizeof(config)); } +}; \ No newline at end of file diff --git a/include/PICA/shader.hpp b/include/PICA/shader.hpp index 68b16de8..c725c180 100644 --- a/include/PICA/shader.hpp +++ b/include/PICA/shader.hpp @@ -107,6 +107,11 @@ class PICAShader { alignas(16) std::array inputs; // Attributes passed to the shader alignas(16) std::array outputs; alignas(16) vec4f dummy = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()}); // Dummy register used by the JIT + + // We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT + // We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal + // Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first + using Hash = PICAHash::HashType; protected: std::array operandDescriptors; @@ -125,11 +130,6 @@ class PICAShader { 
std::array callInfo; ShaderType type; - // We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT - // We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal - // Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first - using Hash = PICAHash::HashType; - Hash lastCodeHash = 0; // Last hash computed for the shader code (Used for the JIT caching mechanism) Hash lastOpdescHash = 0; // Last hash computed for the operand descriptors (Also used for the JIT) diff --git a/include/PICA/shader_gen.hpp b/include/PICA/shader_gen.hpp index 215e5adb..2d39e078 100644 --- a/include/PICA/shader_gen.hpp +++ b/include/PICA/shader_gen.hpp @@ -30,6 +30,8 @@ namespace PICA::ShaderGen { FragmentGenerator(API api, Language language) : api(api), language(language) {} std::string generate(const PICA::FragmentConfig& config); std::string getDefaultVertexShader(); + // For when PICA shader acceleration is enabled. Turn the PICA shader source into a proper vertex shader + std::string getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader); void setTarget(API api, Language language) { this->api = api; diff --git a/include/renderer.hpp b/include/renderer.hpp index 1d1fb682..721364c1 100644 --- a/include/renderer.hpp +++ b/include/renderer.hpp @@ -82,7 +82,8 @@ class Renderer { // This function is called on every draw call before parsing vertex data. // It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between // ubershaders and shadergen, and so on. 
- virtual void prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {} + // Returns whether this draw is eligible for using hardware-accelerated shaders or if shaders should run on the CPU + virtual bool prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { return false; } // Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window #ifdef PANDA3DS_FRONTEND_QT diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index 6c18a0c6..0597235b 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -3,11 +3,14 @@ #include #include #include +#include #include #include +#include #include "PICA/float_types.hpp" #include "PICA/pica_frag_config.hpp" +#include "PICA/pica_vert_config.hpp" #include "PICA/pica_hash.hpp" #include "PICA/pica_vertex.hpp" #include "PICA/regs.hpp" @@ -52,6 +55,11 @@ class RendererGL final : public Renderer { float oldDepthScale = -1.0; float oldDepthOffset = 0.0; bool oldDepthmapEnable = false; + // Set by prepareForDraw, tells us whether the current draw is using a hw-accelerated shader + bool usingAcceleratedShader = false; + + // Cached pointer to the current vertex shader when using HW accelerated shaders + OpenGL::Shader* generatedVertexShader = nullptr; SurfaceCache depthBufferCache; SurfaceCache colourBufferCache; @@ -74,7 +82,38 @@ class RendererGL final : public Renderer { OpenGL::Program program; uint uboBinding; }; - std::unordered_map shaderCache; + + struct ShaderCache { + std::unordered_map> vertexShaderCache; + std::unordered_map fragmentShaderCache; + + // Program cache indexed by GLuints for the vertex and fragment shader to use + // Top 32 bits are the vertex shader GLuint, bottom 32 bits are the fs GLuint + std::unordered_map programCache; + + void clear() { + for (auto& it : programCache) { + CachedProgram& cachedProgram = it.second; + cachedProgram.program.free(); + glDeleteBuffers(1, 
&cachedProgram.uboBinding); + } + + for (auto& it : vertexShaderCache) { + if (it.second.has_value()) { + it.second->free(); + } + } + + for (auto& it : fragmentShaderCache) { + it.second.free(); + } + + programCache.clear(); + vertexShaderCache.clear(); + fragmentShaderCache.clear(); + } + }; + ShaderCache shaderCache; OpenGL::Framebuffer getColourFBO(); OpenGL::Texture getTexture(Texture& tex); @@ -109,14 +148,13 @@ class RendererGL final : public Renderer { virtual bool supportsShaderReload() override { return true; } virtual std::string getUbershader() override; virtual void setUbershader(const std::string& shader) override; - virtual void prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) override; + virtual bool prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) override; std::optional getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true); // Note: The caller is responsible for deleting the currently bound FBO before calling this void setFBO(uint handle) { screenFramebuffer.m_handle = handle; } void resetStateManager() { gl.reset(); } - void clearShaderCache(); void initUbershader(OpenGL::Program& program); #ifdef PANDA3DS_FRONTEND_QT diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index b6d903e4..077c65aa 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -123,27 +123,38 @@ void GPU::reset() { // Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter) // And whether we are going to use the shader JIT (second template parameter) void GPU::drawArrays(bool indexed) { - renderer->prepareForDraw(shaderUnit, false); - const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled; + const bool hwShaders = renderer->prepareForDraw(shaderUnit, false); - if (indexed) { - if (shaderJITEnabled) - drawArrays(); - else - drawArrays(); + if (hwShaders) { + if (indexed) { + drawArrays(); + } else { + 
drawArrays(); + } } else { - if (shaderJITEnabled) - drawArrays(); - else - drawArrays(); + const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled; + + if (indexed) { + if (shaderJITEnabled) { + drawArrays(); + } else { + drawArrays(); + } + } else { + if (shaderJITEnabled) { + drawArrays(); + } else { + drawArrays(); + } + } } } static std::array vertices; -template +template void GPU::drawArrays() { - if constexpr (useShaderJIT) { + if constexpr (mode == ShaderExecMode::JIT) { shaderJIT.prepare(shaderUnit.vs); } @@ -322,29 +333,38 @@ void GPU::drawArrays() { } } - // Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers - // Based on the SH_ATTRIBUTES_PERMUTATION registers. - // Ie it might attribute #0 to v2, #1 to v7, etc - for (int j = 0; j < totalAttribCount; j++) { - const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; - std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f)); - } + // Running shader on the CPU instead of the GPU + if constexpr (mode == ShaderExecMode::Interpreter || mode == ShaderExecMode::JIT) { + // Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers + // Based on the SH_ATTRIBUTES_PERMUTATION registers. 
+ // Ie it might map attribute #0 to v2, #1 to v7, etc + for (int j = 0; j < totalAttribCount; j++) { + const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; + std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f)); + } - if constexpr (useShaderJIT) { - shaderJIT.run(shaderUnit.vs); - } else { - shaderUnit.vs.run(); - } + if constexpr (mode == ShaderExecMode::JIT) { + shaderJIT.run(shaderUnit.vs); + } else { + shaderUnit.vs.run(); + } - PICA::Vertex& out = vertices[i]; - // Map shader outputs to fixed function properties - const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7; - for (int i = 0; i < totalShaderOutputs; i++) { - const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i]; + PICA::Vertex& out = vertices[i]; + // Map shader outputs to fixed function properties + const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7; + for (int i = 0; i < totalShaderOutputs; i++) { + const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i]; - for (int j = 0; j < 4; j++) { // pls unroll - const u32 mapping = (config >> (j * 8)) & 0x1F; - out.raw[mapping] = vsOutputRegisters[i][j]; + for (int j = 0; j < 4; j++) { // pls unroll + const u32 mapping = (config >> (j * 8)) & 0x1F; + out.raw[mapping] = vsOutputRegisters[i][j]; + } + } + } else { // Using hw shaders and running the shader on the GPU, just write the inputs to the attribute buffer directly + PICA::Vertex& out = vertices[i]; + for (int j = 0; j < totalAttribCount; j++) { + const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; + std::memcpy(&out.raw[mapping], &currentAttributes[j], sizeof(vec4f)); } } } diff --git a/src/core/PICA/shader_gen_glsl.cpp b/src/core/PICA/shader_gen_glsl.cpp index 9802be90..d4deee35 100644 --- a/src/core/PICA/shader_gen_glsl.cpp +++ b/src/core/PICA/shader_gen_glsl.cpp @@ -72,11 +72,6 @@ std::string FragmentGenerator::getDefaultVertexShader() { out float gl_ClipDistance[2]; #endif - vec4 abgr8888ToVec4(uint abgr) { - 
const float scale = 1.0 / 255.0; - return scale * vec4(float(abgr & 0xffu), float((abgr >> 8) & 0xffu), float((abgr >> 16) & 0xffu), float(abgr >> 24)); - } - void main() { gl_Position = a_coords; vec4 colourAbs = abs(a_vertexColour); @@ -677,4 +672,58 @@ void FragmentGenerator::compileFog(std::string& shader, const PICA::FragmentConf shader += "vec2 value = texelFetch(u_tex_luts, ivec2(int(clamped_index), 24), 0).rg;"; // fog LUT is past the light LUTs shader += "float fog_factor = clamp(value.r + value.g * delta, 0.0, 1.0);"; shader += "combinerOutput.rgb = mix(fog_color, combinerOutput.rgb, fog_factor);"; +} + +std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader) { + if (usingUbershader) { + Helpers::panic("Unimplemented: GetVertexShaderAccelerated for ubershader"); + return picaSource; + } else { + // TODO: Uniforms and don't hardcode fixed-function semantic indices... + std::string ret = picaSource; + if (api == API::GLES) { + ret += "\n#define USING_GLES\n"; + } + + ret += R"( +out vec4 v_quaternion; +out vec4 v_colour; +out vec3 v_texcoord0; +out vec2 v_texcoord1; +out vec3 v_view; +out vec2 v_texcoord2; + +#ifndef USING_GLES + out float gl_ClipDistance[2]; +#endif + +void main() { + pica_shader_main(); + vec4 a_coords = output_registers[0]; + vec4 a_vertexColour = output_registers[1]; + vec2 a_texcoord0 = output_registers[2].xy; + float a_texcoord0_w = output_registers[2].w; + vec2 a_texcoord1 = output_registers[3].xy; + vec2 a_texcoord2 = output_registers[4].xy; + vec3 a_view = output_registers[5].xyz; + vec4 a_quaternion = output_registers[6]; + + gl_Position = a_coords; + vec4 colourAbs = abs(a_vertexColour); + v_colour = min(colourAbs, vec4(1.f)); + + v_texcoord0 = vec3(a_texcoord0.x, 1.0 - a_texcoord0.y, a_texcoord0_w); + v_texcoord1 = vec2(a_texcoord1.x, 1.0 - a_texcoord1.y); + v_texcoord2 = vec2(a_texcoord2.x, 1.0 - a_texcoord2.y); + v_view = a_view; + v_quaternion = a_quaternion; + +#ifndef 
USING_GLES + //gl_ClipDistance[0] = -a_coords.z; + //gl_ClipDistance[1] = dot(clipCoords, a_coords); +#endif +})"; + + return ret; + } } \ No newline at end of file diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 90eccf47..c593ad96 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -25,7 +25,7 @@ void RendererGL::reset() { colourBufferCache.reset(); textureCache.reset(); - clearShaderCache(); + shaderCache.clear(); // Init the colour/depth buffer settings to some random defaults on reset colourBufferLoc = 0; @@ -788,18 +788,24 @@ OpenGL::Program& RendererGL::getSpecializedShader() { PICA::FragmentConfig fsConfig(regs); - CachedProgram& programEntry = shaderCache[fsConfig]; + OpenGL::Shader& fragShader = shaderCache.fragmentShaderCache[fsConfig]; + if (!fragShader.exists()) { + std::string fs = fragShaderGen.generate(fsConfig); + fragShader.create({fs.c_str(), fs.size()}, OpenGL::Fragment); + } + + // Get the handle of the current vertex shader + OpenGL::Shader& vertexShader = usingAcceleratedShader ? *generatedVertexShader : defaultShadergenVs; + // And form the key for looking up a shader program + const u64 programKey = (u64(vertexShader.handle()) << 32) | u64(fragShader.handle()); + + CachedProgram& programEntry = shaderCache.programCache[programKey]; OpenGL::Program& program = programEntry.program; if (!program.exists()) { - std::string fs = fragShaderGen.generate(fsConfig); - - OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment); - program.create({defaultShadergenVs, fragShader}); + program.create({vertexShader, fragShader}); gl.useProgram(program); - fragShader.free(); - // Init sampler objects. 
Texture 0 goes in texture unit 0, texture 1 in TU 1, texture 2 in TU 2, and the light maps go in TU 3 glUniform1i(OpenGL::uniformLocation(program, "u_tex0"), 0); glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1); @@ -904,15 +910,8 @@ OpenGL::Program& RendererGL::getSpecializedShader() { return program; } -void RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { - std::string vertShaderSource = PICA::ShaderGen::decompileShader( - shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL - ); - - OpenGL::Shader vert({vertShaderSource.c_str(), vertShaderSource.size()}, OpenGL::Vertex); - //triangleProgram.create({vert, frag}); - std::cout << vertShaderSource << "\n"; - +bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { + // First we figure out if we will be using an ubershader bool usingUbershader = emulatorConfig->useUbershaders; if (usingUbershader) { const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0; @@ -925,6 +924,46 @@ void RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { } } + // Then we figure out if we will use hw accelerated shaders, and try to fetch our shader + // TODO: Ubershader support for accelerated shaders + usingAcceleratedShader = emulatorConfig->accelerateShaders && !isImmediateMode && !usingUbershader; + + if (usingAcceleratedShader) { + auto shaderCodeHash = shaderUnit.vs.getCodeHash(); + auto opdescHash = shaderUnit.vs.getOpdescHash(); + auto vertexConfig = PICA::VertConfig{ + .shaderHash = shaderCodeHash, + .opdescHash = opdescHash, + .entrypoint = shaderUnit.vs.entrypoint, + .usingUbershader = usingUbershader, + }; + + std::optional& shader = shaderCache.vertexShaderCache[vertexConfig]; + // If the optional is false, we have never tried to recompile the shader before. Try to recompile it and see if it works. 
+ if (!shader.has_value()) { + // Initialize shader to a "null" shader (handle == 0) + *shader = OpenGL::Shader(); + + std::string picaShaderSource = PICA::ShaderGen::decompileShader( + shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL + ); + + // Empty source means compilation error, if the source is not empty then we convert the recompiled PICA code into a valid shader and upload + // it to the GPU + if (!picaShaderSource.empty()) { + std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, usingUbershader); + shader->create({vertexShaderSource}, OpenGL::Vertex); + } + } + + // Shader generation did not work out, so set usingAcceleratedShader to false + if (!shader->exists()) { + usingAcceleratedShader = false; + } else { + generatedVertexShader = &(*shader); + } + } + if (usingUbershader) { gl.useProgram(triangleProgram); } else { @@ -958,6 +997,8 @@ void RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]); setupUbershaderTexEnv(); } + + return usingAcceleratedShader; } void RendererGL::screenshot(const std::string& name) { @@ -985,22 +1026,12 @@ void RendererGL::screenshot(const std::string& name) { stbi_write_png(name.c_str(), width, height, 4, flippedPixels.data(), 0); } -void RendererGL::clearShaderCache() { - for (auto& shader : shaderCache) { - CachedProgram& cachedProgram = shader.second; - cachedProgram.program.free(); - glDeleteBuffers(1, &cachedProgram.uboBinding); - } - - shaderCache.clear(); -} - void RendererGL::deinitGraphicsContext() { // Invalidate all surface caches since they'll no longer be valid textureCache.reset(); depthBufferCache.reset(); colourBufferCache.reset(); - clearShaderCache(); + shaderCache.clear(); // All other GL objects should be invalidated automatically and be recreated by the next call to initGraphicsContext // TODO: Make it so that depth 
and colour buffers get written back to 3DS memory @@ -1048,4 +1079,4 @@ void RendererGL::initUbershader(OpenGL::Program& program) { glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1); glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2); glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3); -} +} \ No newline at end of file