diff --git a/CMakeLists.txt b/CMakeLists.txt index b55e2390..ef275edc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,6 +79,7 @@ include_directories(third_party/stb) include_directories(third_party/opengl) include_directories(third_party/miniaudio) include_directories(third_party/mio/single_include) +include_directories(third_party/lockfree) add_compile_definitions(NOMINMAX) # Make windows.h not define min/max macros because third-party deps don't like it add_compile_definitions(WIN32_LEAN_AND_MEAN) # Make windows.h not include literally everything @@ -325,14 +326,14 @@ if(ENABLE_OPENGL) set(RENDERER_GL_INCLUDE_FILES third_party/opengl/opengl.hpp include/renderer_gl/renderer_gl.hpp include/renderer_gl/textures.hpp include/renderer_gl/surfaces.hpp include/renderer_gl/surface_cache.hpp - include/renderer_gl/gl_state.hpp + include/renderer_gl/gl_state.hpp include/renderer_gl/async_compiler.hpp ) set(RENDERER_GL_SOURCE_FILES src/core/renderer_gl/renderer_gl.cpp src/core/renderer_gl/textures.cpp src/core/renderer_gl/etc1.cpp - src/core/renderer_gl/gl_state.cpp src/host_shaders/opengl_display.frag - src/host_shaders/opengl_display.vert src/host_shaders/opengl_vertex_shader.vert - src/host_shaders/opengl_fragment_shader.frag + src/core/renderer_gl/gl_state.cpp src/core/renderer_gl/async_compiler.cpp + src/host_shaders/opengl_display.frag src/host_shaders/opengl_display.vert + src/host_shaders/opengl_vertex_shader.vert src/host_shaders/opengl_fragment_shader.frag ) set(HEADER_FILES ${HEADER_FILES} ${RENDERER_GL_INCLUDE_FILES}) diff --git a/include/PICA/pica_frag_config.hpp b/include/PICA/pica_frag_config.hpp index 5d5f8420..a253f4d6 100644 --- a/include/PICA/pica_frag_config.hpp +++ b/include/PICA/pica_frag_config.hpp @@ -206,6 +206,24 @@ namespace PICA { return std::memcmp(this, &config, sizeof(FragmentConfig)) == 0; } + FragmentConfig& operator=(const FragmentConfig& config) { + // BitField's copy constructor is deleted, so the copy is done by hand, bitfield members through their raw values + 
outConfig.raw = config.outConfig.raw; + texConfig = config.texConfig; + fogConfig.raw = config.fogConfig.raw; + lighting.raw = config.lighting.raw; + for (int i = 0; i < 7; i++) { + lighting.luts[i].raw = config.lighting.luts[i].raw; + } + for (int i = 0; i < 8; i++) { + lighting.lights[i].raw = config.lighting.lights[i].raw; + } + + // If this fails you probably added a new field to the struct and forgot to update this copy assignment operator + static_assert(sizeof(FragmentConfig) == sizeof(outConfig.raw) + sizeof(texConfig) + sizeof(fogConfig.raw) + sizeof(lighting.raw) + 7 * sizeof(LightingLUTConfig) + 8 * sizeof(Light)); + return *this; + } + FragmentConfig(const std::array& regs) : lighting(regs) { auto alphaTestConfig = regs[InternalRegs::AlphaTestConfig]; auto alphaTestFunction = Helpers::getBits<4, 3>(alphaTestConfig); diff --git a/include/renderer_gl/async_compiler.hpp b/include/renderer_gl/async_compiler.hpp new file mode 100644 index 00000000..b2c11c40 --- /dev/null +++ b/include/renderer_gl/async_compiler.hpp @@ -0,0 +1,54 @@ +#pragma once + +#include +#include + +#include "opengl.hpp" +#include "renderer_gl/renderer_gl.hpp" +#include "PICA/pica_frag_config.hpp" +#include "lockfree/spsc/queue.hpp" + +namespace PICA::ShaderGen +{ + class FragmentGenerator; +} + +namespace AsyncCompiler +{ + void* createContext(void* userdata); + void makeCurrent(void* userdata, void* context); + void destroyContext(void* context); +} + +struct CompilingProgram +{ + CachedProgram* program; // Cache entry that receives the compiled program + PICA::FragmentConfig* config; // Configuration the fragment shader is generated from +}; + +struct AsyncCompilerThread +{ + explicit AsyncCompilerThread(PICA::ShaderGen::FragmentGenerator& fragShaderGen, void* userdata); + ~AsyncCompilerThread(); + + // Called from the emulator thread to queue a fragment configuration for compilation + // Does not report failure: if the queue is full, this spins until the entry has been pushed + void PushFragmentConfig(const PICA::FragmentConfig& config, CachedProgram* cachedProgram); + + // Wait for all queued fragment configurations to be compiled + 
void Finish(); + +private: + PICA::ShaderGen::FragmentGenerator& fragShaderGen; + OpenGL::Shader defaultShadergenVs; + + // Our lockfree queue only allows for trivial types, so we preallocate enough structs + // to avoid dynamic allocation on each push + int preallocatedProgramsIndex; // Next slot handed out by PushFragmentConfig, wraps around at preallocatedProgramsSize + static constexpr int preallocatedProgramsSize = 256; + std::array preallocatedPrograms; + lockfree::spsc::Queue programQueue; + std::atomic_bool running; // Set to false by the destructor to stop the compiler thread's loop + std::atomic_flag hasWork = ATOMIC_FLAG_INIT; // Set by Finish(), cleared by the compiler thread after each pass over the queue + std::thread thread; +}; \ No newline at end of file diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index fa972ecb..53a34935 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -23,6 +23,15 @@ // More circular dependencies! class GPU; +// Cached recompiled fragment shader +struct CachedProgram { + OpenGL::Program program; + std::atomic_bool compiling = false; // True while the entry is queued for, or being built by, the async compiler thread + bool needsInitialization = true; // Cleared by getSpecializedShader once the sampler and UBO bindings are set up +}; + +struct AsyncCompilerThread; + class RendererGL final : public Renderer { GLStateManager gl = {}; @@ -72,12 +81,10 @@ class RendererGL final : public Renderer { OpenGL::Shader defaultShadergenVs; GLuint shadergenFragmentUBO; - // Cached recompiled fragment shader - struct CachedProgram { - OpenGL::Program program; - }; std::unordered_map shaderCache; + AsyncCompilerThread* asyncCompiler = nullptr; + OpenGL::Framebuffer getColourFBO(); OpenGL::Texture getTexture(Texture& tex); OpenGL::Program& getSpecializedShader(); @@ -101,7 +108,6 @@ class RendererGL final : public Renderer { void reset() override; void display() override; // Display the 3DS screen contents to the window - void initGraphicsContext(SDL_Window* window) override; // Initialize graphics context void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) override; // Clear a GPU buffer in VRAM void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) override; // Perform display transfer void 
textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) override; @@ -123,7 +129,9 @@ class RendererGL final : public Renderer { void initUbershader(OpenGL::Program& program); #ifdef PANDA3DS_FRONTEND_QT - virtual void initGraphicsContext([[maybe_unused]] GL::Context* context) override { initGraphicsContextInternal(); } + virtual void initGraphicsContext(GL::Context* context) override; +#elif defined(PANDA3DS_FRONTEND_SDL) + virtual void initGraphicsContext(SDL_Window* window) override; #endif // Take a screenshot of the screen and store it in a file diff --git a/src/core/renderer_gl/async_compiler.cpp b/src/core/renderer_gl/async_compiler.cpp new file mode 100644 index 00000000..23c33d68 --- /dev/null +++ b/src/core/renderer_gl/async_compiler.cpp @@ -0,0 +1,96 @@ +#include "renderer_gl/async_compiler.hpp" + +AsyncCompilerThread::AsyncCompilerThread(PICA::ShaderGen::FragmentGenerator& fragShaderGen, void* userdata) + : fragShaderGen(fragShaderGen) +{ + preallocatedProgramsIndex = 0; + running.store(true); + + for (int i = 0; i < preallocatedProgramsSize; i++) + { + preallocatedPrograms[i] = new CompilingProgram(); + preallocatedPrograms[i]->config = new PICA::FragmentConfig({}); + } + + // The context needs to be created on the main thread so that we can make it shared with that + // thread's context + void* context = AsyncCompiler::createContext(userdata); + thread = std::thread([this, userdata, context]() + { + AsyncCompiler::makeCurrent(userdata, context); + printf("Async compiler started, GL version: %s\n", glGetString(GL_VERSION)); + + std::string defaultShadergenVSSource = this->fragShaderGen.getDefaultVertexShader(); + defaultShadergenVs.create({defaultShadergenVSSource.c_str(), defaultShadergenVSSource.size()}, OpenGL::Vertex); + + while (running.load()) + { + CompilingProgram* item; + while (programQueue.Pop(item)) { + OpenGL::Program& glProgram = item->program->program; + std::string fs = 
this->fragShaderGen.generate(*item->config); + OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment); + glProgram.create({defaultShadergenVs, fragShader}); + fragShader.free(); + // The program was built on a context shared with the renderer thread. Wait for the GL commands to complete + // here before publishing it, otherwise the other context is not guaranteed to see the finished program + glFinish(); + item->program->compiling.store(false); + } + + hasWork.clear(); + // Full fence so the next pass over the queue cannot read queue state older than this clear, Finish() relies on it + std::atomic_thread_fence(std::memory_order_seq_cst); + std::this_thread::yield(); + } + + AsyncCompiler::destroyContext(context); + }); +} + +AsyncCompilerThread::~AsyncCompilerThread() +{ + running.store(false); + thread.join(); + + for (int i = 0; i < preallocatedProgramsSize; i++) + { + delete preallocatedPrograms[i]->config; + delete preallocatedPrograms[i]; + } +} + +void AsyncCompilerThread::PushFragmentConfig(const PICA::FragmentConfig& config, CachedProgram* cachedProgram) +{ + CompilingProgram* newProgram = preallocatedPrograms[preallocatedProgramsIndex]; + newProgram->program = cachedProgram; + *newProgram->config = config; + preallocatedProgramsIndex = (preallocatedProgramsIndex + 1) % preallocatedProgramsSize; + bool pushed = programQueue.Push(newProgram); + + if (!pushed) { + Helpers::warn("AsyncCompilerThread: Queue full, spinning"); + } else { + return; + } + + while (!pushed) { + pushed = programQueue.Push(newProgram); + } +} + +void AsyncCompilerThread::Finish() +{ + // A single wait is not enough: the compiler thread may clear hasWork after a pass over the queue that ended + // before our last push. The second clear we observe always follows a pass that started after every push + // made by this thread, so two rounds guarantee the queue has really been drained + std::atomic_thread_fence(std::memory_order_seq_cst); + for (int round = 0; round < 2; round++) { + hasWork.test_and_set(); + + // Wait for the compiler thread to finish any outstanding work + while (hasWork.test_and_set()) { + std::this_thread::yield(); + } + } +} \ No newline at end of file diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 075645a6..1d2accf7 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -9,6 +9,7 @@ #include "PICA/pica_frag_uniforms.hpp" #include "PICA/gpu.hpp" #include "PICA/regs.hpp" +#include "renderer_gl/async_compiler.hpp" #include "math_util.hpp" CMRC_DECLARE(RendererGL); @@ -172,9 +173,18 @@ void RendererGL::initGraphicsContextInternal() { defaultShadergenVs.create({defaultShadergenVSSource.c_str(), defaultShadergenVSSource.size()}, OpenGL::Vertex); } -// The OpenGL renderer doesn't need to do 
anything with the GL context (For Qt frontend) or the SDL window (For SDL frontend) -// So we just call initGraphicsContextInternal for both -void RendererGL::initGraphicsContext([[maybe_unused]] SDL_Window* window) { initGraphicsContextInternal(); } +#ifdef PANDA3DS_FRONTEND_QT + void RendererGL::initGraphicsContext(GL::Context* context) +#elif defined(PANDA3DS_FRONTEND_SDL) + void RendererGL::initGraphicsContext(SDL_Window* context) +#endif +{ + if (shaderMode == ShaderMode::Hybrid) { + asyncCompiler = new AsyncCompilerThread(fragShaderGen, context); // Creates the shared GL context on this thread and starts the compiler thread. NOTE(review): no matching delete is visible in this diff + } + + initGraphicsContextInternal(); +} // Set up the OpenGL blending context to match the emulated PICA void RendererGL::setupBlending() { @@ -414,15 +424,46 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v OpenGL::Triangle, }; - bool usingUbershader = shaderMode == ShaderMode::Ubershader; - if (usingUbershader) { - const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0; - const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1; + bool usingUbershader; + switch (shaderMode) { + case ShaderMode::Ubershader: { + const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0; + const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1; - // Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using moret han N lights via shadergen - // This way we generate fewer shaders overall than with full shadergen, but don't tank performance - if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) { + // Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using more than N lights via shadergen + // This way we generate fewer shaders overall than with full shadergen, but don't tank performance + if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) { + 
usingUbershader = false; + } else { + usingUbershader = true; + } + break; + } + + case ShaderMode::Specialized: { usingUbershader = false; + break; + } + + case ShaderMode::Hybrid: { + PICA::FragmentConfig fsConfig(regs); // TODO: introduce code duplication to make sure this constructor/lookup isn't done too many times + auto cachedProgram = shaderCache.find(fsConfig); + if (cachedProgram == shaderCache.end()) { + CachedProgram& program = shaderCache[fsConfig]; + program.compiling.store(true); + asyncCompiler->PushFragmentConfig(fsConfig, &program); + usingUbershader = true; + } else if (cachedProgram->second.compiling.load(std::memory_order_acquire)) { // Acquire pairs with the compiler thread's store(false), so the finished program is visible before this thread uses it + usingUbershader = true; + } else { + usingUbershader = false; + } + break; + } + + default: { + Helpers::panic("Invalid shader mode"); + break; } } @@ -844,14 +885,20 @@ OpenGL::Program& RendererGL::getSpecializedShader() { OpenGL::Program& program = programEntry.program; if (!program.exists()) { + if (shaderMode == ShaderMode::Hybrid) { + Helpers::panic("Compiling shaders in main thread, this should never happen"); + } + std::string fs = fragShaderGen.generate(fsConfig); OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment); program.create({defaultShadergenVs, fragShader}); - gl.useProgram(program); fragShader.free(); + } + if (programEntry.needsInitialization) { + gl.useProgram(program); // Init sampler objects. Texture 0 goes in texture unit 0, texture 1 in TU 1, texture 2 in TU 2, and the light maps go in TU 3 glUniform1i(OpenGL::uniformLocation(program, "u_tex0"), 0); glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1); @@ -862,6 +909,7 @@ OpenGL::Program& RendererGL::getSpecializedShader() { // As it's an OpenGL 4.2 feature that MacOS doesn't support... 
uint uboIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms"); glUniformBlockBinding(program.handle(), uboIndex, uboBlockBinding); + programEntry.needsInitialization = false; } glBindBufferBase(GL_UNIFORM_BUFFER, uboBlockBinding, shadergenFragmentUBO); @@ -979,6 +1027,11 @@ void RendererGL::screenshot(const std::string& name) { } void RendererGL::clearShaderCache() { + if (asyncCompiler && shaderMode == ShaderMode::Hybrid) { + // The compiler thread may still be working on entries of shaderCache, so wait for it to finish before freeing them + asyncCompiler->Finish(); + } + for (auto& shader : shaderCache) { CachedProgram& cachedProgram = shader.second; cachedProgram.program.free(); diff --git a/src/panda_sdl/frontend_sdl.cpp b/src/panda_sdl/frontend_sdl.cpp index 77b1f55f..59d06541 100644 --- a/src/panda_sdl/frontend_sdl.cpp +++ b/src/panda_sdl/frontend_sdl.cpp @@ -35,6 +35,7 @@ FrontendSDL::FrontendSDL() : keyboardMappings(InputMappings::defaultKeyboardMapp SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE); SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, config.rendererType == RendererType::Software ? 3 : 4); SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, config.rendererType == RendererType::Software ? 
3 : 1); + SDL_GL_SetAttribute(SDL_GL_SHARE_WITH_CURRENT_CONTEXT, 1); window = SDL_CreateWindow("Alber", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, 400, 480, SDL_WINDOW_OPENGL | SDL_WINDOW_RESIZABLE); if (window == nullptr) { @@ -342,3 +343,42 @@ void FrontendSDL::run() { SDL_GL_SwapWindow(window); } } + +namespace AsyncCompiler { + void* createContext(void* window) { + SDL_Window* sdlWindow = static_cast<SDL_Window*>(window); + + // SDL_GL_CreateContext also makes it the current context so we need to switch back after creation + SDL_GLContext currentContext = SDL_GL_GetCurrentContext(); + + SDL_GLContext glContext = SDL_GL_CreateContext(sdlWindow); + + if (glContext == nullptr) { + Helpers::panic("OpenGL context creation failed: %s", SDL_GetError()); + } + + // As per the wiki you should check the value after creating the context + // as it can differ from the requested value + int sharingEnabled = 0; + if (SDL_GL_GetAttribute(SDL_GL_SHARE_WITH_CURRENT_CONTEXT, &sharingEnabled) != 0 || !sharingEnabled) { + Helpers::panic("OpenGL context sharing not enabled"); + } + + if (SDL_GL_MakeCurrent(sdlWindow, currentContext) != 0) { + Helpers::panic("Failed to restore the previous OpenGL context: %s", SDL_GetError()); + } + + return glContext; + } + + // Binds the given context to the calling thread, panics if SDL reports a failure + void makeCurrent(void* window, void* context) { + if (SDL_GL_MakeCurrent(static_cast<SDL_Window*>(window), static_cast<SDL_GLContext>(context)) != 0) { + Helpers::panic("Failed to make the async compiler context current: %s", SDL_GetError()); + } + } + + void destroyContext(void* context) { + SDL_GL_DeleteContext(static_cast<SDL_GLContext>(context)); + } +} \ No newline at end of file