Actually implement the damn thing

2025-07-12 18:28:30 +12:00 · 2024-08-08 16:39:59 +03:00 · 2024-08-08 16:39:59 +03:00 · 67069a8826
commit 67069a8826
parent c396b3f225
7 changed files with 274 additions and 21 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -79,6 +79,7 @@ include_directories(third_party/stb)
 include_directories(third_party/opengl)
 include_directories(third_party/miniaudio)
 include_directories(third_party/mio/single_include)
+include_directories(third_party/lockfree)

 add_compile_definitions(NOMINMAX)             # Make windows.h not define min/max macros because third-party deps don't like it
 add_compile_definitions(WIN32_LEAN_AND_MEAN)  # Make windows.h not include literally everything
@ -325,14 +326,14 @@ if(ENABLE_OPENGL)
    set(RENDERER_GL_INCLUDE_FILES third_party/opengl/opengl.hpp
        include/renderer_gl/renderer_gl.hpp include/renderer_gl/textures.hpp
        include/renderer_gl/surfaces.hpp include/renderer_gl/surface_cache.hpp
-        include/renderer_gl/gl_state.hpp
+        include/renderer_gl/gl_state.hpp include/renderer_gl/async_compiler.hpp
    )

    set(RENDERER_GL_SOURCE_FILES src/core/renderer_gl/renderer_gl.cpp
        src/core/renderer_gl/textures.cpp src/core/renderer_gl/etc1.cpp
-        src/core/renderer_gl/gl_state.cpp src/host_shaders/opengl_display.frag
-        src/host_shaders/opengl_display.vert src/host_shaders/opengl_vertex_shader.vert
-        src/host_shaders/opengl_fragment_shader.frag
+        src/core/renderer_gl/gl_state.cpp src/core/renderer_gl/async_compiler.cpp
+        src/host_shaders/opengl_display.frag src/host_shaders/opengl_display.vert
+        src/host_shaders/opengl_vertex_shader.vert src/host_shaders/opengl_fragment_shader.frag
    )

    set(HEADER_FILES ${HEADER_FILES} ${RENDERER_GL_INCLUDE_FILES})
--- a/include/PICA/pica_frag_config.hpp
+++ b/include/PICA/pica_frag_config.hpp
@ -206,6 +206,24 @@ namespace PICA {
 			return std::memcmp(this, &config, sizeof(FragmentConfig)) == 0;
 		}

+		FragmentConfig& operator=(const FragmentConfig& config) {  // Copy assignment: copies every field of config into this object
+			// BitField copy constructor is deleted, so we have to copy the underlying raw values manually
+			outConfig.raw = config.outConfig.raw;
+			texConfig = config.texConfig;
+			fogConfig.raw = config.fogConfig.raw;
+			lighting.raw = config.lighting.raw;
+			for (int i = 0; i < 7; i++) {  // 7 LUT configs, matching the 7 * sizeof(LightingLUTConfig) term asserted below
+				lighting.luts[i].raw = config.lighting.luts[i].raw;
+			}
+			for (int i = 0; i < 8; i++) {  // 8 lights, matching the 8 * sizeof(Light) term asserted below
+				lighting.lights[i].raw = config.lighting.lights[i].raw;
+			}
+
+			// If this fails you probably added a new field to the struct and forgot to update this copy assignment operator
+			static_assert(sizeof(FragmentConfig) == sizeof(outConfig.raw) + sizeof(texConfig) + sizeof(fogConfig.raw) + sizeof(lighting.raw) + 7 * sizeof(LightingLUTConfig) + 8 * sizeof(Light));
+			return *this;
+		}
+
 		FragmentConfig(const std::array<u32, 0x300>& regs) : lighting(regs) {
 			auto alphaTestConfig = regs[InternalRegs::AlphaTestConfig];
 			auto alphaTestFunction = Helpers::getBits<4, 3>(alphaTestConfig);
--- a/include/renderer_gl/async_compiler.hpp
+++ b/include/renderer_gl/async_compiler.hpp
@ -0,0 +1,54 @@
+#pragma once
+
+#include <atomic>
+#include <thread>
+
+#include "opengl.hpp"
+#include "renderer_gl/renderer_gl.hpp"
+#include "PICA/pica_frag_config.hpp"
+#include "lockfree/spsc/queue.hpp"
+
+namespace PICA::ShaderGen
+{
+    class FragmentGenerator;
+}
+
+namespace AsyncCompiler  // GL context hooks used by AsyncCompilerThread; the SDL frontend defines them in frontend_sdl.cpp
+{
+    void* createContext(void* userdata);              // Called by the AsyncCompilerThread constructor, before the worker thread starts; returns an opaque context handle
+    void makeCurrent(void* userdata, void* context);  // Called first thing on the worker thread with the handle returned by createContext
+    void destroyContext(void* context);               // Called on the worker thread once its compile loop exits
+}
+
+struct CompilingProgram  // One queued unit of work for the compiler thread
+{
+    CachedProgram* program;        // Cache entry whose OpenGL program the worker creates and whose `compiling` flag it clears
+    PICA::FragmentConfig* config;  // Fragment config the shader is generated from; allocated and freed by AsyncCompilerThread
+};
+
+struct AsyncCompilerThread
+{
+    explicit AsyncCompilerThread(PICA::ShaderGen::FragmentGenerator& fragShaderGen, void* userdata);  // Creates the worker's GL context and starts the worker thread
+    ~AsyncCompilerThread();  // Stops and joins the worker thread, then frees the preallocated work items
+
+    // Called from the emulator thread to queue a fragment configuration for compilation
+    // Does not return a status: if the queue is full it logs a warning and spins until the push succeeds
+    void PushFragmentConfig(const PICA::FragmentConfig& config, CachedProgram* cachedProgram);
+
+    // Wait for all queued fragment configurations to be compiled
+    void Finish();
+
+private:
+    PICA::ShaderGen::FragmentGenerator& fragShaderGen;
+    OpenGL::Shader defaultShadergenVs;  // Vertex shader created on the worker thread and linked into every program it builds
+
+    // Our lockfree queue only allows for trivial types, so we preallocate enough structs
+    // to avoid dynamic allocation on each push
+    int preallocatedProgramsIndex;  // Next slot of preallocatedPrograms to reuse; wraps around at preallocatedProgramsSize
+    static constexpr int preallocatedProgramsSize = 256;
+    std::array<CompilingProgram*, preallocatedProgramsSize> preallocatedPrograms;
+    lockfree::spsc::Queue<CompilingProgram*, preallocatedProgramsSize - 1> programQueue;  // Filled by PushFragmentConfig, drained by the worker thread
+    std::atomic_bool running;  // Set in the constructor, cleared by the destructor to make the worker loop exit
+    std::atomic_flag hasWork = ATOMIC_FLAG_INIT;  // Set by Finish(), cleared by the worker after each pass over the queue
+    std::thread thread;
+};
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@ -23,6 +23,15 @@
 // More circular dependencies!
 class GPU;

+// Cached recompiled fragment shader, stored as the value type of RendererGL::shaderCache
+struct CachedProgram {
+	OpenGL::Program program;
+	std::atomic_bool compiling = false;  // Set by the renderer before queueing the entry for async compilation, cleared by the compiler thread when the program is built
+	bool needsInitialization = true;     // Cleared by getSpecializedShader once the sampler uniforms and UBO binding have been set up
+};
+
+struct AsyncCompilerThread;
+
 class RendererGL final : public Renderer {
 	GLStateManager gl = {};

@ -72,12 +81,10 @@ class RendererGL final : public Renderer {
 	OpenGL::Shader defaultShadergenVs;
 	GLuint shadergenFragmentUBO;

-	// Cached recompiled fragment shader
-	struct CachedProgram {
-		OpenGL::Program program;
-	};
 	std::unordered_map<PICA::FragmentConfig, CachedProgram> shaderCache;

+	AsyncCompilerThread* asyncCompiler = nullptr;
+
 	OpenGL::Framebuffer getColourFBO();
 	OpenGL::Texture getTexture(Texture& tex);
 	OpenGL::Program& getSpecializedShader();
@ -101,7 +108,6 @@ class RendererGL final : public Renderer {

 	void reset() override;
 	void display() override;                                                              // Display the 3DS screen contents to the window
-	void initGraphicsContext(SDL_Window* window) override;                                // Initialize graphics context
 	void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) override;  // Clear a GPU buffer in VRAM
 	void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) override;  // Perform display transfer
 	void textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) override;
@ -123,7 +129,9 @@ class RendererGL final : public Renderer {
 	void initUbershader(OpenGL::Program& program);

 #ifdef PANDA3DS_FRONTEND_QT
-	virtual void initGraphicsContext([[maybe_unused]] GL::Context* context) override { initGraphicsContextInternal(); }
+	virtual void initGraphicsContext(GL::Context* context) override;
+#elif defined(PANDA3DS_FRONTEND_SDL)
+	virtual void initGraphicsContext(SDL_Window* window) override;
 #endif

 	// Take a screenshot of the screen and store it in a file
--- a/src/core/renderer_gl/async_compiler.cpp
+++ b/src/core/renderer_gl/async_compiler.cpp
@ -0,0 +1,83 @@
+#include "renderer_gl/async_compiler.hpp"
+
+// Preallocates the work items, creates the worker's GL context on the calling thread and starts the
+// compiler thread.
+//   fragShaderGen: generator used for the default vertex shader and for every fragment shader
+//   userdata:      opaque frontend handle, forwarded unchanged to AsyncCompiler::createContext/makeCurrent
+AsyncCompilerThread::AsyncCompilerThread(PICA::ShaderGen::FragmentGenerator& fragShaderGen, void* userdata)
+    : fragShaderGen(fragShaderGen)
+{
+    preallocatedProgramsIndex = 0;
+    running.store(true);
+
+    // The queue only carries pointers, so every work item (and its config) is allocated once up front
+    for (int i = 0; i < preallocatedProgramsSize; i++)
+    {
+        preallocatedPrograms[i] = new CompilingProgram();
+        preallocatedPrograms[i]->config = new PICA::FragmentConfig({});
+    }
+
+    // The context needs to be created on the main thread so that we can make it shared with that
+    // thread's context
+    void* context = AsyncCompiler::createContext(userdata);
+    thread = std::thread([this, userdata, context]()
+    {
+        AsyncCompiler::makeCurrent(userdata, context);
+        // glGetString returns a GLubyte pointer; printf's %s expects a char pointer
+        printf("Async compiler started, GL version: %s\n", reinterpret_cast<const char*>(glGetString(GL_VERSION)));
+
+        std::string defaultShadergenVSSource = this->fragShaderGen.getDefaultVertexShader();
+        defaultShadergenVs.create({defaultShadergenVSSource.c_str(), defaultShadergenVSSource.size()}, OpenGL::Vertex);
+
+        while (running.load())
+        {
+            CompilingProgram* item;
+            while (programQueue.Pop(item)) {
+                OpenGL::Program& glProgram = item->program->program;
+                std::string fs = this->fragShaderGen.generate(*item->config);
+                OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment);
+                glProgram.create({defaultShadergenVs, fragShader});
+                fragShader.free();
+
+                // Changes to a shared object are only guaranteed to be visible in another context once the
+                // commands that made them have completed in this one, so finish before publishing the program
+                glFinish();
+                item->program->compiling.store(false);
+            }
+
+            // Tell Finish() that a full pass over the queue has completed
+            hasWork.clear();
+            std::this_thread::yield();
+        }
+
+        // Release the worker's vertex shader while its context is still current
+        defaultShadergenVs.free();
+        AsyncCompiler::destroyContext(context);
+    });
+}
+
+// Stops the worker thread, waits for it to exit, then releases every preallocated work item
+AsyncCompilerThread::~AsyncCompilerThread()
+{
+    // The worker checks this flag at the top of each loop iteration
+    running.store(false);
+    thread.join();
+
+    for (CompilingProgram* slot : preallocatedPrograms)
+    {
+        delete slot->config;
+        delete slot;
+    }
+}
+
+// Queues `config` for compilation into `cachedProgram`.
+// Copies the config into the next preallocated slot and pushes that slot to the worker's queue.
+// If the queue is full, logs a warning once and retries until the push succeeds.
+void AsyncCompilerThread::PushFragmentConfig(const PICA::FragmentConfig& config, CachedProgram* cachedProgram)
+{
+    // Fill in the next preallocated slot, then advance the ring index
+    CompilingProgram* slot = preallocatedPrograms[preallocatedProgramsIndex];
+    slot->program = cachedProgram;
+    *slot->config = config;
+    preallocatedProgramsIndex = (preallocatedProgramsIndex + 1) % preallocatedProgramsSize;
+
+    // Common case: there is room in the queue
+    if (programQueue.Push(slot)) {
+        return;
+    }
+
+    Helpers::warn("AsyncCompilerThread: Queue full, spinning");
+    while (!programQueue.Push(slot)) {
+    }
+}
+
+// Blocks the caller until the worker has compiled everything queued before this call.
+//
+// The worker clears hasWork at the end of every pass over the queue, whether or not a push raced
+// with that pass's final emptiness check. A single observed clear can therefore belong to a pass
+// that never saw the most recent push. Waiting for two clears means one whole pass over the queue
+// ran between them, and that pass began after this call did.
+void AsyncCompilerThread::Finish()
+{
+    for (int handshake = 0; handshake < 2; handshake++) {
+        hasWork.test_and_set();
+
+        // test_and_set only returns false once the worker has cleared the flag again
+        while (hasWork.test_and_set()) {
+            std::this_thread::yield();
+        }
+    }
+}
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@ -9,6 +9,7 @@
 #include "PICA/pica_frag_uniforms.hpp"
 #include "PICA/gpu.hpp"
 #include "PICA/regs.hpp"
+#include "renderer_gl/async_compiler.hpp"
 #include "math_util.hpp"

 CMRC_DECLARE(RendererGL);
@ -172,9 +173,18 @@ void RendererGL::initGraphicsContextInternal() {
 	defaultShadergenVs.create({defaultShadergenVSSource.c_str(), defaultShadergenVSSource.size()}, OpenGL::Vertex);
 }

-// The OpenGL renderer doesn't need to do anything with the GL context (For Qt frontend) or the SDL window (For SDL frontend)
-// So we just call initGraphicsContextInternal for both
-void RendererGL::initGraphicsContext([[maybe_unused]] SDL_Window* window) { initGraphicsContextInternal(); }
+// Frontend-specific entry point: receives the Qt GL context or the SDL window, depending on the build
+#if defined(PANDA3DS_FRONTEND_QT)
+	void RendererGL::initGraphicsContext(GL::Context* context)
+#elif defined(PANDA3DS_FRONTEND_SDL)
+	void RendererGL::initGraphicsContext(SDL_Window* context)
+#endif
+{
+	// Only hybrid mode compiles specialized shaders on a background thread
+	const bool wantsAsyncCompiler = (shaderMode == ShaderMode::Hybrid);
+	if (wantsAsyncCompiler) {
+		asyncCompiler = new AsyncCompilerThread(fragShaderGen, context);
+	}
+
+	initGraphicsContextInternal();
+}

 // Set up the OpenGL blending context to match the emulated PICA
 void RendererGL::setupBlending() {
@ -414,15 +424,46 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 		OpenGL::Triangle,
 	};

-	bool usingUbershader = shaderMode == ShaderMode::Ubershader;
-	if (usingUbershader) {
-		const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0;
-		const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1;
+	bool usingUbershader;
+	switch (shaderMode) {
+		case ShaderMode::Ubershader: {
+			const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0;
+			const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1;

-		// Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using moret han N lights via shadergen
-		// This way we generate fewer shaders overall than with full shadergen, but don't tank performance 
-		if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) {
+			// Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using more than N lights via shadergen
+			// This way we generate fewer shaders overall than with full shadergen, but don't tank performance 
+			if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) {
+				usingUbershader = false;
+			} else {
+				usingUbershader = true;
+			}
+			break;
+		}
+
+		case ShaderMode::Specialized: {
 			usingUbershader = false;
+			break;
+		}
+
+		case ShaderMode::Hybrid: {
+			PICA::FragmentConfig fsConfig(regs); // TODO: introduce code duplication to make sure this constructor/lookup isn't done too many times
+			auto cachedProgram = shaderCache.find(fsConfig);
+			if (cachedProgram == shaderCache.end()) {
+				CachedProgram& program = shaderCache[fsConfig];
+				program.compiling.store(true);
+				asyncCompiler->PushFragmentConfig(fsConfig, &program);
+				usingUbershader = true;
+			} else if (cachedProgram->second.compiling.load(std::memory_order_relaxed)) {
+				usingUbershader = true;
+			} else {
+				usingUbershader = false;
+			}
+			break;
+		}
+
+		default: {
+			Helpers::panic("Invalid shader mode");
+			break;
 		}
 	}
 		
@ -844,14 +885,20 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
 	OpenGL::Program& program = programEntry.program;

 	if (!program.exists()) {
+		if (shaderMode == ShaderMode::Hybrid) {
+			Helpers::panic("Compiling shaders in main thread, this should never happen");
+		}
+
 		std::string fs = fragShaderGen.generate(fsConfig);

 		OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment);
 		program.create({defaultShadergenVs, fragShader});
-		gl.useProgram(program);

 		fragShader.free();
+	}

+	if (programEntry.needsInitialization) {
+		gl.useProgram(program);
 		// Init sampler objects. Texture 0 goes in texture unit 0, texture 1 in TU 1, texture 2 in TU 2, and the light maps go in TU 3
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex0"), 0);
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1);
@ -862,6 +909,7 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
 		// As it's an OpenGL 4.2 feature that MacOS doesn't support...
 		uint uboIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms");
 		glUniformBlockBinding(program.handle(), uboIndex, uboBlockBinding);
+		programEntry.needsInitialization = false;
 	}
 	glBindBufferBase(GL_UNIFORM_BUFFER, uboBlockBinding, shadergenFragmentUBO);

@ -979,6 +1027,11 @@ void RendererGL::screenshot(const std::string& name) {
 }

 void RendererGL::clearShaderCache() {
+	if (asyncCompiler && shaderMode == ShaderMode::Hybrid) {
+		// May contain objects that are still in use, so we need to clear them first
+		asyncCompiler->Finish();
+	}
+
 	for (auto& shader : shaderCache) {
 		CachedProgram& cachedProgram = shader.second;
 		cachedProgram.program.free();
--- a/src/panda_sdl/frontend_sdl.cpp
+++ b/src/panda_sdl/frontend_sdl.cpp
@ -35,6 +35,7 @@ FrontendSDL::FrontendSDL() : keyboardMappings(InputMappings::defaultKeyboardMapp
 		SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
 		SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, config.rendererType == RendererType::Software ? 3 : 4);
 		SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, config.rendererType == RendererType::Software ? 3 : 1);
+		SDL_GL_SetAttribute(SDL_GL_SHARE_WITH_CURRENT_CONTEXT, 1);
 		window = SDL_CreateWindow("Alber", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, 400, 480, SDL_WINDOW_OPENGL | SDL_WINDOW_RESIZABLE);

 		if (window == nullptr) {
@ -342,3 +343,38 @@ void FrontendSDL::run() {
 		SDL_GL_SwapWindow(window);
 	}
 }
+
+// SDL (SDL2 API) implementation of the GL context hooks used by the async shader compiler
+namespace AsyncCompiler {
+	// Creates a second GL context for `window` (an SDL_Window*) and restores whichever context was
+	// current before the call. Returns the new SDL_GLContext as an opaque pointer.
+	// Panics if the context cannot be created, is not shared, or the previous context cannot be restored.
+	void* createContext(void* window) {
+		SDL_Window* sdlWindow = static_cast<SDL_Window*>(window);
+
+		// SDL_GL_CreateContext also makes it the current context so we need to switch back after creation
+		SDL_GLContext currentContext = SDL_GL_GetCurrentContext();
+
+		SDL_GLContext glContext = SDL_GL_CreateContext(sdlWindow);
+
+		if (glContext == nullptr) {
+			Helpers::panic("OpenGL context creation failed: %s", SDL_GetError());
+		}
+
+		// As per the wiki you should check the value after creating the context
+		// as it can differ from the requested value
+		// Initialized so that a failed query is treated as "not shared" instead of reading garbage
+		int sharingEnabled = 0;
+		if (SDL_GL_GetAttribute(SDL_GL_SHARE_WITH_CURRENT_CONTEXT, &sharingEnabled) != 0 || !sharingEnabled) {
+			Helpers::panic("OpenGL context sharing not enabled");
+		}
+
+		// If this fails, the calling thread would keep rendering into the compiler's context
+		if (SDL_GL_MakeCurrent(sdlWindow, currentContext) != 0) {
+			Helpers::panic("Failed to restore the previous OpenGL context: %s", SDL_GetError());
+		}
+
+		return glContext;
+	}
+
+	// Makes `context` (from createContext) current on the calling thread for `window` (an SDL_Window*).
+	// Panics on failure, since the caller would otherwise issue GL commands without a context.
+	void makeCurrent(void* window, void* context) {
+		if (SDL_GL_MakeCurrent(static_cast<SDL_Window*>(window), static_cast<SDL_GLContext>(context)) != 0) {
+			Helpers::panic("Failed to make the async compiler OpenGL context current: %s", SDL_GetError());
+		}
+	}
+
+	// Destroys a context previously returned by createContext
+	void destroyContext(void* context) {
+		SDL_GL_DeleteContext(static_cast<SDL_GLContext>(context));
+	}
+}