A kissable commit

Minor fixes
This commit is contained in:
offtkp 2024-07-19 18:49:21 +03:00
parent 1b779cafa1
commit 78ac8d2c0d
10 changed files with 649 additions and 27 deletions

View file

@ -74,6 +74,7 @@ include_directories(third_party/stb)
include_directories(third_party/opengl)
include_directories(third_party/miniaudio)
include_directories(third_party/mio/single_include)
include_directories(third_party/lockfree)
add_compile_definitions(NOMINMAX) # Make windows.h not define min/max macros because third-party deps don't like it
add_compile_definitions(WIN32_LEAN_AND_MEAN) # Make windows.h not include literally everything
@ -320,14 +321,14 @@ if(ENABLE_OPENGL)
set(RENDERER_GL_INCLUDE_FILES third_party/opengl/opengl.hpp
include/renderer_gl/renderer_gl.hpp include/renderer_gl/textures.hpp
include/renderer_gl/surfaces.hpp include/renderer_gl/surface_cache.hpp
include/renderer_gl/gl_state.hpp
include/renderer_gl/gl_state.hpp include/renderer_gl/async_compiler.hpp
)
set(RENDERER_GL_SOURCE_FILES src/core/renderer_gl/renderer_gl.cpp
src/core/renderer_gl/textures.cpp src/core/renderer_gl/etc1.cpp
src/core/renderer_gl/gl_state.cpp src/host_shaders/opengl_display.frag
src/host_shaders/opengl_display.vert src/host_shaders/opengl_vertex_shader.vert
src/host_shaders/opengl_fragment_shader.frag
src/core/renderer_gl/gl_state.cpp src/core/renderer_gl/async_compiler.cpp
src/host_shaders/opengl_display.frag src/host_shaders/opengl_display.vert
src/host_shaders/opengl_vertex_shader.vert src/host_shaders/opengl_fragment_shader.frag
)
set(HEADER_FILES ${HEADER_FILES} ${RENDERER_GL_INCLUDE_FILES})

View file

@ -0,0 +1,32 @@
Async shader compilation should hide the problem of compilation stutter by using the ubershader when
specialized shaders are being compiled on a separate thread. To activate this mode, set shaderMode to hybrid
in config.toml
The way it works is the following:
A shaderCompilationThread is started, which holds its own separate OpenGL context.
The communication happens with two lock-free single consumer single producer queues, to avoid messing with mutexes and potential
deadlocks.
The configQueue, a queue that contains PICA::FragmentConfig objects. These objects dictate how a specialized shader should
be compiled.
The compiledQueue, a queue that contains CompiledShader objects which contain the compiled program.
When drawVertices happens, if shaderMode is set to Hybrid, this is what happens:
Emulator thread checks if the shader already exists in shaderCache. Only the emulator thread accesses the shaderCache, so no locks are needed.
If the shader exists and is ready (CachedProgram::ready == true) it uses it like normal.
If the shader exists but isn't ready, the compilation was queued and the compiler thread hasn't finished it yet, so the emulator thread sets useUbershader to true.
If the shader doesn't exist, it pushes to configQueue the current PICA::FragmentConfig (by copying it over) and sets useUbershader to true.
Then, at some point, the compilation thread will detect that there are configurations in configQueue that it needs to compile.
The compilation thread will compile the shader and then get its binary with glGetProgramBinary. Then it will push this binary to compiledQueue.
On the other end, the emulator on every drawVertices will check if there's stuff in compiledQueue. If there is, it will use
glProgramBinary to convert the raw binary data into the program in shaderCache and set the CachedProgram::ready to true
Then, if the same PICA::FragmentConfig is needed again, it will be available in shaderCache.

View file

@ -0,0 +1,75 @@
// For asynchronous shader compilation (hybrid mode)
// See docs/3ds/async_shader_compilation.md for more info
#pragma once
#include <atomic>
#include <vector>
#include <thread>
#include "helpers.hpp"
#include "opengl.hpp"
#include "PICA/pica_frag_config.hpp"
#include "spsc/queue.hpp"
// We make the assumption that not more than 256 shaders will be queued at once.
// This may seem like a hack, but it's a reasonable assumption and it allows us to not have
// to implement a waiting mechanism (wait until the queue is not full)
// In the unlikely event that either queue is full, a panic will be triggered.
constexpr size_t maxInFlightShaderCount = 256;
// A compiled program, ready to be passed to glProgramBinary
struct CompiledProgram
{
	// `explicit` because implicitly converting a FragmentConfig into a
	// CompiledProgram would never be intended
	explicit CompiledProgram(const PICA::FragmentConfig& fsConfig)
		: fsConfig(fsConfig)
	{}

	// The fragment configuration this program was compiled for; used by the
	// emulator thread as the shader-cache key when the binary comes back
	PICA::FragmentConfig fsConfig;

	// Raw program binary as filled in by glGetProgramBinary on the compiler thread
	std::vector<u8> binary;

	// Binary format token for glProgramBinary. Zero-initialized so a partially
	// constructed object never hands an indeterminate format to the driver.
	u32 binaryFormat = 0;
};
namespace PICA
{
namespace ShaderGen
{
class FragmentGenerator;
}
};
// Manages a worker thread that compiles specialized fragment shaders in the
// background while the emulator keeps rendering with the ubershader (hybrid mode).
// All communication with the emulator thread goes through the two SPSC queues
// below, so no mutexes are involved.
struct AsyncCompilerState
{
	// The constructor will start the thread that will compile the shaders and create an OpenGL context
	explicit AsyncCompilerState(PICA::ShaderGen::FragmentGenerator& fragShaderGen);

	// The destructor will first set the stop flag and join the thread (which will wait until it exits)
	// and then clean up the queues
	~AsyncCompilerState();

	// Called from the emulator thread to queue a fragment configuration for compilation
	// Returns false if the queue is full, true otherwise
	bool PushFragmentConfig(const PICA::FragmentConfig& config);

	// Called from the emulator thread to get a compiled program
	// Returns true if a compiled program is available, false otherwise
	bool PopCompiledProgram(CompiledProgram*& program);

	// Manually starts the worker thread (also called by the constructor)
	void Start();

	// Manually stops the worker thread and drains both queues
	void Stop();

private:
	// Creates the worker thread's own OpenGL context (runs on the worker thread)
	void createGLContext();
	// Tears down the worker thread's OpenGL context
	void destroyGLContext();

	PICA::ShaderGen::FragmentGenerator& fragShaderGen;
	// Default shadergen vertex shader, linked into every compiled fragment program
	OpenGL::Shader defaultShadergenVs;

	// Pointers are used in these queues because the lock-free queues require trivial types
	lockfree::spsc::Queue<PICA::FragmentConfig*, maxInFlightShaderCount> configQueue;
	lockfree::spsc::Queue<CompiledProgram*, maxInFlightShaderCount> compiledQueue;

	// Set by Start()/Stop(); the worker loop polls it to know when to exit
	std::atomic_bool running = false;
	std::thread shaderCompilationThread;
};

View file

@ -6,6 +6,7 @@
#include <span>
#include <unordered_map>
#include "async_compiler.hpp"
#include "PICA/float_types.hpp"
#include "PICA/pica_frag_config.hpp"
#include "PICA/pica_hash.hpp"
@ -22,6 +23,13 @@
// More circular dependencies!
class GPU;
// Cached recompiled fragment shader
struct CachedProgram {
	OpenGL::Program program;
	// Binding point of this program's FragmentUniforms UBO
	uint uboBinding;
	// False while the async compiler thread is still building this program
	// (hybrid mode); drawVertices falls back to the ubershader until it's true
	bool ready = false;
};
class RendererGL final : public Renderer {
GLStateManager gl = {};
@ -76,10 +84,14 @@ class RendererGL final : public Renderer {
OpenGL::Program program;
};
std::unordered_map<PICA::FragmentConfig, CachedProgram> shaderCache;
std::unique_ptr<AsyncCompilerState> asyncCompiler;
void startCompilationThread();
void stopCompilationThread();
OpenGL::Framebuffer getColourFBO();
OpenGL::Texture getTexture(Texture& tex);
OpenGL::Program& getSpecializedShader();
OpenGL::Program& getSpecializedShader(const PICA::FragmentConfig& fsConfig);
PICA::ShaderGen::FragmentGenerator fragShaderGen;

View file

@ -0,0 +1,114 @@
#include "renderer_gl/async_compiler.hpp"
#include "PICA/pica_frag_config.hpp"
#include "PICA/shader_gen.hpp"
#include "glad/gl.h"
#include "opengl.hpp"
// Construction immediately kicks off the background compiler thread
AsyncCompilerState::AsyncCompilerState(PICA::ShaderGen::FragmentGenerator& fragShaderGenRef) : fragShaderGen(fragShaderGenRef) { Start(); }
// Destruction stops the worker thread and frees anything left in the queues
AsyncCompilerState::~AsyncCompilerState() { Stop(); }
// Called from the emulator thread to queue a fragment configuration for compilation.
// The config is copied to the heap because the SPSC queue can only hold trivial
// types, so pointers are passed through it; ownership transfers to the compiler thread.
// Returns false if the queue is full, true otherwise.
bool AsyncCompilerState::PushFragmentConfig(const PICA::FragmentConfig& config)
{
	PICA::FragmentConfig* newConfig = new PICA::FragmentConfig(config);
	bool pushedSuccessfully = configQueue.Push(newConfig);
	if (!pushedSuccessfully) {
		// The queue rejected the pointer, so nothing owns the copy — free it here
		// instead of leaking it before (or in builds where panic is non-fatal, after) the panic
		delete newConfig;
		Helpers::panic("Hlep we managed to fill the shader queue");
	}

	return pushedSuccessfully;
}
// Called from the emulator thread; fetches one finished program if any is queued.
// Returns true when `program` was populated, false when the queue was empty.
bool AsyncCompilerState::PopCompiledProgram(CompiledProgram*& program) { return compiledQueue.Pop(program); }
// Intended to create the compiler thread's own OpenGL context so it can compile
// shaders without touching the emulator thread's context. Currently a stub.
void AsyncCompilerState::createGLContext() {
	// TODO: do me
}
void AsyncCompilerState::Start() {
shaderCompilationThread = std::thread([this]() {
createGLContext();
std::string defaultShadergenVSSource = fragShaderGen.getDefaultVertexShader();
defaultShadergenVs.create({defaultShadergenVSSource.c_str(), defaultShadergenVSSource.size()}, OpenGL::Vertex);
running = true;
while (running.load(std::memory_order_relaxed)) {
PICA::FragmentConfig* fsConfig;
if (configQueue.Pop(fsConfig)) {
OpenGL::Program glProgram;
std::string fs = fragShaderGen.generate(*fsConfig);
OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment);
glProgram.create({defaultShadergenVs, fragShader});
CompiledProgram* program = new CompiledProgram(*fsConfig);
GLint size;
glGetProgramiv(glProgram.handle(), GL_PROGRAM_BINARY_LENGTH, &size);
if (size == 0) {
Helpers::panic("Failed to get program binary size");
}
program->binary.resize(size);
GLint bytesWritten;
glGetProgramBinary(glProgram.handle(), size, &bytesWritten, &program->binaryFormat, program->binary.data());
if (bytesWritten != size || bytesWritten == 0) {
Helpers::panic("Failed to get program binary");
}
delete fsConfig;
bool pushedSuccessfully = compiledQueue.Push(program);
if (!pushedSuccessfully) {
Helpers::panic("Hlep we managed to fill the shader queue");
}
}
// Sleep for a bit to avoid excessive CPU usage
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
destroyGLContext();
});
}
// Signals the worker to exit, waits for it, then frees every heap-allocated
// payload still sitting in either queue.
void AsyncCompilerState::Stop() {
	running = false;

	// Guard the join: Stop() is called both explicitly (the renderer calls it in
	// deinitGraphicsContext) and again from the destructor. Joining a thread that
	// is no longer joinable throws std::system_error, so check first.
	if (shaderCompilationThread.joinable()) {
		shaderCompilationThread.join();
	}

	// The worker has exited, so only this thread touches the queues now and it's
	// safe to drain both and delete the in-flight allocations
	CompiledProgram* program;
	while (compiledQueue.Pop(program)) {
		delete program;
	}

	PICA::FragmentConfig* config;
	while (configQueue.Pop(config)) {
		delete config;
	}
}

View file

@ -6,17 +6,46 @@
#include "config.hpp"
#include "PICA/float_types.hpp"
#include "PICA/pica_frag_config.hpp"
#include "PICA/pica_frag_uniforms.hpp"
#include "PICA/gpu.hpp"
#include "PICA/regs.hpp"
#include "math_util.hpp"
#include "opengl.hpp"
#include "renderer_gl/async_compiler.hpp"
#include "renderer_gl/gl_state.hpp"
CMRC_DECLARE(RendererGL);
using namespace Floats;
using namespace Helpers;
using namespace PICA;
namespace {
	// UBO binding point shared by every specialized shader's FragmentUniforms block
	constexpr uint uboBlockBinding = 2;

	// One-time setup for a freshly linked/loaded specialized shader program:
	// assigns the fixed texture units and allocates + binds its uniform buffer.
	// NOTE(review): glUniform1i operates on the *currently bound* program — this
	// assumes the caller has made `programEntry.program` current before calling;
	// verify at call sites (e.g. right after glProgramBinary in drawVertices).
	void initializeProgramEntry(GLStateManager& gl, CachedProgram& programEntry) {
		OpenGL::Program& program = programEntry.program;

		// Init sampler objects. Texture 0 goes in texture unit 0, texture 1 in TU 1, texture 2 in TU 2, and the light maps go in TU 3
		glUniform1i(OpenGL::uniformLocation(program, "u_tex0"), 0);
		glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1);
		glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2);
		glUniform1i(OpenGL::uniformLocation(program, "u_tex_lighting_lut"), 3);

		// Allocate memory for the program UBO
		glGenBuffers(1, &programEntry.uboBinding);
		gl.bindUBO(programEntry.uboBinding);
		glBufferData(GL_UNIFORM_BUFFER, sizeof(PICA::FragmentUniforms), nullptr, GL_DYNAMIC_DRAW);

		// Set up the binding for our UBO. Sadly we can't specify it in the shader like normal people,
		// As it's an OpenGL 4.2 feature that MacOS doesn't support...
		uint uboIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms");
		glUniformBlockBinding(program.handle(), uboIndex, uboBlockBinding);
	}
}
RendererGL::~RendererGL() {}
void RendererGL::reset() {
@ -170,6 +199,12 @@ void RendererGL::initGraphicsContextInternal() {
// Initialize the default vertex shader used with shadergen
std::string defaultShadergenVSSource = fragShaderGen.getDefaultVertexShader();
defaultShadergenVs.create({defaultShadergenVSSource.c_str(), defaultShadergenVSSource.size()}, OpenGL::Vertex);
if (shaderMode == ShaderMode::Hybrid && !asyncCompiler)
{
// This will create and start the async compiler thread
asyncCompiler = std::make_unique<AsyncCompilerState>(fragShaderGen);
}
}
// The OpenGL renderer doesn't need to do anything with the GL context (For Qt frontend) or the SDL window (For SDL frontend)
@ -426,13 +461,52 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) {
usingUbershader = false;
}
}
if (usingUbershader) {
gl.useProgram(triangleProgram);
} else {
OpenGL::Program& program = getSpecializedShader();
gl.useProgram(program);
PICA::FragmentConfig fsConfig(regs);
// If shaderMode is Specialized, shaderCompiled is set to true which means getSpecializedShader will
// compile the shader for us if it's not compiled yet
bool shaderCompiled = true;
if (shaderMode == ShaderMode::Hybrid) {
CompiledProgram* compiledProgram;
// Pop all the queued compiled programs so they can be added to the shader cache
while (asyncCompiler->PopCompiledProgram(compiledProgram)) {
CachedProgram& programEntry = shaderCache[compiledProgram->fsConfig];
programEntry.ready = true;
glProgramBinary(programEntry.program.handle(), compiledProgram->binaryFormat, compiledProgram->binary.data(), compiledProgram->binary.size());
initializeProgramEntry(gl, programEntry);
delete compiledProgram;
}
bool contains = shaderCache.contains(fsConfig);
shaderCompiled = contains && shaderCache[fsConfig].ready;
if (!shaderCompiled) {
gl.useProgram(triangleProgram);
if (!contains) {
// Adds an empty shader to the shader cache and sets it to false
// This will prevent queueing the same shader multiple times
shaderCache[fsConfig].ready = false;
asyncCompiler->PushFragmentConfig(fsConfig);
}
}
}
if (shaderCompiled) {
OpenGL::Program& program = getSpecializedShader(fsConfig);
gl.useProgram(program);
} else {
useUbershader = true;
}
}
if (useUbershader) {
gl.useProgram(triangleProgram);
}
const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
@ -460,7 +534,7 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
static constexpr std::array<GLenum, 8> depthModes = {GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL};
// Update ubershader uniforms
if (usingUbershader) {
if (useUbershader) {
const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1;
@ -837,15 +911,16 @@ std::optional<ColourBuffer> RendererGL::getColourBuffer(u32 addr, PICA::ColorFmt
return colourBufferCache.add(sampleBuffer);
}
OpenGL::Program& RendererGL::getSpecializedShader() {
constexpr uint uboBlockBinding = 2;
PICA::FragmentConfig fsConfig(regs);
OpenGL::Program& RendererGL::getSpecializedShader(const PICA::FragmentConfig& fsConfig) {
CachedProgram& programEntry = shaderCache[fsConfig];
OpenGL::Program& program = programEntry.program;
if (!program.exists()) {
if (shaderMode == ShaderMode::Hybrid) {
// If the shader mode is hybrid, we shouldn't reach this point
Helpers::panic("Trying to compile specialized shader from main thread in hybrid mode");
}
std::string fs = fragShaderGen.generate(fsConfig);
OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment);
@ -854,16 +929,7 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
fragShader.free();
// Init sampler objects. Texture 0 goes in texture unit 0, texture 1 in TU 1, texture 2 in TU 2, and the light maps go in TU 3
glUniform1i(OpenGL::uniformLocation(program, "u_tex0"), 0);
glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1);
glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2);
glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3);
// Set up the binding for our UBO. Sadly we can't specify it in the shader like normal people,
// As it's an OpenGL 4.2 feature that MacOS doesn't support...
uint uboIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms");
glUniformBlockBinding(program.handle(), uboIndex, uboBlockBinding);
initializeProgramEntry(gl, programEntry);
}
glBindBufferBase(GL_UNIFORM_BUFFER, uboBlockBinding, shadergenFragmentUBO);
@ -995,6 +1061,7 @@ void RendererGL::deinitGraphicsContext() {
depthBufferCache.reset();
colourBufferCache.reset();
clearShaderCache();
if (asyncCompiler) asyncCompiler->Stop();
// All other GL objects should be invalidated automatically and be recreated by the next call to initGraphicsContext
// TODO: Make it so that depth and colour buffers get written back to 3DS memory
@ -1023,6 +1090,10 @@ void RendererGL::setUbershader(const std::string& shader) {
glUniform1i(ubershaderData.depthmapEnableLoc, oldDepthmapEnable);
}
// Updates the renderer's shader mode (e.g. Hybrid/Specialized); the new mode is
// picked up by the next drawVertices call
void RendererGL::setShaderMode(ShaderMode mode) {
	shaderMode = mode;
}
void RendererGL::initUbershader(OpenGL::Program& program) {
gl.useProgram(program);

21
third_party/lockfree/LICENSE vendored Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023 Djordje Nedic
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

75
third_party/lockfree/README.md vendored Executable file
View file

@ -0,0 +1,75 @@
# lockfree
![CMake](https://github.com/DNedic/lockfree/actions/workflows/.github/workflows/cmake.yml/badge.svg)
`lockfree` is a collection of lock-free data structures written in standard C++11 and suitable for all platforms - from deeply embedded to HPC.
## What are lock-free data structures?
Lock-free data structures are data structures that are thread and interrupt safe for concurrent use without having to use mutual exclusion mechanisms. They are most useful for inter process communication, and often scale much better than lock-based structures with the number of operations and threads.
## Why use `lockfree`
* Written in standard C++11, compatible with all platforms supporting it
* All data structures are thread and interrupt safe in their respective usecases
* No dynamic allocation
* Optimized for high performance
* MIT Licensed
* Additional APIs for newer C++ versions
## What data structures are available?
### Single-producer single-consumer data structures
* [Queue](docs/spsc/queue.md) - Best for single element operations, extremely fast, simple API consisting of only 2 methods.
* [Ring Buffer](docs/spsc/ring_buf.md) - A more general data structure with the ability to handle multiple elements at a time, uses standard library copies making it very fast for bulk operations.
* [Bipartite Buffer](docs/spsc/bipartite_buf.md) - A variation of the ring buffer with the ability to always provide linear space in the buffer, enables in-buffer processing.
* [Priority Queue](docs/spsc/priority_queue.md) - A Variation of the queue with the ability to provide different priorities for elements, very useful for things like signals, events and communication packets.
These data structures are more performant and should generally be used whenever there is only one thread/interrupt pushing data and another one retrieving it.
### Multi-producer multi-consumer data structures
* [Queue](docs/mpmc/queue.md) - Best for single element operations, extremely fast, simple API consisting of only 2 methods.
* [Priority Queue](docs/mpmc/priority_queue.md) - A Variation of the queue with the ability to provide different priorities for elements, very useful for things like signals, events and communication packets.
These data structures are more general, supporting multiple producers and consumers at the same time, however they have storage and performance overhead compared to single producer single consumer data structures. They also require atomic instructions which can be missing from some low-end microcontrollers.
## How to get
There are three main ways to get the library:
* Using CMake [FetchContent()](https://cmake.org/cmake/help/latest/module/FetchContent.html)
* As a [git submodule](https://git-scm.com/book/en/v2/Git-Tools-Submodules)
* By downloading a release from GitHub
## Configuration
`lockfree` uses cacheline alignment for indexes to avoid the [False Sharing](https://en.wikipedia.org/wiki/False_sharing) phenomenon by default, avoiding the performance loss of cacheline invalidation on cache coherent systems. This aligns the indexes to ```LOCKFREE_CACHELINE_LENGTH```, ```64``` by default.
On embedded systems, ```LOCKFREE_CACHE_COHERENT``` should almost always be set as ```false``` to avoid wasting memory.
Additionally, some systems have a non-typical cacheline length (for instance the apple M1/M2 CPUs have a cacheline length of 128 bytes), and ```LOCKFREE_CACHELINE_LENGTH``` should be set accordingly in those cases.
## Known limitations
All of the data structures in `lockfree` are only meant to be used for [trivial](https://en.cppreference.com/w/cpp/language/classes#Trivial_class) types.
## FAQ
### Why would I use this over locking data structures on a hosted machine?
The biggest reason you would want to use a lock-free data structure on hosted environments would be avoiding issues surrounding locking such as deadlocks, priority inversion and nondeterministic access latency. When used properly, lock-free data structures can also improve performance in some scenarios.
Additionally, `lockfree` provides a way to build applications and libraries that can be compiled to work on both POSIX and non-POSIX environments without `#ifdef`s or polymorphism.
### Why use this over RTOS-provided IPC mechanisms on an embedded system?
While locking usually isn't expensive on embedded systems such as microcontrollers, there is a wide variety of RTOS-es and no standardized API for locking. The fact that multiple architectures are present from 8051 to RISC-V means that interrupt management methods are not standardized either.
`lockfree` provides a way to build portable embedded code with a negligible performance cost as opposed to locking, code using `lockfree` can be compiled to run on any embedded platform supporting C++11. Additionally, the code can easily be tested on a host machine without the need for mocking.
### What advantages does using C++ over C provide for the library?
* Type safety, as data structures are type and size templated
* Much simpler and less error-prone instantiation
* Higher performance due to compile-time known size and header-only implementation
* Encapsulation, the data buffer is a class member instead of being passed by a pointer
### What is the formal classification of the data structures in `lockfree`?
All structures in `lockfree` are **bounded**, **array-based** and **lock-free**, spsc data structures are also **waitfree** and **termination safe**.
## Theory and references
For more insight into lock-free programming, take a look at:
* This [brilliant talk series](https://youtu.be/c1gO9aB9nbs) from Herb Sutter
* [Live Lock-Free or Deadlock](https://youtu.be/lVBvHbJsg5Y) talk series from Fedor Pikus
* Dmitry Vyukov's excellent [blog](https://www.1024cores.net/home/lock-free-algorithms/introduction)

110
third_party/lockfree/spsc/queue.hpp vendored Executable file
View file

@ -0,0 +1,110 @@
/**************************************************************
* @file queue.hpp
* @brief A queue implementation written in standard c++11
* suitable for both low-end microcontrollers all the way
* to HPC machines. Lock-free for single consumer single
* producer scenarios.
**************************************************************/
/**************************************************************
* Copyright (c) 2023 Djordje Nedic
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software
* without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to
* whom the Software is furnished to do so, subject to the
* following conditions:
*
* The above copyright notice and this permission notice shall
* be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
* KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
* WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
* PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* This file is part of lockfree
*
* Author: Djordje Nedic <nedic.djordje2@gmail.com>
* Version: v2.0.8
**************************************************************/
/************************** INCLUDE ***************************/
#ifndef LOCKFREE_QUEUE_HPP
#define LOCKFREE_QUEUE_HPP
#include <atomic>
#include <cstddef>
#include <type_traits>
#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
#include <optional>
#endif
namespace lockfree {
namespace spsc {
/*************************** TYPES ****************************/
/**
 * @brief Bounded lock-free FIFO queue, safe for exactly one producer thread and
 * one consumer thread. Can hold at most size - 1 elements (one slot is kept
 * empty to distinguish full from empty).
 */
template <typename T, size_t size> class Queue {
    static_assert(std::is_trivial<T>::value, "The type T must be trivial");
    static_assert(size > 2, "Buffer size must be bigger than 2");

    /********************** PUBLIC METHODS ************************/
  public:
    Queue();

    /**
     * @brief Adds an element into the queue.
     * Should only be called from the producer thread.
     * @param[in] element
     * @retval Operation success (false when the queue is full)
     */
    bool Push(const T &element);

    /**
     * @brief Removes an element from the queue.
     * Should only be called from the consumer thread.
     * @param[out] element
     * @retval Operation success (false when the queue is empty)
     */
    bool Pop(T &element);

#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
    /**
     * @brief Removes an element from the queue.
     * Should only be called from the consumer thread.
     * @retval Either the element or nothing
     */
    std::optional<T> PopOptional();
#endif

    /********************** PRIVATE MEMBERS ***********************/
  private:
    T _data[size]; /**< Data array */
#if LOCKFREE_CACHE_COHERENT
    /* Indexes live on separate cache lines to avoid false sharing between the
       producer and consumer threads */
    alignas(LOCKFREE_CACHELINE_LENGTH) std::atomic_size_t _r; /**< Read index */
    alignas(
        LOCKFREE_CACHELINE_LENGTH) std::atomic_size_t _w; /**< Write index */
#else
    std::atomic_size_t _r; /**< Read index */
    std::atomic_size_t _w; /**< Write index */
#endif
};
} /* namespace spsc */
} /* namespace lockfree */
/************************** INCLUDE ***************************/
/* Include the implementation */
#include "queue_impl.hpp"
#endif /* LOCKFREE_QUEUE_HPP */

111
third_party/lockfree/spsc/queue_impl.hpp vendored Normal file
View file

@ -0,0 +1,111 @@
/**************************************************************
* @file queue_impl.hpp
* @brief A queue implementation written in standard c++11
* suitable for both low-end microcontrollers all the way
* to HPC machines. Lock-free for single consumer single
* producer scenarios.
**************************************************************/
/**************************************************************
* Copyright (c) 2023 Djordje Nedic
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software
* without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to
* whom the Software is furnished to do so, subject to the
* following conditions:
*
* The above copyright notice and this permission notice shall
* be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
* KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
* WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
* PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* This file is part of lockfree
*
* Author: Djordje Nedic <nedic.djordje2@gmail.com>
* Version: v2.0.8
**************************************************************/
namespace lockfree {
namespace spsc {
/********************** PUBLIC METHODS ************************/
template <typename T, size_t size> Queue<T, size>::Queue() : _r(0U), _w(0U) {}
/* Producer-thread only: enqueues one element; returns false if the queue is full */
template <typename T, size_t size> bool Queue<T, size>::Push(const T &element) {
    /*
        The full check needs to be performed using the next write index not to
        miss the case when the read index wrapped and write index is at the end
    */
    /* relaxed is enough: only the producer ever writes _w */
    const size_t w = _w.load(std::memory_order_relaxed);
    size_t w_next = w + 1;
    if (w_next == size) {
        w_next = 0U;
    }

    /* Full check; acquire pairs with the consumer's release store of _r */
    const size_t r = _r.load(std::memory_order_acquire);
    if (w_next == r) {
        return false;
    }

    /* Place the element */
    _data[w] = element;

    /* Store the next write index; release publishes the element write above
       to the consumer before it can observe the new index */
    _w.store(w_next, std::memory_order_release);
    return true;
}
/* Consumer-thread only: dequeues one element; returns false if the queue is empty */
template <typename T, size_t size> bool Queue<T, size>::Pop(T &element) {
    /* Preload indexes with adequate memory ordering:
       relaxed for _r (only the consumer writes it),
       acquire for _w (pairs with the producer's release store) */
    size_t r = _r.load(std::memory_order_relaxed);
    const size_t w = _w.load(std::memory_order_acquire);

    /* Empty check */
    if (r == w) {
        return false;
    }

    /* Remove the element */
    element = _data[r];

    /* Increment the read index, wrapping at the end of the buffer */
    r++;
    if (r == size) {
        r = 0U;
    }

    /* Store the read index; release lets the producer safely reuse the slot */
    _r.store(r, std::memory_order_release);
    return true;
}
/********************* std::optional API **********************/
#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
/* Convenience wrapper over Pop() that returns the element wrapped in a
   std::optional instead of using an out-parameter. Consumer-thread only. */
template <typename T, size_t size>
std::optional<T> Queue<T, size>::PopOptional() {
    T popped;
    if (Pop(popped)) {
        return popped;
    }
    return {};
}
#endif
} /* namespace spsc */
} /* namespace lockfree */