diff --git a/.github/mac-bundle.sh b/.github/mac-bundle.sh
new file mode 100755
index 00000000..83947a24
--- /dev/null
+++ b/.github/mac-bundle.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+# Taken from pcsx-redux create-app-bundle.sh
+PATH="$PATH:/usr/libexec"  # PlistBuddy lives in /usr/libexec
+
+
+# Construct the app iconset.
+mkdir alber.iconset
+convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 16x16 alber.iconset/icon_16x16.png
+convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 32x32 alber.iconset/icon_16x16@2x.png
+convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 32x32 alber.iconset/icon_32x32.png
+convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 64x64 alber.iconset/icon_32x32@2x.png
+convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 128x128 alber.iconset/icon_128x128.png
+convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 256x256 alber.iconset/icon_128x128@2x.png
+convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 256x256 alber.iconset/icon_256x256.png
+convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 512x512 alber.iconset/icon_256x256@2x.png
+convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 512x512 alber.iconset/icon_512x512.png
+convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 1024x1024 alber.iconset/icon_512x512@2x.png
+iconutil --convert icns alber.iconset
+
+# Set up the .app directory
+mkdir -p Alber.app/Contents/MacOS/Libraries
+mkdir Alber.app/Contents/Resources
+
+
+# Copy the built binary into the bundle and make sure it is executable
+cp ./build/Alber Alber.app/Contents/MacOS/Alber
+chmod a+x Alber.app/Contents/MacOS/Alber
+
+# Copy icons into App
+cp alber.icns Alber.app/Contents/Resources/AppIcon.icns
+
+# Add the display name, icon name/file and high-resolution flag to Info.plist, and the project name to version.plist
+PlistBuddy Alber.app/Contents/Info.plist -c "add CFBundleDisplayName string Alber"
+PlistBuddy Alber.app/Contents/Info.plist -c "add CFBundleIconName string AppIcon"
+PlistBuddy Alber.app/Contents/Info.plist -c "add CFBundleIconFile string AppIcon"
+PlistBuddy Alber.app/Contents/Info.plist -c "add NSHighResolutionCapable bool true"
+PlistBuddy Alber.app/Contents/version.plist -c "add ProjectName string Alber"
+
+# Bundle dylibs
+dylibbundler -od -b -x Alber.app/Contents/MacOS/Alber -d Alber.app/Contents/Frameworks/ -p @rpath
+
+# relative rpath
+install_name_tool -add_rpath @loader_path/../Frameworks Alber.app/Contents/MacOS/Alber
\ No newline at end of file
diff --git a/.github/workflows/MacOS_Build.yml b/.github/workflows/MacOS_Build.yml
index 5e0de4bc..d3443faf 100644
--- a/.github/workflows/MacOS_Build.yml
+++ b/.github/workflows/MacOS_Build.yml
@@ -32,8 +32,20 @@ jobs:
       # Build your program with the given configuration
       run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
 
-    - name: Upload executable
+    - name: Install bundle dependencies
+      run: brew install dylibbundler imagemagick
+
+    - name: Run bundle script
+      run: ./.github/mac-bundle.sh
+
+    - name: Sign the App
+      run: codesign --force -s - -vvvv Alber.app
+
+    - name: Zip it up
+      run: zip -r Alber Alber.app
+
+    - name: Upload MacOS App
       uses: actions/upload-artifact@v2
       with:
-        name: MacOS executable
-        path: './build/Alber'
+        name: MacOS Alber App Bundle
+        path: 'Alber.zip'
diff --git a/.gitmodules b/.gitmodules
index a2cac3f2..1b629d30 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -25,3 +25,6 @@
 [submodule "stb"]
 	path = third_party/stb
 	url = https://github.com/nothings/stb
+[submodule "third_party/cmrc"]
+	path = third_party/cmrc
+	url = https://github.com/vector-of-bool/cmrc
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d276af52..1d9c5b07 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,6 +19,7 @@ endif()
 
 option(DISABLE_PANIC_DEV "Make a build with fewer and less intrusive asserts" OFF)
 option(GPU_DEBUG_INFO "Enable additional GPU debugging info" OFF)
+option(ENABLE_OPENGL "Enable OpenGL rendering backend" ON)
 option(ENABLE_LTO "Enable link-time optimization" OFF)
 option(ENABLE_USER_BUILD "Make a user-facing build. These builds have various assertions disabled, LTO, and more" OFF)
 option(ENABLE_HTTP_SERVER "Enable HTTP server. Used for Discord bot support" OFF)
@@ -45,11 +46,13 @@ set(SDL_STATIC ON CACHE BOOL "" FORCE)
 set(SDL_SHARED OFF CACHE BOOL "" FORCE)
 set(SDL_TEST OFF CACHE BOOL "" FORCE)
 add_subdirectory(third_party/SDL2)
-add_subdirectory(third_party/glad)
+
 add_subdirectory(third_party/toml11)
 include_directories(${SDL2_INCLUDE_DIR})
 include_directories(third_party/toml11)
 
+add_subdirectory(third_party/cmrc)
+
 set(BOOST_ROOT "${CMAKE_SOURCE_DIR}/third_party/boost")
 set(Boost_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/third_party/boost")
 set(Boost_NO_SYSTEM_PATHS ON)
@@ -90,9 +93,10 @@ else()
     message(FATAL_ERROR "Currently unsupported CPU architecture")
 endif()
 
-set(SOURCE_FILES src/main.cpp src/emulator.cpp src/io_file.cpp src/gl_state.cpp src/config.cpp
-                 src/core/CPU/cpu_dynarmic.cpp src/core/CPU/dynarmic_cycles.cpp src/core/memory.cpp
-                 src/httpserver.cpp src/stb_image_write.c
+set(SOURCE_FILES src/main.cpp src/emulator.cpp src/io_file.cpp src/config.cpp
+                 src/core/CPU/cpu_dynarmic.cpp src/core/CPU/dynarmic_cycles.cpp src/core/memory.cpp
+                 src/renderer.cpp src/core/renderer_null/renderer_null.cpp src/httpserver.cpp
+                 src/stb_image_write.c src/core/cheats.cpp src/core/action_replay.cpp
 )
 set(CRYPTO_SOURCE_FILES src/core/crypto/aes_engine.cpp)
 set(KERNEL_SOURCE_FILES src/core/kernel/kernel.cpp src/core/kernel/resource_limits.cpp
@@ -117,38 +121,36 @@ set(PICA_SOURCE_FILES src/core/PICA/gpu.cpp src/core/PICA/regs.cpp src/core/PICA
                       src/core/PICA/dynapica/shader_rec_emitter_x64.cpp src/core/PICA/pica_hash.cpp
 )
 
-set(RENDERER_GL_SOURCE_FILES src/core/renderer_gl/renderer_gl.cpp src/core/renderer_gl/textures.cpp src/core/renderer_gl/etc1.cpp)
-
 set(LOADER_SOURCE_FILES src/core/loader/elf.cpp src/core/loader/ncsd.cpp src/core/loader/ncch.cpp src/core/loader/lz77.cpp)
 set(FS_SOURCE_FILES src/core/fs/archive_self_ncch.cpp src/core/fs/archive_save_data.cpp src/core/fs/archive_sdmc.cpp
                     src/core/fs/archive_ext_save_data.cpp src/core/fs/archive_ncch.cpp
 )
 
-set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/opengl.hpp include/termcolor.hpp
-                 include/cpu.hpp include/cpu_dynarmic.hpp include/memory.hpp include/kernel/kernel.hpp
+set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
+                 include/cpu.hpp include/cpu_dynarmic.hpp include/memory.hpp include/renderer.hpp include/kernel/kernel.hpp
                  include/dynarmic_cp15.hpp include/kernel/resource_limits.hpp include/kernel/kernel_types.hpp
                  include/kernel/config_mem.hpp include/services/service_manager.hpp include/services/apt.hpp
                  include/kernel/handles.hpp include/services/hid.hpp include/services/fs.hpp
-                 include/services/gsp_gpu.hpp include/services/gsp_lcd.hpp include/arm_defs.hpp
+                 include/services/gsp_gpu.hpp include/services/gsp_lcd.hpp include/arm_defs.hpp include/renderer_null/renderer_null.hpp
                  include/PICA/gpu.hpp include/PICA/regs.hpp include/services/ndm.hpp
                  include/PICA/shader.hpp include/PICA/shader_unit.hpp include/PICA/float_types.hpp
                  include/logger.hpp include/loader/ncch.hpp include/loader/ncsd.hpp include/io_file.hpp
                  include/loader/lz77.hpp include/fs/archive_base.hpp include/fs/archive_self_ncch.hpp
                  include/services/dsp.hpp include/services/cfg.hpp include/services/region_codes.hpp
                  include/fs/archive_save_data.hpp include/fs/archive_sdmc.hpp include/services/ptm.hpp
-                 include/services/mic.hpp include/services/cecd.hpp include/renderer_gl/renderer_gl.hpp
-                 include/renderer_gl/surfaces.hpp include/renderer_gl/surface_cache.hpp include/services/ac.hpp
+                 include/services/mic.hpp include/services/cecd.hpp include/services/ac.hpp
                  include/services/am.hpp include/services/boss.hpp include/services/frd.hpp include/services/nim.hpp
                  include/fs/archive_ext_save_data.hpp include/services/shared_font.hpp include/fs/archive_ncch.hpp
-                 include/renderer_gl/textures.hpp include/colour.hpp include/services/y2r.hpp include/services/cam.hpp
+                 include/colour.hpp include/services/y2r.hpp include/services/cam.hpp
                  include/services/ldr_ro.hpp include/ipc.hpp include/services/act.hpp include/services/nfc.hpp
                  include/system_models.hpp include/services/dlp_srvr.hpp include/PICA/dynapica/pica_recs.hpp
                  include/PICA/dynapica/x64_regs.hpp include/PICA/dynapica/vertex_loader_rec.hpp include/PICA/dynapica/shader_rec.hpp
                  include/PICA/dynapica/shader_rec_emitter_x64.hpp include/PICA/pica_hash.hpp include/result/result.hpp
                  include/result/result_common.hpp include/result/result_fs.hpp include/result/result_fnd.hpp
                  include/result/result_gsp.hpp include/result/result_kernel.hpp include/result/result_os.hpp
-                 include/crypto/aes_engine.hpp include/metaprogramming.hpp include/PICA/pica_vertex.hpp include/gl_state.hpp
-                 include/config.hpp include/services/ir_user.hpp include/httpserver.hpp
+                 include/crypto/aes_engine.hpp include/metaprogramming.hpp include/PICA/pica_vertex.hpp
+                 include/config.hpp include/services/ir_user.hpp include/httpserver.hpp include/cheats.hpp
+                 include/action_replay.hpp
 )
 
 set(THIRD_PARTY_SOURCE_FILES third_party/imgui/imgui.cpp
@@ -160,8 +162,6 @@ set(THIRD_PARTY_SOURCE_FILES third_party/imgui/imgui.cpp
                              third_party/cityhash/cityhash.cpp
                              third_party/xxhash/xxhash.c
 )
-
-source_group("Header Files\\Core" FILES ${HEADER_FILES})
 source_group("Source Files\\Core" FILES ${SOURCE_FILES})
 source_group("Source Files\\Core\\Crypto" FILES ${CRYPTO_SOURCE_FILES})
 source_group("Source Files\\Core\\Filesystem" FILES ${FS_SOURCE_FILES})
@@ -169,20 +169,64 @@ source_group("Source Files\\Core\\Kernel" FILES ${KERNEL_SOURCE_FILES})
 source_group("Source Files\\Core\\Loader" FILES ${LOADER_SOURCE_FILES})
 source_group("Source Files\\Core\\Services" FILES ${SERVICE_SOURCE_FILES})
 source_group("Source Files\\Core\\PICA" FILES ${PICA_SOURCE_FILES})
-source_group("Source Files\\Core\\OpenGL Renderer" FILES ${RENDERER_GL_SOURCE_FILES})
 source_group("Source Files\\Third Party" FILES ${THIRD_PARTY_SOURCE_FILES})
 
-add_executable(Alber ${SOURCE_FILES} ${FS_SOURCE_FILES} ${CRYPTO_SOURCE_FILES} ${KERNEL_SOURCE_FILES} ${LOADER_SOURCE_FILES} ${SERVICE_SOURCE_FILES}
-${PICA_SOURCE_FILES} ${RENDERER_GL_SOURCE_FILES} ${THIRD_PARTY_SOURCE_FILES} ${HEADER_FILES})
+# Stays empty unless the OpenGL renderer is compiled in
+set(RENDERER_GL_SOURCE_FILES "")
+if(ENABLE_OPENGL)
+    add_subdirectory(third_party/glad)
+
+    set(RENDERER_GL_INCLUDE_FILES
+        include/renderer_gl/opengl.hpp include/renderer_gl/renderer_gl.hpp
+        include/renderer_gl/textures.hpp include/renderer_gl/surfaces.hpp
+        include/renderer_gl/surface_cache.hpp include/renderer_gl/gl_state.hpp
+    )
+    list(APPEND HEADER_FILES ${RENDERER_GL_INCLUDE_FILES})
+
+    # The host shaders are listed as sources here and also registered with cmrc_add_resource_library below
+    set(RENDERER_GL_SOURCE_FILES
+        src/core/renderer_gl/renderer_gl.cpp src/core/renderer_gl/textures.cpp
+        src/core/renderer_gl/etc1.cpp src/core/renderer_gl/gl_state.cpp
+        src/host_shaders/opengl_display.frag src/host_shaders/opengl_display.vert
+        src/host_shaders/opengl_vertex_shader.vert src/host_shaders/opengl_fragment_shader.frag
+    )
+    source_group("Source Files\\Core\\OpenGL Renderer" FILES ${RENDERER_GL_SOURCE_FILES})
+
+    cmrc_add_resource_library(
+        resources_renderer_gl
+        NAMESPACE RendererGL
+        WHENCE "src/host_shaders/"
+        "src/host_shaders/opengl_display.frag"
+        "src/host_shaders/opengl_display.vert"
+        "src/host_shaders/opengl_vertex_shader.vert"
+        "src/host_shaders/opengl_fragment_shader.frag"
+    )
+endif()
+
+source_group("Header Files\\Core" FILES ${HEADER_FILES})
+
+# Sources that are always built; the OpenGL renderer sources are appended only when that backend is enabled
+set(ALL_SOURCES ${SOURCE_FILES} ${FS_SOURCE_FILES} ${CRYPTO_SOURCE_FILES} ${KERNEL_SOURCE_FILES} ${LOADER_SOURCE_FILES}
+    ${SERVICE_SOURCE_FILES} ${PICA_SOURCE_FILES} ${THIRD_PARTY_SOURCE_FILES} ${HEADER_FILES})
+if(ENABLE_OPENGL)
+    list(APPEND ALL_SOURCES ${RENDERER_GL_SOURCE_FILES})
+endif()
+
+add_executable(Alber ${ALL_SOURCES})
 
 if(ENABLE_LTO OR ENABLE_USER_BUILD)
   set_target_properties(Alber PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE)
 endif()
 
-target_link_libraries(Alber PRIVATE dynarmic SDL2-static glad cryptopp)
+target_link_libraries(Alber PRIVATE dynarmic SDL2-static cryptopp)
+# PRIVATE like the GPU_DEBUG_INFO define below: Alber is an executable, so there are no consumers to propagate to
+if(ENABLE_OPENGL)
+    target_compile_definitions(Alber PRIVATE "PANDA3DS_ENABLE_OPENGL=1")
+    target_link_libraries(Alber PRIVATE glad resources_renderer_gl)
+endif()
 
 if(GPU_DEBUG_INFO)
-  target_compile_definitions(Alber PRIVATE GPU_DEBUG_INFO=1)
+    target_compile_definitions(Alber PRIVATE GPU_DEBUG_INFO=1)
 endif()
 
 if(ENABLE_USER_BUILD)
diff --git a/docs/img/alber-icon.ico b/docs/img/alber-icon.ico
new file mode 100644
index 00000000..b6251a0e
Binary files /dev/null and b/docs/img/alber-icon.ico differ
diff --git a/include/PICA/dynapica/shader_rec.hpp b/include/PICA/dynapica/shader_rec.hpp
index b7d37b02..e8b6afed 100644
--- a/include/PICA/dynapica/shader_rec.hpp
+++ b/include/PICA/dynapica/shader_rec.hpp
@@ -21,7 +21,7 @@ class ShaderJIT {
 	ShaderCache cache;
 #endif
 
-public:
+  public:
 #ifdef PANDA3DS_SHADER_JIT_SUPPORTED
 	// Call this before starting to process a batch of vertices
 	// This will read the PICA config (uploaded shader and shader operand descriptors) and search if we've already compiled this shader
@@ -29,9 +29,7 @@ public:
 	// The caller must make sure the entrypoint has been properly set beforehand
 	void prepare(PICAShader& shaderUnit);
 	void reset();
-	void run(PICAShader& shaderUnit) {
-		prologueCallback(shaderUnit, entrypointCallback);
-	}
+	void run(PICAShader& shaderUnit) { prologueCallback(shaderUnit, entrypointCallback); }
 
 	static constexpr bool isAvailable() { return true; }
 #else
@@ -44,7 +42,7 @@ public:
 	}
 
 	// Define dummy callback. This should never be called if the shader JIT is not supported
-	using Callback = void(*)(PICAShader& shaderUnit);
+	using Callback = void (*)(PICAShader& shaderUnit);
 	Callback activeShaderCallback = nullptr;
 
 	void reset() {}
diff --git a/include/PICA/dynapica/shader_rec_emitter_x64.hpp b/include/PICA/dynapica/shader_rec_emitter_x64.hpp
index ba37595a..d22ed371 100644
--- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp
+++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp
@@ -2,17 +2,17 @@
 
 // Only do anything if we're on an x64 target with JIT support enabled
 #if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_X64_HOST)
-#include "helpers.hpp"
-#include "logger.hpp"
-#include "PICA/shader.hpp"
-#include "xbyak/xbyak.h"
-#include "xbyak/xbyak_util.h"
-#include "x64_regs.hpp"
-
 #include <vector>
 
+#include "PICA/shader.hpp"
+#include "helpers.hpp"
+#include "logger.hpp"
+#include "x64_regs.hpp"
+#include "xbyak/xbyak.h"
+#include "xbyak/xbyak_util.h"
+
 class ShaderEmitter : public Xbyak::CodeGenerator {
-	static constexpr size_t executableMemorySize = PICAShader::maxInstructionCount * 96; // How much executable memory to alloc for each shader
+	static constexpr size_t executableMemorySize = PICAShader::maxInstructionCount * 96;  // How much executable memory to alloc for each shader
 	// Allocate some extra space as padding for security purposes in the extremely unlikely occasion we manage to overflow the above size
 	static constexpr size_t allocSize = executableMemorySize + 0x1000;
 
@@ -20,7 +20,7 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 	static constexpr uint noSwizzle = 0x1B;
 
 	using f24 = Floats::f24;
-	using vec4f = OpenGL::Vector<f24, 4>;
+	using vec4f = std::array<f24, 4>;
 
 	// An array of labels (incl pointers) to each compiled (to x64) PICA instruction
 	std::array<Xbyak::Label, PICAShader::maxInstructionCount> instructionLabels;
@@ -33,13 +33,22 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 	// Vector value of (1.0, 1.0, 1.0, 1.0) for SLT(i)/SGE(i)
 	Label onesVector;
 
-	u32 recompilerPC = 0; // PC the recompiler is currently recompiling @
-	u32 loopLevel = 0;    // The current loop nesting level (0 = not in a loop)
+	u32 recompilerPC = 0;  // PC the recompiler is currently recompiling @
+	u32 loopLevel = 0;     // The current loop nesting level (0 = not in a loop)
 
 	bool haveSSE4_1 = false;  // Shows if the CPU supports SSE4.1
 	bool haveAVX = false;     // Shows if the CPU supports AVX (NOT AVX2, NOT AVX512. Regular AVX)
 	bool haveFMA3 = false;    // Shows if the CPU supports FMA3
 
+	// Shows whether the loaded shader has any log2 and exp2 instructions
+	bool codeHasLog2 = false;
+	bool codeHasExp2 = false;
+	
+	Xbyak::Label log2Func, exp2Func;
+	Xbyak::Label emitLog2Func();
+	Xbyak::Label emitExp2Func();
+	Xbyak::util::Cpu cpuCaps;
+
 	// Compile all instructions from [current recompiler PC, end)
 	void compileUntil(const PICAShader& shaderUnit, u32 endPC);
 	// Compile instruction "instr"
@@ -49,8 +58,10 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 		const u32 opcode = instruction >> 26;
 		return (opcode == ShaderOpcodes::CALL) || (opcode == ShaderOpcodes::CALLC) || (opcode == ShaderOpcodes::CALLU);
 	}
+
 	// Scan the shader code for call instructions to fill up the returnPCs vector before starting compilation
-	void scanForCalls(const PICAShader& shaderUnit);
+	// We also scan for log2/exp2 instructions to see whether to emit the relevant functions
+	void scanCode(const PICAShader& shaderUnit);
 
 	// Load register with number "srcReg" indexed by index "idx" into the xmm register "reg"
 	template <int sourceIndex>
@@ -105,25 +116,27 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 
 	MAKE_LOG_FUNCTION(log, shaderJITLogger)
 
-public:
-	using InstructionCallback = const void(*)(PICAShader& shaderUnit); // Callback type used for instructions
+  public:
+	// Callback type used for instructions
+	using InstructionCallback = const void (*)(PICAShader& shaderUnit);
 	// Callback type used for the JIT prologue. This is what the caller will call
-	using PrologueCallback = const void(*)(PICAShader& shaderUnit, InstructionCallback cb);
+	using PrologueCallback = const void (*)(PICAShader& shaderUnit, InstructionCallback cb);
+
 	PrologueCallback prologueCb = nullptr;
 
 	// Initialize our emitter with "allocSize" bytes of RWX memory
 	ShaderEmitter() : Xbyak::CodeGenerator(allocSize) {
-		const auto cpu = Xbyak::util::Cpu();
+		// cpuCaps was default-constructed before this body ran, so it already holds the detected CPU features
 
-		haveSSE4_1 = cpu.has(Xbyak::util::Cpu::tSSE41);
-		haveAVX = cpu.has(Xbyak::util::Cpu::tAVX);
-		haveFMA3 = cpu.has(Xbyak::util::Cpu::tFMA);
+		haveSSE4_1 = cpuCaps.has(Xbyak::util::Cpu::tSSE41);
+		haveAVX = cpuCaps.has(Xbyak::util::Cpu::tAVX);
+		haveFMA3 = cpuCaps.has(Xbyak::util::Cpu::tFMA);
 
-		if (!cpu.has(Xbyak::util::Cpu::tSSE3)) {
+		if (!cpuCaps.has(Xbyak::util::Cpu::tSSE3)) {
 			Helpers::panic("This CPU does not support SSE3. Please use the shader interpreter instead");
 		}
 	}
-	
+
 	void compile(const PICAShader& shaderUnit);
 
 	// PC must be a valid entrypoint here. It doesn't have that much overhead in this case, so we use std::array<>::at() to assert it does
@@ -133,9 +146,7 @@ public:
 		return reinterpret_cast<InstructionCallback>(ptr);
 	}
 
-	PrologueCallback getPrologueCallback() {
-		return prologueCb;
-	}
+	PrologueCallback getPrologueCallback() { return prologueCb; }
 };
 
-#endif // x64 recompiler check
\ No newline at end of file
+#endif  // x64 recompiler check
\ No newline at end of file
diff --git a/include/PICA/gpu.hpp b/include/PICA/gpu.hpp
index a4adc816..4304a2de 100644
--- a/include/PICA/gpu.hpp
+++ b/include/PICA/gpu.hpp
@@ -1,39 +1,39 @@
 #pragma once
 #include <array>
 
+#include "PICA/dynapica/shader_rec.hpp"
+#include "PICA/float_types.hpp"
+#include "PICA/pica_vertex.hpp"
+#include "PICA/regs.hpp"
+#include "PICA/shader_unit.hpp"
 #include "config.hpp"
 #include "helpers.hpp"
 #include "logger.hpp"
 #include "memory.hpp"
-#include "PICA/float_types.hpp"
-#include "PICA/regs.hpp"
-#include "PICA/shader_unit.hpp"
-#include "PICA/dynapica/shader_rec.hpp"
-#include "renderer_gl/renderer_gl.hpp"
-#include "PICA/pica_vertex.hpp"
+#include "renderer.hpp"
 
 class GPU {
 	static constexpr u32 regNum = 0x300;
-	using vec4f = OpenGL::Vector<Floats::f24, 4>;
+	using vec4f = std::array<Floats::f24, 4>;
 	using Registers = std::array<u32, regNum>;
 
 	Memory& mem;
 	EmulatorConfig& config;
 	ShaderUnit shaderUnit;
-	ShaderJIT shaderJIT; // Doesn't do anything if JIT is disabled or not supported
+	ShaderJIT shaderJIT;  // Doesn't do anything if JIT is disabled or not supported
 
 	u8* vram = nullptr;
 	MAKE_LOG_FUNCTION(log, gpuLogger)
 
-	static constexpr u32 maxAttribCount = 12; // Up to 12 vertex attributes
+	static constexpr u32 maxAttribCount = 12;  // Up to 12 vertex attributes
 	static constexpr u32 vramSize = u32(6_MB);
-	Registers regs; // GPU internal registers
-	std::array<vec4f, 16> currentAttributes; // Vertex attributes before being passed to the shader
+	Registers regs;                           // GPU internal registers
+	std::array<vec4f, 16> currentAttributes;  // Vertex attributes before being passed to the shader
 
-	std::array<vec4f, 16> immediateModeAttributes; // Vertex attributes uploaded via immediate mode submission
+	std::array<vec4f, 16> immediateModeAttributes;  // Vertex attributes uploaded via immediate mode submission
 	std::array<PICA::Vertex, 3> immediateModeVertices;
 	uint immediateModeVertIndex;
-	uint immediateModeAttrIndex; // Index of the immediate mode attribute we're uploading
+	uint immediateModeAttrIndex;  // Index of the immediate mode attribute we're uploading
 
 	template <bool indexed, bool useShaderJIT>
 	void drawArrays();
@@ -42,35 +42,33 @@ class GPU {
 	void drawArrays(bool indexed);
 
 	struct AttribInfo {
-		u32 offset = 0; // Offset from base vertex array
-		int size = 0; // Bytes per vertex
+		u32 offset = 0;  // Offset from base vertex array
+		int size = 0;    // Bytes per vertex
 		u32 config1 = 0;
 		u32 config2 = 0;
-		u32 componentCount = 0; // Number of components for the attribute
+		u32 componentCount = 0;  // Number of components for the attribute
 
-		u64 getConfigFull() {
-			return u64(config1) | (u64(config2) << 32);
-		}
+		u64 getConfigFull() { return u64(config1) | (u64(config2) << 32); }
 	};
 
 	u64 getVertexShaderInputConfig() {
 		return u64(regs[PICA::InternalRegs::VertexShaderInputCfgLow]) | (u64(regs[PICA::InternalRegs::VertexShaderInputCfgHigh]) << 32);
 	}
 
-	std::array<AttribInfo, maxAttribCount> attributeInfo; // Info for each of the 12 attributes
-	u32 totalAttribCount = 0; // Number of vertex attributes to send to VS
-	u32 fixedAttribMask = 0; // Which attributes are fixed?
-	
-	u32 fixedAttribIndex = 0; // Which fixed attribute are we writing to ([0, 11] range)
-	u32 fixedAttribCount = 0; // How many attribute components have we written? When we get to 4 the attr will actually get submitted
-	std::array<u32, 3> fixedAttrBuff; // Buffer to hold fixed attributes in until they get submitted
+	std::array<AttribInfo, maxAttribCount> attributeInfo;  // Info for each of the 12 attributes
+	u32 totalAttribCount = 0;                              // Number of vertex attributes to send to VS
+	u32 fixedAttribMask = 0;                               // Which attributes are fixed?
+
+	u32 fixedAttribIndex = 0;          // Which fixed attribute are we writing to ([0, 11] range)
+	u32 fixedAttribCount = 0;          // How many attribute components have we written? When we get to 4 the attr will actually get submitted
+	std::array<u32, 3> fixedAttrBuff;  // Buffer to hold fixed attributes in until they get submitted
 
 	// Command processor pointers for GPU command lists
 	u32* cmdBuffStart = nullptr;
 	u32* cmdBuffEnd = nullptr;
 	u32* cmdBuffCurr = nullptr;
 
-	Renderer renderer;
+	std::unique_ptr<Renderer> renderer;
 	PICA::Vertex getImmediateModeVertex();
 
   public:
@@ -84,11 +82,10 @@ class GPU {
 	// Set to false by the renderer when the lighting_lut is uploaded ot the GPU
 	bool lightingLUTDirty = false;
 
-	GPU(Memory& mem, GLStateManager& gl, EmulatorConfig& config);
-	void initGraphicsContext() { renderer.initGraphicsContext(); }
-	void getGraphicsContext() { renderer.getGraphicsContext(); }
-	void display() { renderer.display(); }
-	void screenshot(const std::string& name) { renderer.screenshot(name); }
+	GPU(Memory& mem, EmulatorConfig& config);
+	void initGraphicsContext() { renderer->initGraphicsContext(); }
+	void display() { renderer->display(); }
+	void screenshot(const std::string& name) { renderer->screenshot(name); }
 
 	void fireDMA(u32 dest, u32 source, u32 size);
 	void reset();
@@ -107,13 +104,13 @@ class GPU {
 	// TODO: Emulate the transfer engine & its registers
 	// Then this can be emulated by just writing the appropriate values there
 	void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) {
-		renderer.clearBuffer(startAddress, endAddress, value, control);
+		renderer->clearBuffer(startAddress, endAddress, value, control);
 	}
 
 	// TODO: Emulate the transfer engine & its registers
 	// Then this can be emulated by just writing the appropriate values there
 	void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) {
-		renderer.displayTransfer(inputAddr, outputAddr, inputSize, outputSize, flags);
+		renderer->displayTransfer(inputAddr, outputAddr, inputSize, outputSize, flags);
 	}
 
 	// Read a value of type T from physical address paddr
@@ -132,17 +129,17 @@ class GPU {
 
 	// Get a pointer of type T* to the data starting from physical address paddr
 	template <typename T>
-	T* getPointerPhys(u32 paddr) {
-		if (paddr >= PhysicalAddrs::FCRAM && paddr <= PhysicalAddrs::FCRAMEnd) {
+	T* getPointerPhys(u32 paddr, u32 size = 0) {  // size: byte length of the access, 0 checks only paddr itself
+		if (paddr >= PhysicalAddrs::FCRAM && u64(paddr) + size <= PhysicalAddrs::FCRAMEnd) {  // 64-bit sum: paddr + size must not wrap
 			u8* fcram = mem.getFCRAM();
 			u32 index = paddr - PhysicalAddrs::FCRAM;
 
 			return (T*)&fcram[index];
-		} else if (paddr >= PhysicalAddrs::VRAM && paddr <= PhysicalAddrs::VRAMEnd) {
+		} else if (paddr >= PhysicalAddrs::VRAM && u64(paddr) + size <= PhysicalAddrs::VRAMEnd) {  // same wrap-safe end check for VRAM
 			u32 index = paddr - PhysicalAddrs::VRAM;
 			return (T*)&vram[index];
 		} else [[unlikely]] {
 			Helpers::panic("[GPU] Tried to access unknown physical address: %08X", paddr);
 		}
 	}
-};
\ No newline at end of file
+};
diff --git a/include/PICA/shader.hpp b/include/PICA/shader.hpp
index ad1e0e46..0f3154f1 100644
--- a/include/PICA/shader.hpp
+++ b/include/PICA/shader.hpp
@@ -2,13 +2,14 @@
 #include <algorithm>
 #include <array>
 #include <cstring>
-#include "helpers.hpp"
-#include "opengl.hpp"
+
 #include "PICA/float_types.hpp"
 #include "PICA/pica_hash.hpp"
+#include "helpers.hpp"
 
 enum class ShaderType {
-	Vertex, Geometry
+	Vertex,
+	Geometry,
 };
 
 namespace ShaderOpcodes {
@@ -46,66 +47,66 @@ namespace ShaderOpcodes {
 		SETEMIT = 0x2B,
 		JMPC = 0x2C,
 		JMPU = 0x2D,
-		CMP1 = 0x2E, // Both of these instructions are CMP
+		CMP1 = 0x2E,  // Both of these instructions are CMP
 		CMP2 = 0x2F,
-		MAD = 0x38 // Everything between 0x38-0x3F is a MAD but fuck it
+		MAD = 0x38  // Everything between 0x38-0x3F is a MAD but fuck it
 	};
 }
 
 // Note: All PICA f24 vec4 registers must have the alignas(16) specifier to make them easier to access in SSE/NEON code in the JIT
 class PICAShader {
 	using f24 = Floats::f24;
-	using vec4f = OpenGL::Vector<f24, 4>;
+	using vec4f = std::array<f24, 4>;
 
 	struct Loop {
-		u32 startingPC; // PC at the start of the loop
-		u32 endingPC;   // PC at the end of the loop
-		u32 iterations; // How many iterations of the loop to run
-		u32 increment;  // How much to increment the loop counter after each iteration
+		u32 startingPC;  // PC at the start of the loop
+		u32 endingPC;    // PC at the end of the loop
+		u32 iterations;  // How many iterations of the loop to run
+		u32 increment;   // How much to increment the loop counter after each iteration
 	};
 
 	// Info for ifc/ifu stack
 	struct ConditionalInfo {
-		u32 endingPC; // PC at the end of the if block (= DST)
-		u32 newPC; // PC after the if block is done executing (= DST + NUM)
+		u32 endingPC;  // PC at the end of the if block (= DST)
+		u32 newPC;     // PC after the if block is done executing (= DST + NUM)
 	};
 
 	struct CallInfo {
-		u32 endingPC; // PC at the end of the function
-		u32 returnPC; // PC to return to after the function ends
+		u32 endingPC;  // PC at the end of the function
+		u32 returnPC;  // PC to return to after the function ends
 	};
 
-	int bufferIndex; // Index of the next instruction to overwrite for shader uploads
-	int opDescriptorIndex; // Index of the next operand descriptor we'll overwrite
-	u32 floatUniformIndex = 0; // Which float uniform are we writing to? ([0, 95] range)
-	u32 floatUniformWordCount = 0; // How many words have we buffered for the current uniform transfer?
-	bool f32UniformTransfer = false; // Are we transferring an f32 uniform or an f24 uniform?
+	int bufferIndex;                  // Index of the next instruction to overwrite for shader uploads
+	int opDescriptorIndex;            // Index of the next operand descriptor we'll overwrite
+	u32 floatUniformIndex = 0;        // Which float uniform are we writing to? ([0, 95] range)
+	u32 floatUniformWordCount = 0;    // How many words have we buffered for the current uniform transfer?
+	bool f32UniformTransfer = false;  // Are we transferring an f32 uniform or an f24 uniform?
 
-	std::array<u32, 4> floatUniformBuffer; // Buffer for temporarily caching float uniform data
+	std::array<u32, 4> floatUniformBuffer;  // Buffer for temporarily caching float uniform data
 
-public:
+  public:
 	// These are placed close to the temp registers and co because it helps the JIT generate better code
-	u32 entrypoint = 0; // Initial shader PC
+	u32 entrypoint = 0;  // Initial shader PC
 	u32 boolUniform;
-	std::array<OpenGL::Vector<u8, 4>, 4> intUniforms;
+	std::array<std::array<u8, 4>, 4> intUniforms;
 	alignas(16) std::array<vec4f, 96> floatUniforms;
 
-	alignas(16) std::array<vec4f, 16> fixedAttributes; // Fixed vertex attributes
-	alignas(16) std::array<vec4f, 16> inputs; // Attributes passed to the shader
+	alignas(16) std::array<vec4f, 16> fixedAttributes;  // Fixed vertex attributes
+	alignas(16) std::array<vec4f, 16> inputs;           // Attributes passed to the shader
 	alignas(16) std::array<vec4f, 16> outputs;
-	alignas(16) vec4f dummy = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() }); // Dummy register used by the JIT
+	alignas(16) vec4f dummy = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()});  // Dummy register used by the JIT
 
-protected:
+  protected:
 	std::array<u32, 128> operandDescriptors;
-	alignas(16) std::array<vec4f, 16> tempRegisters; // General purpose registers the shader can use for temp values
-	OpenGL::Vector<s32, 2> addrRegister; // Address register
-	bool cmpRegister[2]; // Comparison registers where the result of CMP is stored in
+	alignas(16) std::array<vec4f, 16> tempRegisters;  // General purpose registers the shader can use for temp values
+	std::array<s32, 2> addrRegister;                  // Address register
+	bool cmpRegister[2];                              // Comparison registers where the result of CMP is stored in
 	u32 loopCounter;
 
-	u32 pc = 0; // Program counter: Index of the next instruction we're going to execute
-	u32 loopIndex = 0; // The index of our loop stack (0 = empty, 4 = full)
-	u32 ifIndex = 0; // The index of our IF stack
-	u32 callIndex = 0; // The index of our CALL stack
+	u32 pc = 0;         // Program counter: Index of the next instruction we're going to execute
+	u32 loopIndex = 0;  // The index of our loop stack (0 = empty, 4 = full)
+	u32 ifIndex = 0;    // The index of our IF stack
+	u32 callIndex = 0;  // The index of our CALL stack
 
 	std::array<Loop, 4> loopInfo;
 	std::array<ConditionalInfo, 8> conditionalInfo;
@@ -117,7 +118,7 @@ protected:
 	// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
 	using Hash = PICAHash::HashType;
 
-	Hash lastCodeHash = 0; // Last hash computed for the shader code (Used for the JIT caching mechanism)
+	Hash lastCodeHash = 0;    // Last hash computed for the shader code (Used for the JIT caching mechanism)
 	Hash lastOpdescHash = 0;  // Last hash computed for the operand descriptors (Also used for the JIT)
 
 	bool codeHashDirty = false;
@@ -130,7 +131,7 @@ protected:
 	vec4f getSource(u32 source);
 	vec4f& getDest(u32 dest);
 
-private:
+  private:
 	// Interpreter functions for the various shader functions
 	void add(u32 instruction);
 	void call(u32 instruction);
@@ -171,13 +172,13 @@ private:
 		bool negate;
 
 		using namespace Helpers;
-		if constexpr (sourceIndex == 1) { // SRC1
+		if constexpr (sourceIndex == 1) {  // SRC1
 			negate = (getBit<4>(opDescriptor)) != 0;
 			compSwizzle = getBits<5, 8>(opDescriptor);
-		} else if constexpr (sourceIndex == 2) { // SRC2
+		} else if constexpr (sourceIndex == 2) {  // SRC2
 			negate = (getBit<13>(opDescriptor)) != 0;
 			compSwizzle = getBits<14, 8>(opDescriptor);
-		} else if constexpr (sourceIndex == 3) { // SRC3
+		} else if constexpr (sourceIndex == 3) {  // SRC3
 			negate = (getBit<22>(opDescriptor)) != 0;
 			compSwizzle = getBits<23, 8>(opDescriptor);
 		}
@@ -185,8 +186,8 @@ private:
 		// Iterate through every component of the swizzled vector in reverse order
 		// And get which source component's index to match it with
 		for (int comp = 0; comp < 4; comp++) {
-			int index = compSwizzle & 3; // Get index for this component
-			compSwizzle >>= 2; // Move to next component index
+			int index = compSwizzle & 3;  // Get index for this component
+			compSwizzle >>= 2;            // Move to next component index
 			ret[3 - comp] = source[index];
 		}
 
@@ -212,39 +213,35 @@ private:
 	u8 getIndexedSource(u32 source, u32 index);
 	bool isCondTrue(u32 instruction);
 
-public:
+  public:
 	static constexpr size_t maxInstructionCount = 4096;
-	std::array<u32, maxInstructionCount> loadedShader; // Currently loaded & active shader
-	std::array<u32, maxInstructionCount> bufferedShader; // Shader to be transferred when the SH_CODETRANSFER_END reg gets written to
+	std::array<u32, maxInstructionCount> loadedShader;    // Currently loaded & active shader
+	std::array<u32, maxInstructionCount> bufferedShader;  // Shader to be transferred when the SH_CODETRANSFER_END reg gets written to
 
 	PICAShader(ShaderType type) : type(type) {}
 
 	// Theese functions are in the header to be inlined more easily, though with LTO I hope I'll be able to move them
-	void finalize() {
-		std::memcpy(&loadedShader[0], &bufferedShader[0], 4096 * sizeof(u32));
-	}
+	void finalize() { std::memcpy(&loadedShader[0], &bufferedShader[0], 4096 * sizeof(u32)); }
 
-	void setBufferIndex(u32 index) {
-		bufferIndex = index & 0xfff;
-	}
-
-	void setOpDescriptorIndex(u32 index) {
-		opDescriptorIndex = index & 0x7f;
-	}
+	void setBufferIndex(u32 index) { bufferIndex = index & 0xfff; }
+	void setOpDescriptorIndex(u32 index) { opDescriptorIndex = index & 0x7f; }
 
 	void uploadWord(u32 word) {
-		if (bufferIndex >= 4095) Helpers::panic("o no, shader upload overflew");
+		if (bufferIndex >= 4095) {
+			Helpers::panic("o no, shader upload overflew");
+		}
+
 		bufferedShader[bufferIndex++] = word;
 		bufferIndex &= 0xfff;
 
-		codeHashDirty = true; // Signal the JIT if necessary that the program hash has potentially changed
+		codeHashDirty = true;  // Signal the JIT if necessary that the program hash has potentially changed
 	}
 
 	void uploadDescriptor(u32 word) {
 		operandDescriptors[opDescriptorIndex++] = word;
 		opDescriptorIndex &= 0x7f;
 
-		opdescHashDirty = true; // Signal the JIT if necessary that the program hash has potentially changed
+		opdescHashDirty = true;  // Signal the JIT if necessary that the program hash has potentially changed
 	}
 
 	void setFloatUniformIndex(u32 word) {
@@ -255,23 +252,24 @@ public:
 
 	void uploadFloatUniform(u32 word) {
 		floatUniformBuffer[floatUniformWordCount++] = word;
-		if (floatUniformIndex >= 96)
+		if (floatUniformIndex >= 96) {
 			Helpers::panic("[PICA] Tried to write float uniform %d", floatUniformIndex);
+		}
 
 		if ((f32UniformTransfer && floatUniformWordCount >= 4) || (!f32UniformTransfer && floatUniformWordCount >= 3)) {
 			vec4f& uniform = floatUniforms[floatUniformIndex++];
 			floatUniformWordCount = 0;
 
 			if (f32UniformTransfer) {
-				uniform.x() = f24::fromFloat32(*(float*)&floatUniformBuffer[3]);
-				uniform.y() = f24::fromFloat32(*(float*)&floatUniformBuffer[2]);
-				uniform.z() = f24::fromFloat32(*(float*)&floatUniformBuffer[1]);
-				uniform.w() = f24::fromFloat32(*(float*)&floatUniformBuffer[0]);
+				uniform[0] = f24::fromFloat32(*(float*)&floatUniformBuffer[3]);
+				uniform[1] = f24::fromFloat32(*(float*)&floatUniformBuffer[2]);
+				uniform[2] = f24::fromFloat32(*(float*)&floatUniformBuffer[1]);
+				uniform[3] = f24::fromFloat32(*(float*)&floatUniformBuffer[0]);
 			} else {
-				uniform.x() = f24::fromRaw(floatUniformBuffer[2] & 0xffffff);
-				uniform.y() = f24::fromRaw(((floatUniformBuffer[1] & 0xffff) << 8) | (floatUniformBuffer[2] >> 24));
-				uniform.z() = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16));
-				uniform.w() = f24::fromRaw(floatUniformBuffer[0] >> 8);
+				uniform[0] = f24::fromRaw(floatUniformBuffer[2] & 0xffffff);
+				uniform[1] = f24::fromRaw(((floatUniformBuffer[1] & 0xffff) << 8) | (floatUniformBuffer[2] >> 24));
+				uniform[2] = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16));
+				uniform[3] = f24::fromRaw(floatUniformBuffer[0] >> 8);
 			}
 		}
 	}
@@ -280,10 +278,10 @@ public:
 		using namespace Helpers;
 
 		auto& u = intUniforms[index];
-		u.x() = word & 0xff;
-		u.y() = getBits<8, 8>(word);
-		u.z() = getBits<16, 8>(word);
-		u.w() = getBits<24, 8>(word);
+		u[0] = word & 0xff;
+		u[1] = getBits<8, 8>(word);
+		u[2] = getBits<16, 8>(word);
+		u[3] = getBits<24, 8>(word);
 	}
 
 	void run();
diff --git a/include/action_replay.hpp b/include/action_replay.hpp
new file mode 100644
index 00000000..a6b97df9
--- /dev/null
+++ b/include/action_replay.hpp
@@ -0,0 +1,52 @@
+#pragma once
+#include <array>
+#include <bitset>
+#include <vector>
+
+#include "helpers.hpp"
+#include "memory.hpp"
+#include "services/hid.hpp"
+
+class ActionReplay {  // Executes Action Replay cheat codes. Constructed from the Memory and HIDService references it keeps as members
+	using Cheat = std::vector<u32>;  // A cheat is really just a bunch of 64-bit opcodes neatly encoded into 32-bit chunks
+	static constexpr size_t ifStackSize = 32;  // Capacity of the if stack. TODO: How big is this, really?
+
+	u32 offset1, offset2;    // Memory offset registers. Non-persistent.
+	u32 data1, data2;        // Data offset registers. Non-persistent.
+	u32 storage1, storage2;  // Storage registers. Persistent.
+
+	// When an instruction does not specify which offset or data register to use, we use the "active" one
+	// Which is by default #1 and may be changed by certain AR operations
+	u32 *activeOffset, *activeData, *activeStorage;
+	u32 ifStackIndex;    // Our index in the if stack. Shows how many entries we have at the moment.
+	u32 loopStackIndex;  // Same but for loops
+	std::bitset<ifStackSize> ifStack;  // Sized from ifStackSize so the capacity is defined in exactly one place
+
+	// Program counter
+	u32 pc = 0;
+	Memory& mem;
+	HIDService& hid;
+
+	// Whether a cheat is still being executed. NOTE(review): presumably cleared once the cheat has ended - confirm in the implementation
+	bool running = false;
+	// Run 1 AR instruction
+	void runInstruction(const Cheat& cheat, u32 instruction);
+
+	// Action Replay has a billion D-type opcodes so this handles all of them
+	void executeDType(const Cheat& cheat, u32 instruction);
+
+	u8 read8(u32 addr);  // 8/16/32-bit read helpers. NOTE(review): presumably routed through mem - confirm in the implementation
+	u16 read16(u32 addr);
+	u32 read32(u32 addr);
+
+	void write8(u32 addr, u8 value);  // 8/16/32-bit write helpers, counterparts of the readers above
+	void write16(u32 addr, u16 value);
+	void write32(u32 addr, u32 value);
+
+	void pushConditionBlock(bool condition);  // NOTE(review): presumably records the outcome of a conditional opcode on ifStack - confirm
+
+  public:
+	ActionReplay(Memory& mem, HIDService& hid);
+	void runCheat(const Cheat& cheat);  // Entry point for running one cheat (a vector of 32-bit words, see Cheat)
+	void reset();
+};
\ No newline at end of file
diff --git a/include/cheats.hpp b/include/cheats.hpp
new file mode 100644
index 00000000..6ada7d20
--- /dev/null
+++ b/include/cheats.hpp
@@ -0,0 +1,32 @@
+#pragma once
+#include <array>
+#include <vector>
+
+#include "action_replay.hpp"
+#include "helpers.hpp"
+#include "services/hid.hpp"
+
+// Forward-declare Memory since it is only passed by reference here. Note that action_replay.hpp (included above) already pulls in memory.hpp, so this does not yet save the include
+class Memory;
+
+class Cheats {  // Owns the list of loaded cheats together with the ActionReplay machine declared below
+  public:
+	enum class CheatType {  // Format of a cheat's instruction stream
+		ActionReplay,  // CTRPF cheats
+		Gateway,
+	};
+
+	struct Cheat {  // One cheat: its format plus its raw 32-bit instruction words
+		CheatType type;
+		std::vector<u32> instructions;
+	};
+
+	Cheats(Memory& mem, HIDService& hid);
+	void addCheat(const Cheat& cheat);  // NOTE(review): presumably appends to `cheats` - confirm in the implementation
+	void reset();
+	void run();  // NOTE(review): presumably executes the stored cheats - confirm in the implementation
+
+  private:
+	ActionReplay ar;  // An ActionReplay cheat machine for executing CTRPF codes
+	std::vector<Cheat> cheats;  // The cheats held by this object
+};
\ No newline at end of file
diff --git a/include/config.hpp b/include/config.hpp
index bdb697bf..6bccdad6 100644
--- a/include/config.hpp
+++ b/include/config.hpp
@@ -1,10 +1,14 @@
 #pragma once
 #include <filesystem>
 
+#include "renderer.hpp"
+
 // Remember to initialize every field here to its default value otherwise bad things will happen
 struct EmulatorConfig {
 	bool shaderJitEnabled = false;
+	RendererType rendererType = RendererType::OpenGL;
 
+	EmulatorConfig(const std::filesystem::path& path);
 	void load(const std::filesystem::path& path);
 	void save(const std::filesystem::path& path);
 };
\ No newline at end of file
diff --git a/include/emulator.hpp b/include/emulator.hpp
index 83b832f6..040b93b2 100644
--- a/include/emulator.hpp
+++ b/include/emulator.hpp
@@ -1,39 +1,51 @@
 #pragma once
 
 #include <SDL.h>
-#include <glad/gl.h>
 
 #include <filesystem>
 #include <fstream>
 #include <optional>
 
 #include "PICA/gpu.hpp"
-#include "cpu.hpp"
+#include "cheats.hpp"
 #include "config.hpp"
+#include "cpu.hpp"
 #include "crypto/aes_engine.hpp"
 #include "io_file.hpp"
 #include "memory.hpp"
-#include "gl_state.hpp"
+
 #ifdef PANDA3DS_ENABLE_HTTP_SERVER
 #include "httpserver.hpp"
 #endif
 
-enum class ROMType { None, ELF, NCSD, CXI };
+enum class ROMType {
+	None,
+	ELF,
+	NCSD,
+	CXI,
+};
 
 class Emulator {
+	EmulatorConfig config;
 	CPU cpu;
 	GPU gpu;
 	Memory memory;
 	Kernel kernel;
 	Crypto::AESEngine aesEngine;
+	Cheats cheats;
 
-	GLStateManager gl;
-	EmulatorConfig config;
 	SDL_Window* window;
+
+#ifdef PANDA3DS_ENABLE_OPENGL
 	SDL_GLContext glContext;
+#endif
+
 	SDL_GameController* gameController = nullptr;
 	int gameControllerID;
 
+	// Shows whether we've loaded any action replay codes
+	bool haveCheats = false;
+
 	// Variables to keep track of whether the user is controlling the 3DS analog stick with their keyboard
 	// This is done so when a gamepad is connected, we won't automatically override the 3DS analog stick settings with the gamepad's state
 	// And so the user can still use the keyboard to control the analog
@@ -63,8 +75,8 @@ class Emulator {
   public:
 	// Decides whether to reload or not reload the ROM when resetting. We use enum class over a plain bool for clarity.
 	// If NoReload is selected, the emulator will not reload its selected ROM. This is useful for things like booting up the emulator, or resetting to
-	// change ROMs. If Reload is selected, the emulator will reload its selected ROM. This is useful for eg a "reset" button that keeps the current ROM
-	// and just resets the emu
+	// change ROMs. If Reload is selected, the emulator will reload its selected ROM. This is useful for eg a "reset" button that keeps the current
+	// ROM and just resets the emu
 	enum class ReloadOption { NoReload, Reload };
 
 	Emulator();
diff --git a/include/fs/archive_base.hpp b/include/fs/archive_base.hpp
index e1b4caa0..0b0f65a1 100644
--- a/include/fs/archive_base.hpp
+++ b/include/fs/archive_base.hpp
@@ -116,15 +116,34 @@ struct ArchiveSession {
     ArchiveSession(ArchiveBase* archive, const FSPath& filePath, bool isOpen = true) : archive(archive), path(filePath), isOpen(isOpen) {}
 };
 
-struct DirectorySession {
-    ArchiveBase* archive = nullptr;
-    // For directories which are mirrored to a specific path on the disk, this contains that path
-    // Otherwise this is a nullopt
-    std::optional<std::filesystem::path> pathOnDisk;
-    bool isOpen;
+struct DirectoryEntry {  // One cached entry of a directory listing (filled in by DirectorySession's constructor)
+	std::filesystem::path path;  // Path reported by the directory iterator for this entry
+	bool isDirectory;            // Result of std::filesystem::is_directory for this entry
+};
 
-    DirectorySession(ArchiveBase* archive, std::filesystem::path path, bool isOpen = true) : archive(archive), pathOnDisk(path),
-        isOpen(isOpen) {}
+struct DirectorySession {  // State of an open directory: owning archive, mirrored disk path and a cached entry listing
+	ArchiveBase* archive = nullptr;
+	// For directories which are mirrored to a specific path on the disk, this contains that path
+	// Otherwise this is a nullopt (the constructor below always sets it)
+	std::optional<std::filesystem::path> pathOnDisk;
+
+	// The list of directory entries + the index of the entry we're currently inspecting
+	std::vector<DirectoryEntry> entries;
+	size_t currentEntry;
+
+	bool isOpen;
+
+	DirectorySession(ArchiveBase* archive, std::filesystem::path path, bool isOpen = true) : archive(archive), pathOnDisk(path), isOpen(isOpen) {
+		currentEntry = 0;  // Start from entry 0
+
+		// Read all direct (non-recursive) directory entries once and cache them. NOTE(review): this iterator overload throws on failure - confirm callers validate path
+		for (auto& e : std::filesystem::directory_iterator(path)) {
+			DirectoryEntry entry;
+			entry.path = e.path();
+			entry.isDirectory = std::filesystem::is_directory(e);
+			entries.push_back(entry);
+		}
+	}
 };
 
 // Represents a file descriptor obtained from OpenFile. If the optional is nullopt, opening the file failed.
diff --git a/include/renderer.hpp b/include/renderer.hpp
new file mode 100644
index 00000000..e14afcea
--- /dev/null
+++ b/include/renderer.hpp
@@ -0,0 +1,66 @@
+#pragma once
+#include <array>
+#include <span>
+#include <optional>
+
+#include "PICA/pica_vertex.hpp"
+#include "PICA/regs.hpp"
+#include "helpers.hpp"
+
+enum class RendererType : s8 {  // Rendering backend selector, stored in EmulatorConfig and converted via Renderer::typeFromString/typeToString
+	// TODO: Auto = -1,
+	Null = 0,
+	OpenGL = 1,
+	Vulkan = 2,
+};
+
+class GPU;
+
+class Renderer {  // Abstract base class for rendering backends: shared framebuffer state plus the pure-virtual drawing interface
+  protected:
+	GPU& gpu;
+	static constexpr u32 regNum = 0x300;  // Number of internal PICA registers
+	const std::array<u32, regNum>& regs;  // Read-only view of the PICA registers. NOTE(review): presumably bound to internalRegs by the constructor
+
+	std::array<u32, 2> fbSize;  // The size of the framebuffer (ie both the colour and depth buffer), stored as {width, height}
+
+	u32 colourBufferLoc;                // Location in 3DS VRAM for the colour buffer
+	PICA::ColorFmt colourBufferFormat;  // Format of the colours stored in the colour buffer
+
+	// Same for the depth/stencil buffer
+	u32 depthBufferLoc;
+	PICA::DepthFmt depthBufferFormat;
+
+  public:
+	Renderer(GPU& gpu, const std::array<u32, regNum>& internalRegs);
+	virtual ~Renderer();
+
+	static constexpr u32 vertexBufferSize = 0x10000;
+	static std::optional<RendererType> typeFromString(std::string inString);  // Returns an empty optional when inString names no known renderer
+	static const char* typeToString(RendererType rendererType);               // Name of the given renderer type as a C string
+
+	virtual void reset() = 0;
+	virtual void display() = 0;                                                              // Display the 3DS screen contents to the window
+	virtual void initGraphicsContext() = 0;                                                  // Initialize graphics context
+	virtual void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) = 0;  // Clear a GPU buffer in VRAM
+	virtual void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) = 0;  // Perform display transfer
+	virtual void drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) = 0;             // Draw the given vertices
+
+	virtual void screenshot(const std::string& name) = 0;  // Take a screenshot of the screen and store it in a file
+
+	void setFBSize(u32 width, u32 height) {
+		fbSize[0] = width;
+		fbSize[1] = height;
+	}
+
+	void setColourFormat(PICA::ColorFmt format) { colourBufferFormat = format; }
+	void setDepthFormat(PICA::DepthFmt format) {
+		if (format == PICA::DepthFmt::Unknown1) {  // Reject the undocumented mode before storing the format
+			Helpers::panic("[PICA] Undocumented depth-stencil mode!");
+		}
+		depthBufferFormat = format;
+	}
+
+	void setColourBufferLoc(u32 loc) { colourBufferLoc = loc; }
+	void setDepthBufferLoc(u32 loc) { depthBufferLoc = loc; }
+};
\ No newline at end of file
diff --git a/include/gl_state.hpp b/include/renderer_gl/gl_state.hpp
similarity index 100%
rename from include/gl_state.hpp
rename to include/renderer_gl/gl_state.hpp
diff --git a/include/opengl.hpp b/include/renderer_gl/opengl.hpp
similarity index 100%
rename from include/opengl.hpp
rename to include/renderer_gl/opengl.hpp
diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp
index 07f8a63c..15d12ade 100644
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@@ -1,23 +1,23 @@
 #pragma once
+
 #include <array>
 #include <span>
-#include <stb_image_write.h>
 
 #include "PICA/float_types.hpp"
+#include "PICA/pica_vertex.hpp"
+#include "PICA/regs.hpp"
 #include "gl_state.hpp"
 #include "helpers.hpp"
 #include "logger.hpp"
+#include "renderer.hpp"
 #include "surface_cache.hpp"
 #include "textures.hpp"
-#include "PICA/regs.hpp"
-#include "PICA/pica_vertex.hpp"
 
 // More circular dependencies!
 class GPU;
 
-class Renderer {
-	GPU& gpu;
-	GLStateManager& gl;
+class RendererGL final : public Renderer {
+	GLStateManager gl = {};
 
 	OpenGL::Program triangleProgram;
 	OpenGL::Program displayProgram;
@@ -31,7 +31,7 @@ class Renderer {
 	GLint textureEnvCombinerLoc = -1;
 	GLint textureEnvColorLoc = -1;
 	GLint textureEnvScaleLoc = -1;
-	
+
 	// Uniform of PICA registers
 	GLint picaRegLoc = -1;
 
@@ -48,22 +48,10 @@ class Renderer {
 	SurfaceCache<ColourBuffer, 10, true> colourBufferCache;
 	SurfaceCache<Texture, 256, true> textureCache;
 
-	OpenGL::uvec2 fbSize;  // The size of the framebuffer (ie both the colour and depth buffer)'
-
-	u32 colourBufferLoc;                 // Location in 3DS VRAM for the colour buffer
-	PICA::ColorFmt colourBufferFormat;  // Format of the colours stored in the colour buffer
-
-	// Same for the depth/stencil buffer
-	u32 depthBufferLoc;
-	PICA::DepthFmt depthBufferFormat;
-
 	// Dummy VAO/VBO for blitting the final output
 	OpenGL::VertexArray dummyVAO;
 	OpenGL::VertexBuffer dummyVBO;
 
-	static constexpr u32 regNum = 0x300;  // Number of internal PICA registers
-	const std::array<u32, regNum>& regs;
-
 	OpenGL::Texture screenTexture;
 	GLuint lightLUTTextureArray;
 	OpenGL::Framebuffer screenFramebuffer;
@@ -79,34 +67,16 @@ class Renderer {
 	void updateLightingLUT();
 
   public:
-	Renderer(GPU& gpu, GLStateManager& gl, const std::array<u32, regNum>& internalRegs) : gpu(gpu), gl(gl), regs(internalRegs) {}
+	RendererGL(GPU& gpu, const std::array<u32, regNum>& internalRegs) : Renderer(gpu, internalRegs) {}
+	~RendererGL() override;
 
-	void reset();
-	void display();                                                                                 // Display the 3DS screen contents to the window
-	void initGraphicsContext();                                                                     // Initialize graphics context
-	void getGraphicsContext();                                                                      // Set up graphics context for rendering
-	void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control);                     // Clear a GPU buffer in VRAM
-	void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags);  // Perform display transfer
-	void drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices);             // Draw the given vertices
+	void reset() override;
+	void display() override;                                                              // Display the 3DS screen contents to the window
+	void initGraphicsContext() override;                                                  // Initialize graphics context
+	void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) override;  // Clear a GPU buffer in VRAM
+	void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) override;  // Perform display transfer
+	void drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) override;             // Draw the given vertices
 
 	// Take a screenshot of the screen and store it in a file
-	void screenshot(const std::string& name);
-
-	void setFBSize(u32 width, u32 height) {
-		fbSize.x() = width;
-		fbSize.y() = height;
-	}
-
-	void setColourFormat(PICA::ColorFmt format) { colourBufferFormat = format; }
-	void setDepthFormat(PICA::DepthFmt format) {
-		if (format == PICA::DepthFmt::Unknown1) {
-			Helpers::panic("[PICA] Undocumented depth-stencil mode!");
-		}
-		depthBufferFormat = format;
-	}
-
-	void setColourBufferLoc(u32 loc) { colourBufferLoc = loc; }
-	void setDepthBufferLoc(u32 loc) { depthBufferLoc = loc; }
-
-	static constexpr u32 vertexBufferSize = 0x10000;
+	void screenshot(const std::string& name) override;
 };
\ No newline at end of file
diff --git a/include/renderer_gl/textures.hpp b/include/renderer_gl/textures.hpp
index 5469a59f..a2b6c09d 100644
--- a/include/renderer_gl/textures.hpp
+++ b/include/renderer_gl/textures.hpp
@@ -40,11 +40,11 @@ struct Texture {
 
     void allocate();
     void setNewConfig(u32 newConfig);
-    void decodeTexture(const void* data);
+    void decodeTexture(std::span<const u8> data);
     void free();
     u64 sizeInBytes();
 
-    u32 decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data);
+    u32 decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, std::span<const u8> data);
 
     // Get the morton interleave offset of a texel based on its U and V values
     static u32 mortonInterleave(u32 u, u32 v);
@@ -59,6 +59,6 @@ struct Texture {
 
     // Returns the texel at coordinates (u, v) of an ETC1(A4) texture
     // TODO: Make hasAlpha a template parameter
-    u32 getTexelETC(bool hasAlpha, u32 u, u32 v, u32 width, const void* data);
+    u32 getTexelETC(bool hasAlpha, u32 u, u32 v, u32 width, std::span<const u8> data);
     u32 decodeETC(u32 alpha, u32 u, u32 v, u64 colourData);
-};
\ No newline at end of file
+};
diff --git a/include/renderer_null/renderer_null.hpp b/include/renderer_null/renderer_null.hpp
new file mode 100644
index 00000000..29080786
--- /dev/null
+++ b/include/renderer_null/renderer_null.hpp
@@ -0,0 +1,18 @@
+#pragma once
+#include "renderer.hpp"
+
+class GPU;
+
+class RendererNull final : public Renderer {  // Renderer backend overriding the whole Renderer interface. NOTE(review): presumably the no-op backend for RendererType::Null
+  public:
+	RendererNull(GPU& gpu, const std::array<u32, regNum>& internalRegs);
+	~RendererNull() override;
+
+	void reset() override;
+	void display() override;
+	void initGraphicsContext() override;
+	void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) override;
+	void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) override;
+	void drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) override;
+	void screenshot(const std::string& name) override;
+};
\ No newline at end of file
diff --git a/include/services/hid.hpp b/include/services/hid.hpp
index 6a3aab95..23a36ec6 100644
--- a/include/services/hid.hpp
+++ b/include/services/hid.hpp
@@ -91,6 +91,7 @@ class HIDService {
 	void pressKey(u32 mask) { newButtons |= mask; }
 	void releaseKey(u32 mask) { newButtons &= ~mask; }
 
+	u32 getOldButtons() { return oldButtons; }
 	s16 getCirclepadX() { return circlePadX; }
 	s16 getCirclepadY() { return circlePadY; }
 
diff --git a/include/services/service_manager.hpp b/include/services/service_manager.hpp
index 1d93641c..51d6d554 100644
--- a/include/services/service_manager.hpp
+++ b/include/services/service_manager.hpp
@@ -90,17 +90,5 @@ class ServiceManager {
 	void signalDSPEvents() { dsp.signalEvents(); }
 
 	// Input function wrappers
-	void pressKey(u32 key) { hid.pressKey(key); }
-	void releaseKey(u32 key) { hid.releaseKey(key); }
-	s16 getCirclepadX() { return hid.getCirclepadX(); }
-	s16 getCirclepadY() { return hid.getCirclepadY(); }
-	void setCirclepadX(s16 x) { hid.setCirclepadX(x); }
-	void setCirclepadY(s16 y) { hid.setCirclepadY(y); }
-	void updateInputs(u64 currentTimestamp) { hid.updateInputs(currentTimestamp); }
-	void setTouchScreenPress(u16 x, u16 y) { hid.setTouchScreenPress(x, y); }
-	void releaseTouchScreen() { hid.releaseTouchScreen(); }
-
-	void setRoll(s16 roll) { hid.setRoll(roll); }
-	void setPitch(s16 pitch) { hid.setPitch(pitch); }
-	void setYaw(s16 yaw) { hid.setYaw(yaw); }
+	HIDService& getHID() { return hid; }
 };
diff --git a/readme.md b/readme.md
index 9c98178d..854267b6 100644
--- a/readme.md
+++ b/readme.md
@@ -83,6 +83,7 @@ Panda3DS also supports controller input using the SDL2 GameController API.
 - [Corgi3DS](https://github.com/PSI-Rockin/Corgi3DS), an LLE 3DS emulator which both served as an inspiration, as well as a nice source of documentation for some PICA200-related things
 
 # Sister Projects
+- [Dynarmic](https://github.com/merryhime/dynarmic): An arm32/arm64 to x86-64/ARMv8 recompiler
 - [PCSX-Redux](https://github.com/grumpycoders/pcsx-redux): A PlayStation 1 emulator targetting developers, reverse engineers and regular PS1 fans alike
 - [SkyEmu](https://github.com/skylersaleh/SkyEmu): A seagull-themed low-level GameBoy, GameBoy Color, GameBoy Advance and Nintendo DS emulator that is designed to be easy to use, cross platform and accurate.
 - [NanoBoyAdvance](https://github.com/nba-emu/NanoBoyAdvance): A Game Boy Advance emulator focusing on hardware research and cycle-accurate emulation
diff --git a/src/config.cpp b/src/config.cpp
index 6c9a8450..a5e9330c 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -1,6 +1,7 @@
 #include "config.hpp"
 
 #include <fstream>
+#include <string>
 
 #include "helpers.hpp"
 #include "toml.hpp"
@@ -9,6 +10,8 @@
 // We are legally allowed, as per the author's wish, to use the above code without any licensing restrictions
 // However we still want to follow the license as closely as possible and offer the proper attributions.
 
+EmulatorConfig::EmulatorConfig(const std::filesystem::path& path) { load(path); }
+
 void EmulatorConfig::load(const std::filesystem::path& path) {
 	// If the configuration file does not exist, create it and return
 	std::error_code error;
@@ -31,6 +34,17 @@ void EmulatorConfig::load(const std::filesystem::path& path) {
 		if (gpuResult.is_ok()) {
 			auto gpu = gpuResult.unwrap();
 
+			// Get renderer
+			auto rendererName = toml::find_or<std::string>(gpu, "Renderer", "OpenGL");
+			auto configRendererType = Renderer::typeFromString(rendererName);
+
+			if (configRendererType.has_value()) {
+				rendererType = configRendererType.value();
+			} else {
+				Helpers::warn("Invalid renderer specified: %s\n", rendererName.c_str());
+				rendererType = RendererType::OpenGL;
+			}
+
 			shaderJitEnabled = toml::find_or<toml::boolean>(gpu, "EnableShaderJIT", false);
 		}
 	}
@@ -43,7 +57,7 @@ void EmulatorConfig::save(const std::filesystem::path& path) {
 	if (std::filesystem::exists(path, error)) {
 		try {
 			data = toml::parse<toml::preserve_comments>(path);
-		} catch (std::exception& ex) {
+		} catch (const std::exception& ex) {
 			Helpers::warn("Exception trying to parse config file. Exception: %s\n", ex.what());
 			return;
 		}
@@ -55,6 +69,7 @@ void EmulatorConfig::save(const std::filesystem::path& path) {
 	}
 
 	data["GPU"]["EnableShaderJIT"] = shaderJitEnabled;
+	data["GPU"]["Renderer"] = std::string(Renderer::typeToString(rendererType));
 
 	std::ofstream file(path, std::ios::out);
 	file << data;
diff --git a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
index 06247950..13eb630e 100644
--- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
+++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
@@ -61,11 +61,14 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) {
 
 	// Tail call to shader code entrypoint
 	jmp(arg2);
-	align(16);
-	// Scan the shader code for call instructions and add them to the list of possible return PCs. We need to do this because the PICA callstack works
-	// Pretty weirdly
-	scanForCalls(shaderUnit);
 
+	// Scan the code for call, exp2, log2, etc instructions which need some special care
+	// After that, emit exp2 and log2 functions if the corresponding instructions are present
+	scanCode(shaderUnit);
+	if (codeHasExp2) exp2Func = emitExp2Func();
+	if (codeHasLog2) log2Func = emitLog2Func();
+
+	align(16);
 	// Compile every instruction in the shader
 	// This sounds horrible but the PICA instruction memory is tiny, and most of the time it's padded wtih nops that compile to nothing
 	recompilerPC = 0;
@@ -73,17 +76,23 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) {
 	compileUntil(shaderUnit, PICAShader::maxInstructionCount);
 }
 
-void ShaderEmitter::scanForCalls(const PICAShader& shaderUnit) {
+void ShaderEmitter::scanCode(const PICAShader& shaderUnit) {
 	returnPCs.clear();
 
 	for (u32 i = 0; i < PICAShader::maxInstructionCount; i++) {
 		const u32 instruction = shaderUnit.loadedShader[i];
+		const u32 opcode = instruction >> 26;
+
 		if (isCall(instruction)) {
 			const u32 num = instruction & 0xff;
 			const u32 dest = getBits<10, 12>(instruction);
 			const u32 returnPC = num + dest; // Add them to get the return PC
 
 			returnPCs.push_back(returnPC);
+		} else if (opcode == ShaderOpcodes::EX2) {
+			codeHasExp2 = true;
+		} else if (opcode == ShaderOpcodes::LG2) {
+			codeHasLog2 = true;
 		}
 	}
 
@@ -877,7 +886,6 @@ void ShaderEmitter::recLOOP(const PICAShader& shader, u32 instruction) {
 	loopLevel--;
 }
 
-// SSE does not have a log2 instruction so we temporarily emulate this using x87 FPU
 void ShaderEmitter::recLG2(const PICAShader& shader, u32 instruction) {
 	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
 	const u32 src = getBits<12, 7>(instruction);
@@ -885,30 +893,16 @@ void ShaderEmitter::recLG2(const PICAShader& shader, u32 instruction) {
 	const u32 dest = getBits<21, 5>(instruction);
 	const u32 writeMask = getBits<0, 4>(operandDescriptor);
 
-	// Load swizzled source, push 1.0 to the x87 stack
 	loadRegister<1>(src1_xmm, shader, src, idx, operandDescriptor);
-	fld1();
-
-	// Push source to the x87 stack
-	movd(eax, src1_xmm);
-	push(rax);
-	fld(dword[rsp]);
-
-	// Perform log2, load result to src1_xmm, write it back and undo the previous push rax
-	fyl2x();
-	fstp(dword[rsp]);
-	movss(src1_xmm, dword[rsp]);
-	add(rsp, 8);
-
-	// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
-	// Otherwise we do
+	call(log2Func); // Result is output in src1_xmm
+	
 	if (writeMask != 0x8) {             // Copy bottom lane to all lanes if we're not simply writing back x
 		shufps(src1_xmm, src1_xmm, 0);  // src1_xmm = src1_xmm.xxxx
 	}
+
 	storeRegister(src1_xmm, shader, dest, operandDescriptor);
 }
 
-// SSE does not have an exp2 instruction so we temporarily emulate this using x87 FPU
 void ShaderEmitter::recEX2(const PICAShader& shader, u32 instruction) {
 	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
 	const u32 src = getBits<12, 7>(instruction);
@@ -917,31 +911,12 @@ void ShaderEmitter::recEX2(const PICAShader& shader, u32 instruction) {
 	const u32 writeMask = getBits<0, 4>(operandDescriptor);
 
 	loadRegister<1>(src1_xmm, shader, src, idx, operandDescriptor);
+	call(exp2Func);  // Result is output in src1_xmm
 
-	// Push source to the x87 stack, then do some insane compiler-generated x87 math
-	movd(eax, src1_xmm);
-	push(rax);
-	fld(dword[rsp]);
-
-	fld(st0);
-	frndint();
-	fsub(st1, st0);
-	fxch(st1);
-	f2xm1();
-	fadd(dword[rip + onesVector]);
-	fscale();
-
-	// Load result to src1_xmm, write it back and undo the previous push rax
-	fstp(st1);
-	fstp(dword[rsp]);
-	movss(src1_xmm, dword[rsp]);
-	add(rsp, 8);
-
-	// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
-	// Otherwise we do
 	if (writeMask != 0x8) {             // Copy bottom lane to all lanes if we're not simply writing back x
 		shufps(src1_xmm, src1_xmm, 0);  // src1_xmm = src1_xmm.xxxx
 	}
+
 	storeRegister(src1_xmm, shader, dest, operandDescriptor);
 }
 
@@ -962,6 +937,228 @@ void ShaderEmitter::printLog(const PICAShader& shaderUnit) {
 	printf("cmp: (%d, %d)\n", shaderUnit.cmpRegister[0], shaderUnit.cmpRegister[1]);
 }
 
+// For EXP2/LOG2, we have permission to adjust and relicense the SSE implementation from Citra for this project from the original authors
+// So we do it since EXP2/LOG2 are pretty terrible to implement.
+// ABI: Input is in the bottom bits of src1_xmm, same for output. If the result needs swizzling, the caller must handle it
+// Assume src1, src2, scratch1, scratch2, eax and edx are all clobbered
+
+Xbyak::Label ShaderEmitter::emitLog2Func() {
+	Xbyak::Label subroutine;
+
+	// This code uses the fact that log2(float) = log2(2^exponent * mantissa)
+	// = log2(2^exponent) + log2(mantissa) = exponent + log2(mantissa) where mantissa has a limited range of values
+	// https://stackoverflow.com/a/45787548
+
+	// SSE does not have a log instruction, thus we must approximate.
+	// We perform this approximation by first performing a range reduction into the range [1.0, 2.0).
+	// A minimax polynomial which was fit for the function log2(x) / (x - 1) is then evaluated.
+	// We multiply the result by (x - 1) then restore the result into the appropriate range.
+
+	// Coefficients for the minimax polynomial.
+	// f(x) computes approximately log2(x) / (x - 1).
+	// f(x) = c4 + x * (c3 + x * (c2 + x * (c1 + x * c0))).
+	// We align the table of coefficients to 64 bytes, so that the whole thing will fit in 1 cache line
+	align(64);
+	const void* c0 = getCurr();
+	dd(0x3d74552f);
+	const void* c1 = getCurr();
+	dd(0xbeee7397);
+	const void* c2 = getCurr();
+	dd(0x3fbd96dd);
+	const void* c3 = getCurr();
+	dd(0xc02153f6);
+	const void* c4 = getCurr();
+	dd(0x4038d96c);
+
+	align(16);
+	const void* negative_infinity_vector = getCurr();
+	dd(0xff800000);
+	dd(0xff800000);
+	dd(0xff800000);
+	dd(0xff800000);
+	const void* default_qnan_vector = getCurr();
+	dd(0x7fc00000);
+	dd(0x7fc00000);
+	dd(0x7fc00000);
+	dd(0x7fc00000);
+
+	Xbyak::Label inputIsNan, inputIsZero, inputOutOfRange;
+
+	align(16);
+	L(inputOutOfRange);
+	je(inputIsZero);
+	movaps(src1_xmm, xword[rip + default_qnan_vector]);
+	ret();
+	L(inputIsZero);
+	movaps(src1_xmm, xword[rip + negative_infinity_vector]);
+	ret();
+
+	align(16);
+	L(subroutine);
+
+	// Here we handle edge cases: input in {NaN, 0, -Inf, Negative}.
+	xorps(scratch1, scratch1);
+	ucomiss(scratch1, src1_xmm);
+	jp(inputIsNan);
+	jae(inputOutOfRange);
+
+	// Split input: SRC1=MANT[1,2) SCRATCH2=Exponent
+	if (cpuCaps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) {
+		vgetexpss(scratch2, src1_xmm, src1_xmm);
+		vgetmantss(src1_xmm, src1_xmm, src1_xmm, 0);
+	} else {
+		movd(eax, src1_xmm);
+		mov(edx, eax);
+		and_(eax, 0x7f800000);
+		and_(edx, 0x007fffff);
+		or_(edx, 0x3f800000);
+		movd(src1_xmm, edx);
+		// SRC1 now contains the mantissa of the input.
+		shr(eax, 23);
+		sub(eax, 0x7f);
+		cvtsi2ss(scratch2, eax);
+		// scratch2 now contains the exponent of the input.
+	}
+
+	movss(scratch1, xword[rip + c0]);
+
+	// Complete computation of polynomial
+	if (haveFMA3) {
+		vfmadd213ss(scratch1, src1_xmm, xword[rip + c1]);
+		vfmadd213ss(scratch1, src1_xmm, xword[rip + c2]);
+		vfmadd213ss(scratch1, src1_xmm, xword[rip + c3]);
+		vfmadd213ss(scratch1, src1_xmm, xword[rip + c4]);
+		subss(src1_xmm, dword[rip + onesVector]);
+		vfmadd231ss(scratch2, scratch1, src1_xmm);
+	} else {
+		mulss(scratch1, src1_xmm);
+		addss(scratch1, xword[rip + c1]);
+		mulss(scratch1, src1_xmm);
+		addss(scratch1, xword[rip + c2]);
+		mulss(scratch1, src1_xmm);
+		addss(scratch1, xword[rip + c3]);
+		mulss(scratch1, src1_xmm);
+		subss(src1_xmm, dword[rip + onesVector]);
+		addss(scratch1, xword[rip + c4]);
+		mulss(scratch1, src1_xmm);
+		addss(scratch2, scratch1);
+	}
+
+	xorps(src1_xmm, src1_xmm);  // break dependency chain
+	movss(src1_xmm, scratch2);
+	L(inputIsNan);
+
+	ret();
+	return subroutine;
+}
+
+Xbyak::Label ShaderEmitter::emitExp2Func() {
+	Xbyak::Label subroutine;
+
+	// SSE does not have an exp instruction, thus we must approximate.
+	// We perform this approximation by first performing a range reduction into the range [-0.5, 0.5).
+	// A minimax polynomial which was fit for the function exp2(x) is then evaluated.
+	// We then restore the result into the appropriate range.
+
+	// Similarly to log2, we align our literal pool to 64 bytes to make sure the whole thing fits in 1 cache line
+	align(64);
+	const void* input_max = getCurr();
+	dd(0x43010000);
+	const void* input_min = getCurr();
+	dd(0xc2fdffff);
+	const void* c0 = getCurr();
+	dd(0x3c5dbe69);
+	const void* half = getCurr();
+	dd(0x3f000000);
+	const void* c1 = getCurr();
+	dd(0x3d5509f9);
+	const void* c2 = getCurr();
+	dd(0x3e773cc5);
+	const void* c3 = getCurr();
+	dd(0x3f3168b3);
+	const void* c4 = getCurr();
+	dd(0x3f800016);
+
+	Xbyak::Label retLabel;
+
+	align(16);
+	L(subroutine);
+
+	// Handle edge cases
+	ucomiss(src1_xmm, src1_xmm);
+	jp(retLabel);
+
+	// Decompose input:
+	// SCRATCH=2^round(input)
+	// SRC1=input-round(input) [-0.5, 0.5)
+	if (cpuCaps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) {
+		// Cheat a bit and store ones in src2 since the register is unused
+		vmovaps(src2_xmm, xword[rip + onesVector]);
+		// input - 0.5
+		vsubss(scratch1, src1_xmm, xword[rip + half]);
+
+		// trunc(input - 0.5)
+		vrndscaless(scratch2, scratch1, scratch1, _MM_FROUND_TRUNC);
+
+		// SCRATCH = 1 * 2^(trunc(input - 0.5))
+		vscalefss(scratch1, src2_xmm, scratch2);
+
+		// SRC1 = input-trunc(input - 0.5)
+		vsubss(src1_xmm, src1_xmm, scratch2);
+	} else {
+		// Clamp to maximum range since we shift the value directly into the exponent.
+		minss(src1_xmm, xword[rip + input_max]);
+		maxss(src1_xmm, xword[rip + input_min]);
+
+		if (cpuCaps.has(Cpu::tAVX)) {
+			vsubss(scratch1, src1_xmm, xword[rip + half]);
+		} else {
+			movss(scratch1, src1_xmm);
+			subss(scratch1, xword[rip + half]);
+		}
+
+		if (cpuCaps.has(Cpu::tSSE41)) {
+			roundss(scratch1, scratch1, _MM_FROUND_TRUNC);
+			cvtss2si(eax, scratch1);
+		} else {
+			cvtss2si(eax, scratch1);
+			cvtsi2ss(scratch1, eax);
+		}
+		// SCRATCH now contains input rounded to the nearest integer.
+		add(eax, 0x7f);
+		subss(src1_xmm, scratch1);
+		// SRC1 contains input - round(input), which is in [-0.5, 0.5).
+		shl(eax, 23);
+		movd(scratch1, eax);
+		// SCRATCH contains 2^(round(input)).
+	}
+
+	// Complete computation of polynomial.
+	movss(scratch2, xword[rip + c0]);
+
+	if (haveFMA3) {
+		vfmadd213ss(scratch2, src1_xmm, xword[rip + c1]);
+		vfmadd213ss(scratch2, src1_xmm, xword[rip + c2]);
+		vfmadd213ss(scratch2, src1_xmm, xword[rip + c3]);
+		vfmadd213ss(src1_xmm, scratch2, xword[rip + c4]);
+	} else {
+		mulss(scratch2, src1_xmm);
+		addss(scratch2, xword[rip + c1]);
+		mulss(scratch2, src1_xmm);
+		addss(scratch2, xword[rip + c2]);
+		mulss(scratch2, src1_xmm);
+		addss(scratch2, xword[rip + c3]);
+		mulss(src1_xmm, scratch2);
+		addss(src1_xmm, xword[rip + c4]);
+	}
+
+	mulss(src1_xmm, scratch1);
+	L(retLabel);
+
+	ret();
+	return subroutine;
+}
+
 // As we mentioned above, this function is uber slow because we don't expect the shader JIT to call HLL functions in real scenarios
 // Aside from debugging code. So we don't care for this function to be performant or anything of the like.  It is quick and dirty
 // And mostly meant to be used for generating logs to diff the JIT and interpreter
diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp
index 37b67a50..d75b0ae5 100644
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@@ -2,19 +2,45 @@
 
 #include <array>
 #include <bitset>
-#include <cstdio>
 #include <cstddef>
+#include <cstdio>
 
 #include "PICA/float_types.hpp"
 #include "PICA/regs.hpp"
+#include "renderer_null/renderer_null.hpp"
+#ifdef PANDA3DS_ENABLE_OPENGL
+#include "renderer_gl/renderer_gl.hpp"
+#endif
 
 using namespace Floats;
 
 // Note: For when we have multiple backends, the GL state manager can stay here and have the constructor for the Vulkan-or-whatever renderer ignore it
 // Thus, our GLStateManager being here does not negatively impact renderer-agnosticness
-GPU::GPU(Memory& mem, GLStateManager& gl, EmulatorConfig& config) : mem(mem), renderer(*this, gl, regs), config(config) {
+GPU::GPU(Memory& mem, EmulatorConfig& config) : mem(mem), config(config) {
 	vram = new u8[vramSize];
-	mem.setVRAM(vram); // Give the bus a pointer to our VRAM
+	mem.setVRAM(vram);  // Give the bus a pointer to our VRAM
+
+	switch (config.rendererType) {
+		case RendererType::Null: {
+			renderer.reset(new RendererNull(*this, regs));
+			break;
+		}
+#ifdef PANDA3DS_ENABLE_OPENGL
+		case RendererType::OpenGL: {
+			renderer.reset(new RendererGL(*this, regs));
+			break;
+		}
+#endif
+
+		case RendererType::Vulkan: {
+			Helpers::panic("Vulkan is not supported yet, please pick another renderer");
+		}
+
+		default: {
+			Helpers::panic("Rendering backend not supported: %s", Renderer::typeToString(config.rendererType));
+			break;
+		}
+	}
 }
 
 void GPU::reset() {
@@ -41,7 +67,7 @@ void GPU::reset() {
 		e.config2 = 0;
 	}
 
-	renderer.reset();
+	renderer->reset();
 }
 
 // Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
@@ -73,15 +99,14 @@ void GPU::drawArrays() {
 	// Base address for vertex attributes
 	// The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible
 	const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16;
-	const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg]; // Total # of vertices to transfer
+	const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];  // Total # of vertices to transfer
 
 	// Configures the type of primitive and the number of vertex shader outputs
 	const u32 primConfig = regs[PICA::InternalRegs::PrimitiveConfig];
 	const PICA::PrimType primType = static_cast<PICA::PrimType>(Helpers::getBits<8, 2>(primConfig));
 	if (vertexCount > Renderer::vertexBufferSize) Helpers::panic("[PICA] vertexCount > vertexBufferSize");
 
-	if ((primType == PICA::PrimType::TriangleList && vertexCount % 3) ||
-		(primType == PICA::PrimType::TriangleStrip && vertexCount < 3) ||
+	if ((primType == PICA::PrimType::TriangleList && vertexCount % 3) || (primType == PICA::PrimType::TriangleStrip && vertexCount < 3) ||
 		(primType == PICA::PrimType::TriangleFan && vertexCount < 3)) {
 		Helpers::panic("Invalid vertex count for primitive. Type: %d, vert count: %d\n", primType, vertexCount);
 	}
@@ -89,10 +114,10 @@ void GPU::drawArrays() {
 	// Get the configuration for the index buffer, used only for indexed drawing
 	u32 indexBufferConfig = regs[PICA::InternalRegs::IndexBufferConfig];
 	u32 indexBufferPointer = vertexBase + (indexBufferConfig & 0xfffffff);
-	bool shortIndex = Helpers::getBit<31>(indexBufferConfig); // Indicates whether vert indices are 16-bit or 8-bit
+	bool shortIndex = Helpers::getBit<31>(indexBufferConfig);  // Indicates whether vert indices are 16-bit or 8-bit
 
 	// Stuff the global attribute config registers in one u64 to make attr parsing easier
-	// TODO: Cache this when the vertex attribute format registers are written to 
+	// TODO: Cache this when the vertex attribute format registers are written to
 	u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32);
 
 	if constexpr (!indexed) {
@@ -111,24 +136,24 @@ void GPU::drawArrays() {
 	constexpr size_t vertexCacheSize = 64;
 
 	struct {
-		std::bitset<vertexCacheSize> validBits{0};           // Shows which tags are valid. If the corresponding bit is 1, then there's an entry
-		std::array<u32, vertexCacheSize> ids;                // IDs (ie indices of the cached vertices in the 3DS vertex buffer)
-		std::array<u32, vertexCacheSize> bufferPositions;    // Positions of the cached vertices in our own vertex buffer
+		std::bitset<vertexCacheSize> validBits{0};         // Shows which tags are valid. If the corresponding bit is 1, then there's an entry
+		std::array<u32, vertexCacheSize> ids;              // IDs (ie indices of the cached vertices in the 3DS vertex buffer)
+		std::array<u32, vertexCacheSize> bufferPositions;  // Positions of the cached vertices in our own vertex buffer
 	} vertexCache;
-		
+
 	for (u32 i = 0; i < vertexCount; i++) {
-		u32 vertexIndex; // Index of the vertex in the VBO for indexed rendering
+		u32 vertexIndex;  // Index of the vertex in the VBO for indexed rendering
 
 		if constexpr (!indexed) {
 			vertexIndex = i + regs[PICA::InternalRegs::VertexOffsetReg];
 		} else {
 			if (shortIndex) {
 				auto ptr = getPointerPhys<u16>(indexBufferPointer);
-				vertexIndex = *ptr; // TODO: This is very unsafe
+				vertexIndex = *ptr;  // TODO: This is very unsafe
 				indexBufferPointer += 2;
 			} else {
 				auto ptr = getPointerPhys<u8>(indexBufferPointer);
-				vertexIndex = *ptr; // TODO: This is also very unsafe
+				vertexIndex = *ptr;  // TODO: This is also very unsafe
 				indexBufferPointer += 1;
 			}
 		}
@@ -152,22 +177,22 @@ void GPU::drawArrays() {
 		}
 
 		int attrCount = 0;
-		int buffer = 0; // Vertex buffer index for non-fixed attributes
+		int buffer = 0;  // Vertex buffer index for non-fixed attributes
 
 		while (attrCount < totalAttribCount) {
 			// Check if attribute is fixed or not
-			if (fixedAttribMask & (1 << attrCount)) { // Fixed attribute
-				vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount]; // TODO: Is this how it works?
+			if (fixedAttribMask & (1 << attrCount)) {                         // Fixed attribute
+				vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount];  // TODO: Is this how it works?
 				vec4f& inputAttr = currentAttributes[attrCount];
-				std::memcpy(&inputAttr, &fixedAttr, sizeof(vec4f)); // Copy fixed attr to input attr
+				std::memcpy(&inputAttr, &fixedAttr, sizeof(vec4f));  // Copy fixed attr to input attr
 				attrCount++;
-			} else { // Non-fixed attribute
-				auto& attr = attributeInfo[buffer]; // Get information for this attribute
-				u64 attrCfg = attr.getConfigFull(); // Get config1 | (config2 << 32)
+			} else {                                 // Non-fixed attribute
+				auto& attr = attributeInfo[buffer];  // Get information for this attribute
+				u64 attrCfg = attr.getConfigFull();  // Get config1 | (config2 << 32)
 				u32 attrAddress = vertexBase + attr.offset + (vertexIndex * attr.size);
 
 				for (int j = 0; j < attr.componentCount; j++) {
-					uint index = (attrCfg >> (j * 4)) & 0xf; // Get index of attribute in vertexCfg
+					uint index = (attrCfg >> (j * 4)) & 0xf;  // Get index of attribute in vertexCfg
 
 					// Vertex attributes used as padding
 					// 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively
@@ -179,15 +204,15 @@ void GPU::drawArrays() {
 					}
 
 					u32 attribInfo = (vertexCfg >> (index * 4)) & 0xf;
-					u32 attribType = attribInfo & 0x3; //  Type of attribute(sbyte/ubyte/short/float)
-					u32 size = (attribInfo >> 2) + 1; // Total number of components
+					u32 attribType = attribInfo & 0x3;  //  Type of attribute(sbyte/ubyte/short/float)
+					u32 size = (attribInfo >> 2) + 1;   // Total number of components
 
-					//printf("vertex_attribute_strides[%d] = %d\n", attrCount, attr.size);
+					// printf("vertex_attribute_strides[%d] = %d\n", attrCount, attr.size);
 					vec4f& attribute = currentAttributes[attrCount];
-					uint component; // Current component
+					uint component;  // Current component
 
 					switch (attribType) {
-						case 0: { // Signed byte
+						case 0: {  // Signed byte
 							s8* ptr = getPointerPhys<s8>(attrAddress);
 							for (component = 0; component < size; component++) {
 								float val = static_cast<float>(*ptr++);
@@ -197,7 +222,7 @@ void GPU::drawArrays() {
 							break;
 						}
 
-						case 1: { // Unsigned byte
+						case 1: {  // Unsigned byte
 							u8* ptr = getPointerPhys<u8>(attrAddress);
 							for (component = 0; component < size; component++) {
 								float val = static_cast<float>(*ptr++);
@@ -207,7 +232,7 @@ void GPU::drawArrays() {
 							break;
 						}
 
-						case 2: { // Short
+						case 2: {  // Short
 							s16* ptr = getPointerPhys<s16>(attrAddress);
 							for (component = 0; component < size; component++) {
 								float val = static_cast<float>(*ptr++);
@@ -217,7 +242,7 @@ void GPU::drawArrays() {
 							break;
 						}
 
-						case 3: { // Float
+						case 3: {  // Float
 							float* ptr = getPointerPhys<float>(attrAddress);
 							for (component = 0; component < size; component++) {
 								float val = *ptr++;
@@ -251,8 +276,8 @@ void GPU::drawArrays() {
 			const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
 			std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
 		}
-		
-        if constexpr (useShaderJIT) {
+
+		if constexpr (useShaderJIT) {
 			shaderJIT.run(shaderUnit.vs);
 		} else {
 			shaderUnit.vs.run();
@@ -264,14 +289,14 @@ void GPU::drawArrays() {
 		for (int i = 0; i < totalShaderOutputs; i++) {
 			const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i];
 
-			for (int j = 0; j < 4; j++) { // pls unroll
+			for (int j = 0; j < 4; j++) {  // pls unroll
 				const u32 mapping = (config >> (j * 8)) & 0x1F;
 				out.raw[mapping] = shaderUnit.vs.outputs[i][j];
 			}
 		}
 	}
 
-	renderer.drawVertices(primType, std::span(vertices).first(vertexCount));
+	renderer->drawVertices(primType, std::span(vertices).first(vertexCount));
 }
 
 PICA::Vertex GPU::getImmediateModeVertex() {
@@ -289,7 +314,9 @@ PICA::Vertex GPU::getImmediateModeVertex() {
 	std::memcpy(&v.s.colour, &shaderUnit.vs.outputs[1], sizeof(vec4f));
 	std::memcpy(&v.s.texcoord0, &shaderUnit.vs.outputs[2], 2 * sizeof(f24));
 
-	printf("(x, y, z, w) = (%f, %f, %f, %f)\n", (double)v.s.positions[0], (double)v.s.positions[1], (double)v.s.positions[2], (double)v.s.positions[3]);
+	printf(
+		"(x, y, z, w) = (%f, %f, %f, %f)\n", (double)v.s.positions[0], (double)v.s.positions[1], (double)v.s.positions[2], (double)v.s.positions[3]
+	);
 	printf("(r, g, b, a) = (%f, %f, %f, %f)\n", (double)v.s.colour[0], (double)v.s.colour[1], (double)v.s.colour[2], (double)v.s.colour[3]);
 	printf("(u, v      ) = (%f, %f)\n", (double)v.s.texcoord0[0], (double)v.s.texcoord0[1]);
 
diff --git a/src/core/PICA/regs.cpp b/src/core/PICA/regs.cpp
index f62040dd..d245f8af 100644
--- a/src/core/PICA/regs.cpp
+++ b/src/core/PICA/regs.cpp
@@ -1,11 +1,12 @@
-#include "PICA/gpu.hpp"
 #include "PICA/regs.hpp"
 
+#include "PICA/gpu.hpp"
+
 using namespace Floats;
 using namespace Helpers;
 
 u32 GPU::readReg(u32 address) {
-	if (address >= 0x1EF01000 && address < 0x1EF01C00) { // Internal registers
+	if (address >= 0x1EF01000 && address < 0x1EF01C00) {  // Internal registers
 		const u32 index = (address - 0x1EF01000) / sizeof(u32);
 		return readInternalReg(index);
 	} else {
@@ -15,7 +16,7 @@ u32 GPU::readReg(u32 address) {
 }
 
 void GPU::writeReg(u32 address, u32 value) {
-	if (address >= 0x1EF01000 && address < 0x1EF01C00) { // Internal registers
+	if (address >= 0x1EF01000 && address < 0x1EF01C00) {  // Internal registers
 		const u32 index = (address - 0x1EF01000) / sizeof(u32);
 		writeInternalReg(index, value, 0xffffffff);
 	} else {
@@ -59,7 +60,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 	}
 
 	u32 currentValue = regs[index];
-	u32 newValue = (currentValue & ~mask) | (value & mask); // Only overwrite the bits specified by "mask"
+	u32 newValue = (currentValue & ~mask) | (value & mask);  // Only overwrite the bits specified by "mask"
 	regs[index] = newValue;
 
 	// TODO: Figure out if things like the shader index use the unmasked value or the masked one
@@ -74,38 +75,38 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 			break;
 
 		case AttribFormatHigh:
-			totalAttribCount = (value >> 28) + 1; // Total number of vertex attributes
-			fixedAttribMask = getBits<16, 12>(value); // Determines which vertex attributes are fixed for all vertices
+			totalAttribCount = (value >> 28) + 1;      // Total number of vertex attributes
+			fixedAttribMask = getBits<16, 12>(value);  // Determines which vertex attributes are fixed for all vertices
 			break;
 
 		case ColourBufferLoc: {
 			u32 loc = (value & 0x0fffffff) << 3;
-			renderer.setColourBufferLoc(loc);
+			renderer->setColourBufferLoc(loc);
 			break;
 		};
 
 		case ColourBufferFormat: {
 			u32 format = getBits<16, 3>(value);
-			renderer.setColourFormat(static_cast<PICA::ColorFmt>(format));
+			renderer->setColourFormat(static_cast<PICA::ColorFmt>(format));
 			break;
 		}
 
 		case DepthBufferLoc: {
 			u32 loc = (value & 0x0fffffff) << 3;
-			renderer.setDepthBufferLoc(loc);
+			renderer->setDepthBufferLoc(loc);
 			break;
 		}
 
 		case DepthBufferFormat: {
 			u32 format = value & 0x3;
-			renderer.setDepthFormat(static_cast<PICA::DepthFmt>(format));
+			renderer->setDepthFormat(static_cast<PICA::DepthFmt>(format));
 			break;
 		}
 
 		case FramebufferSize: {
 			const u32 width = value & 0x7ff;
 			const u32 height = getBits<12, 10>(value) + 1;
-			renderer.setFBSize(width, height);
+			renderer->setFBSize(width, height);
 			break;
 		}
 
@@ -116,7 +117,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 		case LightingLUTData4:
 		case LightingLUTData5:
 		case LightingLUTData6:
-		case LightingLUTData7:{
+		case LightingLUTData7: {
 			const uint32_t index = regs[LightingLUTIndex];  // Get full LUT index register
 			const uint32_t lutID = getBits<8, 5>(index);    // Get which LUT we're actually writing to
 			uint32_t lutIndex = getBits<0, 8>(index);       // And get the index inside the LUT we're writing to
@@ -133,15 +134,22 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 			break;
 		}
 
-		case VertexFloatUniformIndex:
+		case VertexFloatUniformIndex: {
 			shaderUnit.vs.setFloatUniformIndex(value);
 			break;
+		}
 
-		case VertexFloatUniformData0: case VertexFloatUniformData1: case VertexFloatUniformData2:
-		case VertexFloatUniformData3: case VertexFloatUniformData4: case VertexFloatUniformData5:
-		case VertexFloatUniformData6: case VertexFloatUniformData7:
+		case VertexFloatUniformData0:
+		case VertexFloatUniformData1:
+		case VertexFloatUniformData2:
+		case VertexFloatUniformData3:
+		case VertexFloatUniformData4:
+		case VertexFloatUniformData5:
+		case VertexFloatUniformData6:
+		case VertexFloatUniformData7: {
 			shaderUnit.vs.uploadFloatUniform(value);
 			break;
+		}
 
 		case FixedAttribIndex:
 			fixedAttribCount = 0;
@@ -162,7 +170,9 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 			}
 			break;
 
-		case FixedAttribData0: case FixedAttribData1: case FixedAttribData2:
+		case FixedAttribData0:
+		case FixedAttribData1:
+		case FixedAttribData2:
 			fixedAttrBuff[fixedAttribCount++] = value;
 
 			if (fixedAttribCount == 3) {
@@ -170,15 +180,15 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 
 				vec4f attr;
 				// These are stored in the reverse order anyone would expect them to be in
-				attr.x() = f24::fromRaw(fixedAttrBuff[2] & 0xffffff);
-				attr.y() = f24::fromRaw(((fixedAttrBuff[1] & 0xffff) << 8) | (fixedAttrBuff[2] >> 24));
-				attr.z() = f24::fromRaw(((fixedAttrBuff[0] & 0xff) << 16) | (fixedAttrBuff[1] >> 16));
-				attr.w() = f24::fromRaw(fixedAttrBuff[0] >> 8);
+				attr[0] = f24::fromRaw(fixedAttrBuff[2] & 0xffffff);
+				attr[1] = f24::fromRaw(((fixedAttrBuff[1] & 0xffff) << 8) | (fixedAttrBuff[2] >> 24));
+				attr[2] = f24::fromRaw(((fixedAttrBuff[0] & 0xff) << 16) | (fixedAttrBuff[1] >> 16));
+				attr[3] = f24::fromRaw(fixedAttrBuff[0] >> 8);
 
 				// If the fixed attribute index is < 12, we're just writing to one of the fixed attributes
 				if (fixedAttribIndex < 12) [[likely]] {
 					shaderUnit.vs.fixedAttributes[fixedAttribIndex++] = attr;
-				} else if (fixedAttribIndex == 15) { // Otherwise if it's 15, we're submitting an immediate mode vertex
+				} else if (fixedAttribIndex == 15) {  // Otherwise if it's 15, we're submitting an immediate mode vertex
 					const uint totalAttrCount = (regs[PICA::InternalRegs::VertexShaderAttrNum] & 0xf) + 1;
 					if (totalAttrCount <= immediateModeAttrIndex) {
 						printf("Broken state in the immediate mode vertex submission pipeline. Failing silently\n");
@@ -199,13 +209,15 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 						// If we've reached 3 verts, issue a draw call
 						// Handle rendering depending on the primitive type
 						if (immediateModeVertIndex == 3) {
-							renderer.drawVertices(PICA::PrimType::TriangleList, immediateModeVertices);
+							renderer->drawVertices(PICA::PrimType::TriangleList, immediateModeVertices);
 
 							switch (primType) {
 								// Triangle or geometry primitive. Draw a triangle and discard all vertices
-								case 0: case 3:
+								case 0:
+								case 3: {
 									immediateModeVertIndex = 0;
 									break;
+								}
 
 								// Triangle strip. Draw triangle, discard first vertex and keep the last 2
 								case 1:
@@ -223,54 +235,72 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 							}
 						}
 					}
-				} else { // Writing to fixed attributes 13 and 14 probably does nothing, but we'll see
+				} else {  // Writing to fixed attributes 13 and 14 probably does nothing, but we'll see
 					log("Wrote to invalid fixed vertex attribute %d\n", fixedAttribIndex);
 				}
 			}
 
 			break;
 
-		case VertexShaderOpDescriptorIndex:
+		case VertexShaderOpDescriptorIndex: {
 			shaderUnit.vs.setOpDescriptorIndex(value);
 			break;
+		}
 
-		case VertexShaderOpDescriptorData0: case VertexShaderOpDescriptorData1: case VertexShaderOpDescriptorData2:
-		case VertexShaderOpDescriptorData3: case VertexShaderOpDescriptorData4: case VertexShaderOpDescriptorData5:
-		case VertexShaderOpDescriptorData6: case VertexShaderOpDescriptorData7:
+		case VertexShaderOpDescriptorData0:
+		case VertexShaderOpDescriptorData1:
+		case VertexShaderOpDescriptorData2:
+		case VertexShaderOpDescriptorData3:
+		case VertexShaderOpDescriptorData4:
+		case VertexShaderOpDescriptorData5:
+		case VertexShaderOpDescriptorData6:
+		case VertexShaderOpDescriptorData7: {
 			shaderUnit.vs.uploadDescriptor(value);
 			break;
+		}
 
-		case VertexBoolUniform:
+		case VertexBoolUniform: {
 			shaderUnit.vs.boolUniform = value & 0xffff;
 			break;
+		}
 
-		case VertexIntUniform0: case VertexIntUniform1: case VertexIntUniform2: case VertexIntUniform3:
+		case VertexIntUniform0:
+		case VertexIntUniform1:
+		case VertexIntUniform2:
+		case VertexIntUniform3: {
 			shaderUnit.vs.uploadIntUniform(index - VertexIntUniform0, value);
 			break;
+		}
 
-		case VertexShaderData0: case VertexShaderData1: case VertexShaderData2: case VertexShaderData3:
-		case VertexShaderData4: case VertexShaderData5: case VertexShaderData6: case VertexShaderData7:
+		case VertexShaderData0:
+		case VertexShaderData1:
+		case VertexShaderData2:
+		case VertexShaderData3:
+		case VertexShaderData4:
+		case VertexShaderData5:
+		case VertexShaderData6:
+		case VertexShaderData7: {
 			shaderUnit.vs.uploadWord(value);
 			break;
+		}
 
-		case VertexShaderEntrypoint:
+		case VertexShaderEntrypoint: {
 			shaderUnit.vs.entrypoint = value & 0xffff;
 			break;
+		}
 
 		case VertexShaderTransferEnd:
 			if (value != 0) shaderUnit.vs.finalize();
 			break;
 
-		case VertexShaderTransferIndex:
-			shaderUnit.vs.setBufferIndex(value);
-			break;
+		case VertexShaderTransferIndex: shaderUnit.vs.setBufferIndex(value); break;
 
 		// Command lists can write to the command processor registers and change the command list stream
 		// Several games are known to do this, including New Super Mario Bros 2 and Super Mario 3D Land
 		case CmdBufTrigger0:
 		case CmdBufTrigger1: {
-			if (value != 0) { // A non-zero value triggers command list processing
-				int bufferIndex = index - CmdBufTrigger0; // Index of the command buffer to execute (0 or 1)
+			if (value != 0) {                              // A non-zero value triggers command list processing
+				int bufferIndex = index - CmdBufTrigger0;  // Index of the command buffer to execute (0 or 1)
 				u32 addr = (regs[CmdBufAddr0 + bufferIndex] & 0xfffffff) << 3;
 				u32 size = (regs[CmdBufSize0 + bufferIndex] & 0xfffff) << 3;
 
@@ -285,15 +315,13 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 		default:
 			// Vertex attribute registers
 			if (index >= AttribInfoStart && index <= AttribInfoEnd) {
-				uint attributeIndex = (index - AttribInfoStart) / 3; // Which attribute are we writing to
-				uint reg = (index - AttribInfoStart) % 3; // Which of this attribute's registers are we writing to?
+				uint attributeIndex = (index - AttribInfoStart) / 3;  // Which attribute are we writing to
+				uint reg = (index - AttribInfoStart) % 3;             // Which of this attribute's registers are we writing to?
 				auto& attr = attributeInfo[attributeIndex];
 
 				switch (reg) {
-					case 0: attr.offset = value & 0xfffffff; break; // Attribute offset
-					case 1: 
-						attr.config1 = value;
-						break;
+					case 0: attr.offset = value & 0xfffffff; break;  // Attribute offset
+					case 1: attr.config1 = value; break;
 					case 2:
 						attr.config2 = value;
 						attr.size = getBits<16, 8>(value);
@@ -339,13 +367,13 @@ void GPU::startCommandList(u32 addr, u32 size) {
 
 		u32 id = header & 0xffff;
 		u32 paramMaskIndex = getBits<16, 4>(header);
-		u32 paramCount = getBits<20, 8>(header); // Number of additional parameters
+		u32 paramCount = getBits<20, 8>(header);  // Number of additional parameters
 		// Bit 31 tells us whether this command is going to write to multiple sequential registers (if the bit is 1)
 		// Or if all written values will go to the same register (If the bit is 0). It's essentially the value that
 		// gets added to the "id" field after each register write
 		bool consecutiveWritingMode = (header >> 31) != 0;
 
-		u32 mask = maskLUT[paramMaskIndex]; // Actual parameter mask
+		u32 mask = maskLUT[paramMaskIndex];  // Actual parameter mask
 		// Increment the ID by 1 after each write if we're in consecutive mode, or 0 otherwise
 		u32 idIncrement = (consecutiveWritingMode) ? 1 : 0;
 
diff --git a/src/core/PICA/shader_interpreter.cpp b/src/core/PICA/shader_interpreter.cpp
index 7af284e3..9fed6bba 100644
--- a/src/core/PICA/shader_interpreter.cpp
+++ b/src/core/PICA/shader_interpreter.cpp
@@ -1,6 +1,7 @@
-#include "PICA/shader.hpp"
 #include <cmath>
 
+#include "PICA/shader.hpp"
+
 using namespace Helpers;
 
 void PICAShader::run() {
@@ -11,20 +12,23 @@ void PICAShader::run() {
 
 	while (true) {
 		const u32 instruction = loadedShader[pc++];
-		const u32 opcode = instruction >> 26; // Top 6 bits are the opcode
+		const u32 opcode = instruction >> 26;  // Top 6 bits are the opcode
 
 		switch (opcode) {
 			case ShaderOpcodes::ADD: add(instruction); break;
 			case ShaderOpcodes::CALL: call(instruction); break;
 			case ShaderOpcodes::CALLC: callc(instruction); break;
 			case ShaderOpcodes::CALLU: callu(instruction); break;
-			case ShaderOpcodes::CMP1: case ShaderOpcodes::CMP2: 
+			case ShaderOpcodes::CMP1:
+			case ShaderOpcodes::CMP2: {
 				cmp(instruction);
 				break;
+			}
+
 			case ShaderOpcodes::DP3: dp3(instruction); break;
 			case ShaderOpcodes::DP4: dp4(instruction); break;
 			case ShaderOpcodes::DPHI: dphi(instruction); break;
-			case ShaderOpcodes::END: return; // Stop running shader
+			case ShaderOpcodes::END: return;  // Stop running shader
 			case ShaderOpcodes::EX2: ex2(instruction); break;
 			case ShaderOpcodes::FLR: flr(instruction); break;
 			case ShaderOpcodes::IFC: ifc(instruction); break;
@@ -38,31 +42,47 @@ void PICAShader::run() {
 			case ShaderOpcodes::MOV: mov(instruction); break;
 			case ShaderOpcodes::MOVA: mova(instruction); break;
 			case ShaderOpcodes::MUL: mul(instruction); break;
-			case ShaderOpcodes::NOP: break; // Do nothing
+			case ShaderOpcodes::NOP: break;  // Do nothing
 			case ShaderOpcodes::RCP: rcp(instruction); break;
 			case ShaderOpcodes::RSQ: rsq(instruction); break;
 			case ShaderOpcodes::SGEI: sgei(instruction); break;
 			case ShaderOpcodes::SLT: slt(instruction); break;
 			case ShaderOpcodes::SLTI: slti(instruction); break;
 
-			case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: case 0x35: case 0x36: case 0x37:
+			case 0x30:
+			case 0x31:
+			case 0x32:
+			case 0x33:
+			case 0x34:
+			case 0x35:
+			case 0x36:
+			case 0x37: {
 				madi(instruction);
 				break;
+			}
 
-			case 0x38: case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D: case 0x3E: case 0x3F:
+			case 0x38:
+			case 0x39:
+			case 0x3A:
+			case 0x3B:
+			case 0x3C:
+			case 0x3D:
+			case 0x3E:
+			case 0x3F: {
 				mad(instruction);
 				break;
+			}
 
-			default:Helpers::panic("Unimplemented PICA instruction %08X (Opcode = %02X)", instruction, opcode);
+			default: Helpers::panic("Unimplemented PICA instruction %08X (Opcode = %02X)", instruction, opcode);
 		}
 
 		// Handle control flow statements. The ordering is important as the priority goes: LOOP > IF > CALL
 		// Handle loop
 		if (loopIndex != 0) {
 			auto& loop = loopInfo[loopIndex - 1];
-			if (pc == loop.endingPC) { // Check if the loop needs to start over
+			if (pc == loop.endingPC) {  // Check if the loop needs to start over
 				loop.iterations -= 1;
-				if (loop.iterations == 0) // If the loop ended, go one level down on the loop stack
+				if (loop.iterations == 0)  // If the loop ended, go one level down on the loop stack
 					loopIndex -= 1;
 
 				loopCounter += loop.increment;
@@ -73,7 +93,7 @@ void PICAShader::run() {
 		// Handle ifs
 		if (ifIndex != 0) {
 			auto& info = conditionalInfo[ifIndex - 1];
-			if (pc == info.endingPC) { // Check if the IF block ended
+			if (pc == info.endingPC) {  // Check if the IF block ended
 				pc = info.newPC;
 				ifIndex -= 1;
 			}
@@ -82,7 +102,7 @@ void PICAShader::run() {
 		// Handle calls
 		if (callIndex != 0) {
 			auto& info = callInfo[callIndex - 1];
-			if (pc == info.endingPC) { // Check if the CALL block ended
+			if (pc == info.endingPC) {  // Check if the CALL block ended
 				pc = info.returnPC;
 				callIndex -= 1;
 			}
@@ -92,15 +112,15 @@ void PICAShader::run() {
 
 // Calculate the actual source value using an instruction's source field and it's respective index value
 // The index value is used to apply relative addressing when index != 0 by adding one of the 3 addr registers to the
-// source field, but only with the original source field is pointing at a vector uniform register 
+// source field, but only with the original source field is pointing at a vector uniform register
 u8 PICAShader::getIndexedSource(u32 source, u32 index) {
-	if (source < 0x20) // No offset is applied if the source isn't pointing to a vector uniform reg
+	if (source < 0x20)  // No offset is applied if the source isn't pointing to a vector uniform reg
 		return source;
 
 	switch (index) {
-		case 0: [[likely]] return u8(source); // No offset applied
-		case 1: return u8(source + addrRegister.x());
-		case 2: return u8(source + addrRegister.y());
+		case 0: [[likely]] return u8(source);  // No offset applied
+		case 1: return u8(source + addrRegister[0]);
+		case 2: return u8(source + addrRegister[1]);
 		case 3: return u8(source + loopCounter);
 	}
 
@@ -117,7 +137,7 @@ PICAShader::vec4f PICAShader::getSource(u32 source) {
 		return floatUniforms[source - 0x20];
 	else {
 		Helpers::warn("[PICA] Unimplemented source value: %X\n", source);
-		return vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() });
+		return vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()});
 	}
 }
 
@@ -136,13 +156,13 @@ bool PICAShader::isCondTrue(u32 instruction) {
 	bool refX = (getBit<25>(instruction)) != 0;
 
 	switch (condition) {
-		case 0: // Either cmp register matches 
+		case 0:  // Either cmp register matches
 			return cmpRegister[0] == refX || cmpRegister[1] == refY;
-		case 1: // Both cmp registers match
+		case 1:  // Both cmp registers match
 			return cmpRegister[0] == refX && cmpRegister[1] == refY;
-		case 2: // At least cmp.x matches
+		case 2:  // At least cmp.x matches
 			return cmpRegister[0] == refX;
-		default: // At least cmp.y matches
+		default:  // At least cmp.y matches
 			return cmpRegister[1] == refY;
 	}
 }
@@ -150,7 +170,7 @@ bool PICAShader::isCondTrue(u32 instruction) {
 void PICAShader::add(u32 instruction) {
 	const u32 operandDescriptor = operandDescriptors[instruction & 0x7f];
 	u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);
 
@@ -171,7 +191,7 @@ void PICAShader::add(u32 instruction) {
 void PICAShader::mul(u32 instruction) {
 	const u32 operandDescriptor = operandDescriptors[instruction & 0x7f];
 	u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);
 
@@ -210,7 +230,7 @@ void PICAShader::flr(u32 instruction) {
 void PICAShader::max(u32 instruction) {
 	const u32 operandDescriptor = operandDescriptors[instruction & 0x7f];
 	const u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);
 
@@ -232,7 +252,7 @@ void PICAShader::max(u32 instruction) {
 void PICAShader::min(u32 instruction) {
 	const u32 operandDescriptor = operandDescriptors[instruction & 0x7f];
 	const u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);
 
@@ -278,16 +298,16 @@ void PICAShader::mova(u32 instruction) {
 	vec4f srcVector = getSourceSwizzled<1>(src, operandDescriptor);
 
 	u32 componentMask = operandDescriptor & 0xf;
-	if (componentMask & 0b1000) // x component
-		addrRegister.x() = static_cast<s32>(srcVector.x().toFloat32());
-	if (componentMask & 0b0100) // y component
-		addrRegister.y() = static_cast<s32>(srcVector.y().toFloat32());
+	if (componentMask & 0b1000)  // x component
+		addrRegister[0] = static_cast<s32>(srcVector[0].toFloat32());
+	if (componentMask & 0b0100)  // y component
+		addrRegister[1] = static_cast<s32>(srcVector[1].toFloat32());
 }
 
 void PICAShader::dp3(u32 instruction) {
 	const u32 operandDescriptor = operandDescriptors[instruction & 0x7f];
 	u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);
 
@@ -309,7 +329,7 @@ void PICAShader::dp3(u32 instruction) {
 void PICAShader::dp4(u32 instruction) {
 	const u32 operandDescriptor = operandDescriptors[instruction & 0x7f];
 	u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);
 
@@ -480,7 +500,7 @@ void PICAShader::madi(u32 instruction) {
 void PICAShader::slt(u32 instruction) {
 	const u32 operandDescriptor = operandDescriptors[instruction & 0x7f];
 	u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);
 
@@ -542,11 +562,11 @@ void PICAShader::slti(u32 instruction) {
 void PICAShader::cmp(u32 instruction) {
 	const u32 operandDescriptor = operandDescriptors[instruction & 0x7f];
 	const u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 cmpY = getBits<21, 3>(instruction);
 	const u32 cmpX = getBits<24, 3>(instruction);
-	const u32 cmpOperations[2] = { cmpX, cmpY };
+	const u32 cmpOperations[2] = {cmpX, cmpY};
 
 	if (idx) Helpers::panic("[PICA] CMP: idx != 0");
 	vec4f srcVec1 = getSourceSwizzled<1>(src1, operandDescriptor);
@@ -554,33 +574,34 @@ void PICAShader::cmp(u32 instruction) {
 
 	for (int i = 0; i < 2; i++) {
 		switch (cmpOperations[i]) {
-			case 0: // Equal
+			case 0:  // Equal
 				cmpRegister[i] = srcVec1[i] == srcVec2[i];
 				break;
 
-			case 1: // Not equal
+			case 1:  // Not equal
 				cmpRegister[i] = srcVec1[i] != srcVec2[i];
 				break;
 
-			case 2: // Less than
+			case 2:  // Less than
 				cmpRegister[i] = srcVec1[i] < srcVec2[i];
 				break;
 
-			case 3: // Less than or equal
+			case 3:  // Less than or equal
 				cmpRegister[i] = srcVec1[i] <= srcVec2[i];
 				break;
 
-			case 4: // Greater than
+			case 4:  // Greater than
 				cmpRegister[i] = srcVec1[i] > srcVec2[i];
 				break;
 
-			case 5: // Greater than or equal
+			case 5:  // Greater than or equal
 				cmpRegister[i] = srcVec1[i] >= srcVec2[i];
 				break;
 
-			default:
+			default: {
 				cmpRegister[i] = true;
 				break;
+			}
 		}
 	}
 }
@@ -604,7 +625,7 @@ void PICAShader::ifc(u32 instruction) {
 
 void PICAShader::ifu(u32 instruction) {
 	const u32 dest = getBits<10, 12>(instruction);
-	const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check
+	const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check
 
 	if (boolUniform & (1 << bit)) {
 		if (ifIndex >= 8) [[unlikely]]
@@ -615,8 +636,7 @@ void PICAShader::ifu(u32 instruction) {
 		auto& block = conditionalInfo[ifIndex++];
 		block.endingPC = dest;
 		block.newPC = dest + num;
-	}
-	else {
+	} else {
 		pc = dest;
 	}
 }
@@ -637,12 +657,12 @@ void PICAShader::call(u32 instruction) {
 
 void PICAShader::callc(u32 instruction) {
 	if (isCondTrue(instruction)) {
-		call(instruction); // Pls inline
+		call(instruction);  // Pls inline
 	}
 }
 
 void PICAShader::callu(u32 instruction) {
-	const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check
+	const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check
 
 	if (boolUniform & (1 << bit)) {
 		if (callIndex >= 4) [[unlikely]]
@@ -664,26 +684,27 @@ void PICAShader::loop(u32 instruction) {
 		Helpers::panic("[PICA] Overflowed loop stack");
 
 	u32 dest = getBits<10, 12>(instruction);
-	auto& uniform = intUniforms[getBits<22, 2>(instruction)]; // The uniform we'll get loop info from
-	loopCounter = uniform.y();
+	auto& uniform = intUniforms[getBits<22, 2>(instruction)];  // The uniform we'll get loop info from
+	loopCounter = uniform[1];
 	auto& loop = loopInfo[loopIndex++];
 
 	loop.startingPC = pc;
-	loop.endingPC = dest + 1; // Loop is inclusive so we need + 1 here
-	loop.iterations = uniform.x() + 1;
-	loop.increment = uniform.z();
+	loop.endingPC = dest + 1;  // Loop is inclusive so we need + 1 here
+	loop.iterations = uniform[0] + 1;
+	loop.increment = uniform[2];
 }
 
 void PICAShader::jmpc(u32 instruction) {
-	if (isCondTrue(instruction))
+	if (isCondTrue(instruction)) {
 		pc = getBits<10, 12>(instruction);
+	}
 }
 
 void PICAShader::jmpu(u32 instruction) {
-	const u32 test = (instruction & 1) ^ 1; // If the LSB is 0 we want to compare to true, otherwise compare to false
+	const u32 test = (instruction & 1) ^ 1;  // If the LSB is 0 we want to compare to true, otherwise compare to false
 	const u32 dest = getBits<10, 12>(instruction);
-	const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check
+	const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check
 
-	if (((boolUniform >> bit) & 1) == test) // Jump if the bool uniform is the value we want
+	if (((boolUniform >> bit) & 1) == test)  // Jump if the bool uniform is the value we want
 		pc = dest;
 }
\ No newline at end of file
diff --git a/src/core/PICA/shader_unit.cpp b/src/core/PICA/shader_unit.cpp
index 6cbc2693..aa7b4c12 100644
--- a/src/core/PICA/shader_unit.cpp
+++ b/src/core/PICA/shader_unit.cpp
@@ -1,4 +1,5 @@
 #include "PICA/shader_unit.hpp"
+
 #include "cityhash.hpp"
 
 void ShaderUnit::reset() {
@@ -18,18 +19,18 @@ void PICAShader::reset() {
 	opDescriptorIndex = 0;
 	f32UniformTransfer = false;
 
-	const vec4f zero = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() });
+	const vec4f zero = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()});
 	inputs.fill(zero);
 	floatUniforms.fill(zero);
 	outputs.fill(zero);
 	tempRegisters.fill(zero);
 
 	for (auto& e : intUniforms) {
-		e.x() = e.y() = e.z() = e.w() = 0;
+		e[0] = e[1] = e[2] = e[3] = 0;
 	}
 
-	addrRegister.x() = 0;
-	addrRegister.y() = 0;
+	addrRegister[0] = 0;
+	addrRegister[1] = 0;
 	loopCounter = 0;
 
 	codeHashDirty = true;
diff --git a/src/core/action_replay.cpp b/src/core/action_replay.cpp
new file mode 100644
index 00000000..ad391b36
--- /dev/null
+++ b/src/core/action_replay.cpp
@@ -0,0 +1,210 @@
+#include "action_replay.hpp"
+
+ActionReplay::ActionReplay(Memory& mem, HIDService& hid) : mem(mem), hid(hid) { reset(); }  // Bind the memory/HID references, then initialise the storage registers via reset()
+
+void ActionReplay::reset() {  // Restore the storage registers to their power-on state
+	// Default value of storage regs is 0
+	storage1 = 0;
+	storage2 = 0;
+
+	// TODO: Is the active storage persistent or not?
+	activeStorage = &storage1;  // Storage register 1 is selected by default
+}
+
+void ActionReplay::runCheat(const Cheat& cheat) {  // Interpret one cheat from its first word until it stops or runs out of words
+	// Set offset and data registers to 0 at the start of a cheat
+	data1 = data2 = offset1 = offset2 = 0;
+	pc = 0;
+	ifStackIndex = 0;
+	loopStackIndex = 0;
+	running = true;  // Cleared by the instruction handlers to abort the cheat (eg on an unknown sub-opcode)
+
+	activeOffset = &offset1;
+	activeData = &data1;
+
+	while (running) {
+		// See if we can fetch 1 64-bit opcode, otherwise we're out of bounds. Cheats seem to end when going out of bounds?
+		if (pc + 1 >= cheat.size()) {  // Both cheat[pc] and cheat[pc + 1] must be valid
+			return;
+		}
+		// Fetch instruction
+		const u32 instruction = cheat[pc++];  // First word; pc now points at the second word
+		
+		// Instructions D0000000 00000000 and D2000000 00000000 are unconditional
+		bool isUnconditional = cheat[pc] == 0 && (instruction == 0xD0000000 || instruction == 0xD2000000);
+		if (ifStackIndex > 0 && !isUnconditional && !ifStack[ifStackIndex - 1]) {
+			pc++; // Eat up dummy word
+			continue; // Skip conditional instructions where the condition is false
+		}
+		
+		runInstruction(cheat, instruction);  // The handler consumes the second word itself
+	}
+}
+
+u8 ActionReplay::read8(u32 addr) { return mem.read8(addr); }     // Thin wrapper: forwards straight to Memory::read8
+u16 ActionReplay::read16(u32 addr) { return mem.read16(addr); }  // Thin wrapper: forwards straight to Memory::read16
+u32 ActionReplay::read32(u32 addr) { return mem.read32(addr); }  // Thin wrapper: forwards straight to Memory::read32
+
+// Some AR cheats seem to want to write to unmapped memory or memory that straight up does not exist
+// Each generated writeN tries the write pointer first, then the read pointer, and only warns (never panics) if both are null
+#define MAKE_WRITE_HANDLER(size)                                                          \
+	void ActionReplay::write##size(u32 addr, u##size value) {                             \
+		auto pointerWrite = mem.getWritePointer(addr);                                    \
+		if (pointerWrite) {                                                               \
+			*(u##size*)pointerWrite = value;                                              \
+		} else {                                                                          \
+			auto pointerRead = mem.getReadPointer(addr);                                  \
+			if (pointerRead) {                                                            \
+				*(u##size*)pointerRead = value;                                           \
+			} else {                                                                      \
+				Helpers::warn("AR code tried to write to invalid address: %08X\n", addr); \
+			}                                                                             \
+		}                                                                                 \
+	}
+
+MAKE_WRITE_HANDLER(8)
+MAKE_WRITE_HANDLER(16)
+MAKE_WRITE_HANDLER(32)
+#undef MAKE_WRITE_HANDLER
+
+void ActionReplay::runInstruction(const Cheat& cheat, u32 instruction) {  // Execute one 2-word instruction; pc points at its second word on entry
+	// Top nibble determines the instruction type
+	const u32 type = instruction >> 28;
+
+	switch (type) {
+		// 32-bit write to [XXXXXXX + offset]
+		case 0x0: {
+			const u32 baseAddr = Helpers::getBits<0, 28>(instruction);  // The XXXXXXX field of the first word
+			const u32 value = cheat[pc++];  // Second word holds the value to store
+			write32(baseAddr + *activeOffset, value);
+			break;
+		}
+
+		// 16-bit write to [XXXXXXX + offset]
+		case 0x1: {
+			const u32 baseAddr = Helpers::getBits<0, 28>(instruction);
+			const u16 value = u16(cheat[pc++]);  // Only the low 16 bits of the second word are stored
+			write16(baseAddr + *activeOffset, value);
+			break;
+		}
+
+		// 8-bit write to [XXXXXXX + offset]
+		case 0x2: {
+			const u32 baseAddr = Helpers::getBits<0, 28>(instruction);
+			const u8 value = u8(cheat[pc++]);  // Only the low 8 bits of the second word are stored
+			write8(baseAddr + *activeOffset, value);
+			break;
+		}
+
+		// Less Than (YYYYYYYY < [XXXXXXX + offset])
+		case 0x4: {
+			const u32 baseAddr = Helpers::getBits<0, 28>(instruction);
+			const u32 imm = cheat[pc++];  // Fetched but not used yet: the comparison itself is unimplemented
+			const u32 value = read32(baseAddr + *activeOffset);
+			Helpers::panic("TODO: How do ActionReplay conditional blocks work?");
+			break;
+		}
+
+		case 0xD: executeDType(cheat, instruction); break;  // D-type opcodes are decoded from the whole first word
+		default: Helpers::panic("Unimplemented ActionReplay instruction type %X", type); break;
+	}
+}
+
+void ActionReplay::executeDType(const Cheat& cheat, u32 instruction) {  // Run a D-type opcode; every case consumes the second word at cheat[pc]
+	switch (instruction) {
+		case 0xD3000000: offset1 = cheat[pc++]; break;         // Load offset register 1 with the second word
+		case 0xD3000001: offset2 = cheat[pc++]; break;         // Load offset register 2 with the second word
+		case 0xDC000000: *activeOffset += cheat[pc++]; break;  // Add the second word to the active offset register
+
+		// DD000000 XXXXXXXX - if KEYPAD has value XXXXXXXX execute next block
+		case 0xDD000000: {
+			const u32 mask = cheat[pc++];
+			const u32 buttons = hid.getOldButtons();
+
+			pushConditionBlock((buttons & mask) == mask);  // True only when every button in the mask is set
+			break;
+		}
+
+		// Offset register ops
+		case 0xDF000000: {
+			const u32 subopcode = cheat[pc++];
+			switch (subopcode) {
+				case 0x00000000: activeOffset = &offset1; break;
+				case 0x00000001: activeOffset = &offset2; break;
+				case 0x00010000: offset2 = offset1; break;
+				case 0x00010001: offset1 = offset2; break;
+				case 0x00020000: data1 = offset1; break;
+				case 0x00020001: data2 = offset2; break;
+				default:
+					Helpers::warn("Unknown ActionReplay offset operation: %08X", subopcode);
+					running = false;  // Abort the cheat instead of guessing
+					break;
+			}
+			break;
+		}
+
+		// Data register operations
+		case 0xDF000001: {
+			const u32 subopcode = cheat[pc++];
+			switch (subopcode) {
+				case 0x00000000: activeData = &data1; break;
+				case 0x00000001: activeData = &data2; break;
+
+				case 0x00010000: data2 = data1; break;
+				case 0x00010001: data1 = data2; break;
+				case 0x00020000: offset1 = data1; break;
+				case 0x00020001: offset2 = data2; break;
+				default:
+					Helpers::warn("Unknown ActionReplay data operation: %08X", subopcode);
+					running = false;  // Abort the cheat instead of guessing
+					break;
+			}
+			break;
+		}
+
+		// Storage register operations
+		case 0xDF000002: {
+			const u32 subopcode = cheat[pc++];
+			switch (subopcode) {
+				case 0x00000000: activeStorage = &storage1; break;
+				case 0x00000001: activeStorage = &storage2; break;
+
+				case 0x00010000: data1 = storage1; break;
+				case 0x00010001: data2 = storage2; break;
+				case 0x00020000: storage1 = data1; break;
+				case 0x00020001: storage2 = data2; break;
+				default:
+					Helpers::warn("Unknown ActionReplay storage operation: %08X", subopcode);
+					running = false;  // Abort the cheat instead of guessing
+					break;
+			}
+			break;
+		}
+
+		// Control flow block operations
+		case 0xD2000000: {
+			const u32 subopcode = cheat[pc++];
+			switch (subopcode) {
+				// Ends all loop/execute blocks
+				case 0:
+					loopStackIndex = 0;
+					ifStackIndex = 0;
+					break;
+				default: Helpers::panic("Unknown ActionReplay control flow operation: %08X", subopcode); break;
+			}
+			break;
+		}
+
+		default: Helpers::panic("ActionReplay: Unimplemented d-type opcode: %08X", instruction); break;
+	}
+}
+
+void ActionReplay::pushConditionBlock(bool condition) {
+	if (ifStackIndex < 32) {  // Room left: record the condition of the newly opened block
+		ifStack[ifStackIndex++] = condition;
+		return;
+	}
+	// Stack is full: report it and stop executing the current cheat
+	Helpers::warn("ActionReplay if stack overflowed");
+	running = false;
+}
\ No newline at end of file
diff --git a/src/core/cheats.cpp b/src/core/cheats.cpp
new file mode 100644
index 00000000..4c63652b
--- /dev/null
+++ b/src/core/cheats.cpp
@@ -0,0 +1,28 @@
+#include "cheats.hpp"
+
+Cheats::Cheats(Memory& mem, HIDService& hid) : ar(mem, hid) { reset(); }  // Build the ActionReplay engine, then start with no cheats loaded
+
+void Cheats::reset() {  // Drop every loaded cheat and reset the ActionReplay engine
+	cheats.clear();  // Unload loaded cheats
+	ar.reset();      // Reset ActionReplay
+}
+
+void Cheats::addCheat(const Cheat& cheat) { cheats.push_back(cheat); }  // Stores a copy of the cheat; it is executed on later run() calls
+
+void Cheats::run() {  // Execute every loaded cheat once, in the order they are stored
+	for (const Cheat& cheat : cheats) {
+		switch (cheat.type) {  // Dispatch on the cheat format
+			case CheatType::ActionReplay: {
+				ar.runCheat(cheat.instructions);  // Interpret the raw instruction words
+				break;
+			}
+
+			case CheatType::Gateway: {
+				Helpers::panic("Gateway cheats not supported yet! Only Action Replay is supported!");
+				break;
+			}
+
+			default: Helpers::panic("Unknown cheat type");
+		}
+	}
+}
\ No newline at end of file
diff --git a/src/core/kernel/directory_operations.cpp b/src/core/kernel/directory_operations.cpp
index 2d5d7abc..d4cac064 100644
--- a/src/core/kernel/directory_operations.cpp
+++ b/src/core/kernel/directory_operations.cpp
@@ -1,3 +1,10 @@
+#include <array>
+#include <cctype>
+#include <filesystem>
+#include <string>
+#include <utility>
+
+#include "ipc.hpp"
 #include "kernel.hpp"
 
 namespace DirectoryOps {
@@ -7,6 +14,79 @@ namespace DirectoryOps {
 	};
 }
 
+// Helper to convert std::string to an 8.3 filename to mimic how Directory::Read works
+using ShortFilename = std::array<char, 9>;
+using ShortExtension = std::array<char, 4>;
+using Filename83 = std::pair<ShortFilename, ShortExtension>;
+
+// The input string should be the stem and extension together, not separately
+// Eg something like "boop.png", "panda.txt", etc
+Filename83 convertTo83(const std::string& path) {
+	ShortFilename filename;
+	ShortExtension extension;
+
+	// Convert a character to add it to the 8.3 name
+	// "Characters such as + are changed to the underscore _, and letters are put in uppercase"
+	// For now we put letters in uppercase until we find out what is supposed to be converted to _ and so on
+	auto convertCharacter = [](char c) { return (char)std::toupper((unsigned char)c); };  // Cast first: a negative char is UB for toupper
+
+	// List of forbidden character for 8.3 filenames, from Citra
+	// TODO: Use constexpr when C++20 support is solid
+	const std::string forbiddenChars = ".\"/\\[]:;=, ";
+
+	// By default space-initialize the whole name, append null terminator in the end for both the filename and extension
+	filename.fill(' ');
+	extension.fill(' ');
+	filename[filename.size() - 1] = '\0';
+	extension[extension.size() - 1] = '\0';
+
+	// Find the position of the dot in the string
+	auto dotPos = path.rfind('.');
+	// Wikipedia: If a file name has no extension, a trailing . has no effect
+	// Thus check if the last character is a dot and ignore it, preferring the previous dot if it exists (search must start before it)
+	if (dotPos != std::string::npos && dotPos + 1 == path.size()) {
+		dotPos = (dotPos == 0) ? std::string::npos : path.rfind('.', dotPos - 1);  // Get previous dot
+	}
+
+	// If dotPos is not npos we have a valid dot character, and as such an extension
+	bool haveExtension = dotPos != std::string::npos;
+	int validCharacterCount = 0;
+	bool filenameTooBig = false;
+
+	// Parse characters until we're done OR until we reach 9 characters, in which case according to Wikipedia we must truncate to 6 letters
+	// And append ~1 in the end
+	for (auto c : path.substr(0, dotPos)) {
+		// Character is forbidden, we must ignore it
+		if (forbiddenChars.find(c) != std::string::npos) {
+			continue;
+		}
+
+		// We already have capped the amount of characters, thus our filename is too big
+		if (validCharacterCount == 8) {
+			filenameTooBig = true;
+			break;
+		}
+		filename[validCharacterCount++] = convertCharacter(c); // Append character to filename
+	}
+
+	// Truncate name to 6 characters and denote that it is too big
+	// TODO: Wikipedia says we should also do this if the filename contains an invalid character, including spaces. Must test
+	if (filenameTooBig) {
+		filename[6] = '~';
+		filename[7] = '1';
+	}
+
+	if (haveExtension) {
+		int extensionLen = 0;
+		// Copy up to 3 characters from the dot onwards to the extension
+		for (auto c : path.substr(dotPos + 1, 3)) {
+			extension[extensionLen++] = convertCharacter(c);
+		}
+	}
+
+	return {filename, extension};
+}
+
 void Kernel::handleDirectoryOperation(u32 messagePointer, Handle directory) {
 	const u32 cmd = mem.read32(messagePointer);
 	switch (cmd) {
@@ -25,16 +105,77 @@ void Kernel::closeDirectory(u32 messagePointer, Handle directory) {
 	}
 
 	p->getData<DirectorySession>()->isOpen = false;
+	mem.write32(messagePointer, IPC::responseHeader(0x802, 1, 0));
 	mem.write32(messagePointer + 4, Result::Success);
 }
 
-
 void Kernel::readDirectory(u32 messagePointer, Handle directory) {
 	const u32 entryCount = mem.read32(messagePointer + 4);
 	const u32 outPointer = mem.read32(messagePointer + 12);
 	logFileIO("Directory::Read (handle = %X, entry count = %d, out pointer = %08X)\n", directory, entryCount, outPointer);
-	Helpers::panicDev("Unimplemented FsDir::Read");
+	// Copies up to entryCount entries of the opened directory to outPointer and replies with how many were written
+	const auto p = getObject(directory, KernelObjectType::Directory);
+	if (p == nullptr) [[unlikely]] {
+		Helpers::panic("Called ReadDirectory on non-existent directory");
+	}
 
+	DirectorySession* session = p->getData<DirectorySession>();
+	if (!session->pathOnDisk.has_value()) [[unlikely]] {
+		Helpers::panic("Called ReadDirectory on directory that doesn't have a path on disk");
+	}
+
+	std::filesystem::path dirPath = session->pathOnDisk.value();
+
+	u32 count = 0;  // Unsigned so the comparison with entryCount is not a signed/unsigned mix
+	while (count < entryCount && session->currentEntry < session->entries.size()) {
+		const auto& entry = session->entries[session->currentEntry];
+		std::filesystem::path path = entry.path;
+		std::filesystem::path filename = path.filename();
+
+		std::filesystem::path relative = path.lexically_relative(dirPath);
+		// The directory attribute below comes from entry.isDirectory, so the disk is not queried again here
+
+		std::u16string nameU16 = relative.u16string();
+		bool isHidden = nameU16[0] == u'.'; // If the first character is a dot then this is a hidden file/folder
+
+		const u32 entryPointer = outPointer + (count * 0x228); // 0x228 is the size of a single entry
+		u32 utfPointer = entryPointer;
+		u32 namePointer = entryPointer + 0x20C;
+		u32 extensionPointer = entryPointer + 0x216;
+		u32 attributePointer = entryPointer + 0x21C;
+		u32 sizePointer = entryPointer + 0x220;  // NOTE(review): nothing is written here yet - presumably the file size field, confirm
+
+		std::string filenameString = filename.string();
+		auto [shortFilename, shortExtension] = convertTo83(filenameString);
+
+		for (auto c : nameU16) {
+			mem.write16(utfPointer, u16(c));
+			utfPointer += sizeof(u16);
+		}
+		mem.write16(utfPointer, 0); // Null terminate the UTF16 name
+
+		// Write 8.3 filename-extension
+		for (auto c : shortFilename) {
+			mem.write8(namePointer, u8(c));
+			namePointer += sizeof(u8);
+		}
+
+		for (auto c : shortExtension) {
+			mem.write8(extensionPointer, u8(c));
+			extensionPointer += sizeof(u8);
+		}
+
+		mem.write8(entryPointer + 0x21A, 1);                          // Always 1 according to 3DBrew (per entry, not just the first)
+		mem.write8(attributePointer, entry.isDirectory ? 1 : 0);      // "Is directory" attribute
+		mem.write8(attributePointer + 1, isHidden ? 1 : 0);           // "Is hidden" attribute
+		mem.write8(attributePointer + 2, entry.isDirectory ? 0 : 1);  // "Is archive" attribute
+		mem.write8(attributePointer + 3, 0);                          // "Is read-only" attribute
+
+		count++;                  // Increment number of read directories
+		session->currentEntry++;  // Increment index of the entry currently being read
+	}
+
+	mem.write32(messagePointer, IPC::responseHeader(0x801, 2, 2));
 	mem.write32(messagePointer + 4, Result::Success);
-	mem.write32(messagePointer + 8, 0);
+	mem.write32(messagePointer + 8, count);
 }
diff --git a/src/core/kernel/kernel.cpp b/src/core/kernel/kernel.cpp
index 1402b468..8f3aeda0 100644
--- a/src/core/kernel/kernel.cpp
+++ b/src/core/kernel/kernel.cpp
@@ -95,14 +95,29 @@ KernelObject* Kernel::getProcessFromPID(Handle handle) {
 }
 
 void Kernel::deleteObjectData(KernelObject& object) {
-	using enum KernelObjectType;
-
-	// Resource limit and thread objects do not allocate heap data, so we don't delete anything
-	if (object.data == nullptr || object.type == ResourceLimit || object.type == Thread) {
+	if (object.data == nullptr) {
 		return;
 	}
 
-	delete object.data;
+	// Resource limit and thread objects do not allocate heap data, so we don't delete anything
+	// Every other type is deleted through its concrete getData<T>() pointer (presumably so the matching destructor runs - TODO confirm)
+	switch (object.type) {
+		case KernelObjectType::AddressArbiter: delete object.getData<AddressArbiter>(); return;
+		case KernelObjectType::Archive: delete object.getData<ArchiveSession>(); return;
+		case KernelObjectType::Directory: delete object.getData<DirectorySession>(); return;
+		case KernelObjectType::Event: delete object.getData<Event>(); return;
+		case KernelObjectType::File: delete object.getData<FileSession>(); return;
+		case KernelObjectType::MemoryBlock: delete object.getData<MemoryBlock>(); return;
+		case KernelObjectType::Port: delete object.getData<Port>(); return;
+		case KernelObjectType::Process: delete object.getData<Process>(); return;
+		case KernelObjectType::ResourceLimit: return;  // Nothing to free
+		case KernelObjectType::Session: delete object.getData<Session>(); return;
+		case KernelObjectType::Mutex: delete object.getData<Mutex>(); return;
+		case KernelObjectType::Semaphore: delete object.getData<Semaphore>(); return;
+		case KernelObjectType::Thread: return;  // Nothing to free
+		case KernelObjectType::Dummy: return;   // Nothing to free
+		default: [[unlikely]] Helpers::warn("unknown object type"); return;
+	}
 }
 
 void Kernel::reset() {
@@ -240,4 +255,4 @@ std::string Kernel::getProcessName(u32 pid) {
 	} else {
 		Helpers::panic("Attempted to name non-current process");
 	}
-}
\ No newline at end of file
+}
diff --git a/src/core/renderer_gl/etc1.cpp b/src/core/renderer_gl/etc1.cpp
index 82f06724..8aefd622 100644
--- a/src/core/renderer_gl/etc1.cpp
+++ b/src/core/renderer_gl/etc1.cpp
@@ -9,7 +9,7 @@ static constexpr u32 signExtend3To32(u32 val) {
     return (u32)(s32(val) << 29 >> 29);
 }
 
-u32 Texture::getTexelETC(bool hasAlpha, u32 u, u32 v, u32 width, const void* data) {
+u32 Texture::getTexelETC(bool hasAlpha, u32 u, u32 v, u32 width, std::span<const u8> data) {
     // Pixel offset of the 8x8 tile based on u, v and the width of the texture
     u32 offs = ((u & ~7) * 8) + ((v & ~7) * width);
     if (!hasAlpha)
@@ -30,8 +30,7 @@ u32 Texture::getTexelETC(bool hasAlpha, u32 u, u32 v, u32 width, const void* dat
     offs += subTileSize * subTileIndex;
 
     u32 alpha;
-    const u8* tmp = static_cast<const u8*>(data) + offs; // Pointer to colour and alpha data as u8*
-    const u64* ptr = reinterpret_cast<const u64*>(tmp); // Cast to u64*
+    const u64* ptr = reinterpret_cast<const u64*>(data.data() + offs); // Cast to u64*
 
     if (hasAlpha) {
         // First 64 bits of the 4x4 subtile are alpha data
@@ -118,4 +117,4 @@ u32 Texture::decodeETC(u32 alpha, u32 u, u32 v, u64 colourData) {
     b = std::clamp(b + modifier, 0, 255);
 
     return (alpha << 24) | (u32(b) << 16) | (u32(g) << 8) | u32(r);
-}
\ No newline at end of file
+}
diff --git a/src/gl_state.cpp b/src/core/renderer_gl/gl_state.cpp
similarity index 96%
rename from src/gl_state.cpp
rename to src/core/renderer_gl/gl_state.cpp
index 612ae44d..691eb7b6 100644
--- a/src/gl_state.cpp
+++ b/src/core/renderer_gl/gl_state.cpp
@@ -1,4 +1,4 @@
-#include "gl_state.hpp"
+#include "renderer_gl/gl_state.hpp"
 
 void GLStateManager::resetBlend() {
 	blendEnabled = false;
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 3a13b31d..bef3fe93 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -1,582 +1,22 @@
 #include "renderer_gl/renderer_gl.hpp"
+
+#include <stb_image_write.h>
+
+#include <cmrc/cmrc.hpp>
+
 #include "PICA/float_types.hpp"
 #include "PICA/gpu.hpp"
 #include "PICA/regs.hpp"
 
+CMRC_DECLARE(RendererGL);
+
 using namespace Floats;
 using namespace Helpers;
 using namespace PICA;
 
-const char* vertexShader = R"(
-	#version 410 core
-	
-	layout (location = 0) in vec4  a_coords;
-	layout (location = 1) in vec4  a_quaternion;
-	layout (location = 2) in vec4  a_vertexColour;
-	layout (location = 3) in vec2  a_texcoord0;
-	layout (location = 4) in vec2  a_texcoord1;
-	layout (location = 5) in float a_texcoord0_w;
-	layout (location = 6) in vec3  a_view;
-	layout (location = 7) in vec2  a_texcoord2;
+RendererGL::~RendererGL() {}  // Empty body: no explicit teardown is performed here
 
-	out vec3 v_normal;
-	out vec3 v_tangent;
-	out vec3 v_bitangent;
-	out vec4 v_colour;
-	out vec3 v_texcoord0;
-	out vec2 v_texcoord1;
-	out vec3 v_view;
-	out vec2 v_texcoord2;
-	flat out vec4 v_textureEnvColor[6];
-	flat out vec4 v_textureEnvBufferColor;
-
-	out float gl_ClipDistance[2];
-
-	// TEV uniforms
-	uniform uint u_textureEnvColor[6];
-	uniform uint u_picaRegs[0x200 - 0x48];
-
-	// Helper so that the implementation of u_pica_regs can be changed later
-	uint readPicaReg(uint reg_addr){
-		return u_picaRegs[reg_addr - 0x48];
-	}
-
-	vec4 abgr8888ToVec4(uint abgr) {
-		const float scale = 1.0 / 255.0;
-
-		return scale * vec4(
-			float(abgr & 0xffu),
-			float((abgr >> 8) & 0xffu),
-			float((abgr >> 16) & 0xffu),
-			float(abgr >> 24)
-		);
-	}
-
-	vec3 rotateVec3ByQuaternion(vec3 v, vec4 q){
-		vec3 u = q.xyz;
-		float s = q.w;
-		return 2.0 * dot(u, v) * u + (s * s - dot(u, u))* v  + 2.0 * s * cross(u, v);
-	}
-
-	// Convert an arbitrary-width floating point literal to an f32
-	float decodeFP(uint hex, uint E, uint M){
-		uint width = M + E + 1u;
-		uint bias = 128u - (1u << (E - 1u));
-		uint exponent = (hex >> M) & ((1u << E) - 1u);
-		uint mantissa = hex & ((1u << M) - 1u);
-		uint sign = (hex >> (E + M)) << 31u;
-
-		if ((hex & ((1u << (width - 1u)) - 1u)) != 0) {
-			if (exponent == (1u << E) - 1u) exponent = 255u;
-			else exponent += bias;
-			hex = sign | (mantissa << (23u - M)) | (exponent << 23u);
-		} else {
-			hex = sign;
-		}
-
-        return uintBitsToFloat(hex);
-	}
-
-	void main() {
-		gl_Position = a_coords;
-		v_colour = a_vertexColour;
-
-		// Flip y axis of UVs because OpenGL uses an inverted y for texture sampling compared to the PICA
-		v_texcoord0 = vec3(a_texcoord0.x, 1.0 - a_texcoord0.y, a_texcoord0_w);
-		v_texcoord1 = vec2(a_texcoord1.x, 1.0 - a_texcoord1.y);
-		v_texcoord2 = vec2(a_texcoord2.x, 1.0 - a_texcoord2.y);
-		v_view = a_view; 
-
-		v_normal    = normalize(rotateVec3ByQuaternion(vec3(0.0, 0.0, 1.0), a_quaternion));
-		v_tangent   = normalize(rotateVec3ByQuaternion(vec3(1.0, 0.0, 0.0), a_quaternion));
-		v_bitangent = normalize(rotateVec3ByQuaternion(vec3(0.0, 1.0, 0.0), a_quaternion));
-
-		for (int i = 0; i < 6; i++) {
-			v_textureEnvColor[i] = abgr8888ToVec4(u_textureEnvColor[i]);
-		}
-
-		v_textureEnvBufferColor = abgr8888ToVec4(readPicaReg(0xFD));
-
-		// Parse clipping plane registers
-		// The plane registers describe a clipping plane in the form of Ax + By + Cz + D = 0 
-		// With n = (A, B, C) being the normal vector and D being the origin point distance
-		// Therefore, for the second clipping plane, we can just pass the dot product of the clip vector and the input coordinates to gl_ClipDistance[1]
-		vec4 clipData = vec4(
-			decodeFP(readPicaReg(0x48) & 0xffffffu, 7, 16),
-			decodeFP(readPicaReg(0x49) & 0xffffffu, 7, 16),
-			decodeFP(readPicaReg(0x4A) & 0xffffffu, 7, 16),
-			decodeFP(readPicaReg(0x4B) & 0xffffffu, 7, 16)
-		);
-
-		// There's also another, always-on clipping plane based on vertex z
-		gl_ClipDistance[0] = -a_coords.z;
-		gl_ClipDistance[1] = dot(clipData, a_coords);
-	}
-)";
-
-const char* fragmentShader = R"(
-	#version 410 core
-	
-	in vec3 v_tangent;
-	in vec3 v_normal;
-	in vec3 v_bitangent;
-	in vec4 v_colour;
-	in vec3 v_texcoord0;
-	in vec2 v_texcoord1;
-	in vec3 v_view;
-	in vec2 v_texcoord2;
-	flat in vec4 v_textureEnvColor[6];
-	flat in vec4 v_textureEnvBufferColor;
-
-	out vec4 fragColour;
-
-	// TEV uniforms
-	uniform uint u_textureEnvSource[6];
-	uniform uint u_textureEnvOperand[6];
-	uniform uint u_textureEnvCombiner[6];
-	uniform uint u_textureEnvScale[6];
-
-	// Depth control uniforms
-	uniform float u_depthScale;
-	uniform float u_depthOffset;
-	uniform bool u_depthmapEnable;
-
-	uniform sampler2D u_tex0;
-	uniform sampler2D u_tex1;
-	uniform sampler2D u_tex2;
-	uniform sampler1DArray u_tex_lighting_lut;
-
-	uniform uint u_picaRegs[0x200 - 0x48];
-
-	// Helper so that the implementation of u_pica_regs can be changed later
-	uint readPicaReg(uint reg_addr){
-		return u_picaRegs[reg_addr - 0x48];
-	}
-
-	vec4 tevSources[16];
-	vec4 tevNextPreviousBuffer;
-	bool tevUnimplementedSourceFlag = false;
-
-	// OpenGL ES 1.1 reference pages for TEVs (this is what the PICA200 implements):
-	// https://registry.khronos.org/OpenGL-Refpages/es1.1/xhtml/glTexEnv.xml
-
-	vec4 tevFetchSource(uint src_id) {
-		if (src_id >= 6u && src_id < 13u) {
-			tevUnimplementedSourceFlag = true;
-		}
-
-		return tevSources[src_id];
-	}
-
-	vec4 tevGetColorAndAlphaSource(int tev_id, int src_id) {
-		vec4 result;
-
-		vec4 colorSource = tevFetchSource((u_textureEnvSource[tev_id] >> (src_id * 4)) & 15u);
-		vec4 alphaSource = tevFetchSource((u_textureEnvSource[tev_id] >> (src_id * 4 + 16)) & 15u);
-
-		uint colorOperand = (u_textureEnvOperand[tev_id] >> (src_id * 4)) & 15u;
-		uint alphaOperand = (u_textureEnvOperand[tev_id] >> (12 + src_id * 4)) & 7u;
-
-		// TODO: figure out what the undocumented values do
-		switch (colorOperand) {
-			case  0u: result.rgb = colorSource.rgb; break;            // Source color
-			case  1u: result.rgb = 1.0 - colorSource.rgb; break;      // One minus source color
-			case  2u: result.rgb = vec3(colorSource.a); break;        // Source alpha
-			case  3u: result.rgb = vec3(1.0 - colorSource.a); break;  // One minus source alpha
-			case  4u: result.rgb = vec3(colorSource.r); break;        // Source red
-			case  5u: result.rgb = vec3(1.0 - colorSource.r); break;  // One minus source red
-			case  8u: result.rgb = vec3(colorSource.g); break;        // Source green
-			case  9u: result.rgb = vec3(1.0 - colorSource.g); break;  // One minus source green
-			case 12u: result.rgb = vec3(colorSource.b); break;        // Source blue
-			case 13u: result.rgb = vec3(1.0 - colorSource.b); break;  // One minus source blue
-			default: break;
-		}
-
-		// TODO: figure out what the undocumented values do
-		switch (alphaOperand) {
-			case 0u: result.a = alphaSource.a; break;        // Source alpha
-			case 1u: result.a = 1.0 - alphaSource.a; break;  // One minus source alpha
-			case 2u: result.a = alphaSource.r; break;        // Source red
-			case 3u: result.a = 1.0 - alphaSource.r; break;  // One minus source red
-			case 4u: result.a = alphaSource.g; break;        // Source green
-			case 5u: result.a = 1.0 - alphaSource.g; break;  // One minus source green
-			case 6u: result.a = alphaSource.b; break;        // Source blue
-			case 7u: result.a = 1.0 - alphaSource.b; break;  // One minus source blue
-			default: break;
-		}
-
-		return result;
-	}
-
-	vec4 tevCalculateCombiner(int tev_id) {
-		vec4 source0 = tevGetColorAndAlphaSource(tev_id, 0);
-		vec4 source1 = tevGetColorAndAlphaSource(tev_id, 1);
-		vec4 source2 = tevGetColorAndAlphaSource(tev_id, 2);
-
-		uint colorCombine = u_textureEnvCombiner[tev_id] & 15u;
-		uint alphaCombine = (u_textureEnvCombiner[tev_id] >> 16) & 15u;
-
-		vec4 result = vec4(1.0);
-
-		// TODO: figure out what the undocumented values do
-		switch (colorCombine) {
-			case 0u: result.rgb = source0.rgb; break;                                       // Replace
-			case 1u: result.rgb = source0.rgb * source1.rgb; break;                         // Modulate
-			case 2u: result.rgb = min(vec3(1.0), source0.rgb + source1.rgb); break;         // Add
-			case 3u: result.rgb = clamp(source0.rgb + source1.rgb - 0.5, 0.0, 1.0); break;  // Add signed
-			case 4u: result.rgb = mix(source1.rgb, source0.rgb, source2.rgb); break;        // Interpolate
-			case 5u: result.rgb = max(source0.rgb - source1.rgb, 0.0); break;               // Subtract
-			case 6u: result.rgb = vec3(4.0 * dot(source0.rgb - 0.5 , source1.rgb - 0.5)); break;  // Dot3 RGB
-			case 7u: result     = vec4(4.0 * dot(source0.rgb - 0.5 , source1.rgb - 0.5)); break;  // Dot3 RGBA
-			case 8u: result.rgb = min(source0.rgb * source1.rgb + source2.rgb, 1.0); break;       // Multiply then add
-			case 9u: result.rgb = min((source0.rgb + source1.rgb) * source2.rgb, 1.0); break;     // Add then multiply
-			default: break;
-		}
-
-		if (colorCombine != 7u) { // The color combiner also writes the alpha channel in the "Dot3 RGBA" mode.
-			// TODO: figure out what the undocumented values do
-			// TODO: test if the alpha combiner supports all the same modes as the color combiner.
-			switch (alphaCombine) {
-				case 0u: result.a = source0.a; break;                                      // Replace
-				case 1u: result.a = source0.a * source1.a; break;                          // Modulate
-				case 2u: result.a = min(1.0, source0.a + source1.a); break;                // Add
-				case 3u: result.a = clamp(source0.a + source1.a - 0.5, 0.0, 1.0); break;   // Add signed
-				case 4u: result.a = mix(source1.a, source0.a, source2.a); break;           // Interpolate
-				case 5u: result.a = max(0.0, source0.a - source1.a); break;                // Subtract
-				case 8u: result.a = min(1.0, source0.a * source1.a + source2.a); break;    // Multiply then add
-				case 9u: result.a = min(1.0, (source0.a + source1.a) * source2.a); break;  // Add then multiply
-				default: break;
-			}
-		}
-
-		result.rgb *= float(1 << (u_textureEnvScale[tev_id] & 3u));
-		result.a   *= float(1 << ((u_textureEnvScale[tev_id] >> 16) & 3u));
-
-		return result;
-	}
-
-	#define D0_LUT 0u
-	#define D1_LUT 1u
-	#define SP_LUT 2u
-	#define FR_LUT 3u
-	#define RB_LUT 4u
-	#define RG_LUT 5u
-	#define RR_LUT 6u
-
-	float lutLookup(uint lut, uint light, float value){
-		if (lut >= FR_LUT && lut <= RR_LUT)
-			lut -= 1;
-		if (lut==SP_LUT)
-			lut = light + 8;
-		return texture(u_tex_lighting_lut, vec2(value, lut)).r; 
-	}
-
-	vec3 regToColor(uint reg) {
-		// Normalization scale to convert from [0...255] to [0.0...1.0]
-		const float scale = 1.0 / 255.0;
-
-		return scale * vec3(
-			float(bitfieldExtract(reg, 20, 8)),
-			float(bitfieldExtract(reg, 10, 8)),
-			float(bitfieldExtract(reg, 00, 8))
-		);
-	}
-
-	// Convert an arbitrary-width floating point literal to an f32
-	float decodeFP(uint hex, uint E, uint M){
-		uint width = M + E + 1u;
-		uint bias = 128u - (1u << (E - 1u));
-		uint exponent = (hex >> M) & ((1u << E) - 1u);
-		uint mantissa = hex & ((1u << M) - 1u);
-		uint sign = (hex >> (E + M)) << 31u;
-
-		if ((hex & ((1u << (width - 1u)) - 1u)) != 0) {
-			if (exponent == (1u << E) - 1u) exponent = 255u;
-			else exponent += bias;
-			hex = sign | (mantissa << (23u - M)) | (exponent << 23u);
-		} else {
-			hex = sign;
-		}
-
-        return uintBitsToFloat(hex);
-	}
-
-	// Implements the following algorthm: https://mathb.in/26766
-	void calcLighting(out vec4 primary_color, out vec4 secondary_color){
-		// Quaternions describe a transformation from surface-local space to eye space.
-		// In surface-local space, by definition (and up to permutation) the normal vector is (0,0,1),
-		// the tangent vector is (1,0,0), and the bitangent vector is (0,1,0).
-		vec3 normal    = normalize(v_normal   );
-		vec3 tangent   = normalize(v_tangent  );
-		vec3 bitangent = normalize(v_bitangent);
-		vec3 view = normalize(v_view);
-
-		uint GPUREG_LIGHTING_ENABLE  = readPicaReg(0x008F);
-		if (bitfieldExtract(GPUREG_LIGHTING_ENABLE, 0, 1) == 0){
-			primary_color = secondary_color = vec4(1.0);
-			return;
-		}
-
-		uint GPUREG_LIGHTING_AMBIENT = readPicaReg(0x01C0);
-		uint GPUREG_LIGHTING_NUM_LIGHTS = (readPicaReg(0x01C2) & 0x7u) +1;
-		uint GPUREG_LIGHTING_LIGHT_PERMUTATION = readPicaReg(0x01D9);
-
-		primary_color   = vec4(vec3(0.0),1.0);
-		secondary_color = vec4(vec3(0.0),1.0);
-
-		primary_color.rgb += regToColor(GPUREG_LIGHTING_AMBIENT);
-
-		uint GPUREG_LIGHTING_LUTINPUT_ABS = readPicaReg(0x01D0);
-		uint GPUREG_LIGHTING_LUTINPUT_SELECT = readPicaReg(0x01D1);
-		uint GPUREG_LIGHTING_CONFIG0 = readPicaReg(0x01C3);
-		uint GPUREG_LIGHTING_CONFIG1 = readPicaReg(0x01C4);
-		uint GPUREG_LIGHTING_LUTINPUT_SCALE =  readPicaReg(0x01D2);
-		float d[7];
-
-		bool error_unimpl = false;
-
-		for (uint i = 0; i < GPUREG_LIGHTING_NUM_LIGHTS; i++) {
-			uint light_id = bitfieldExtract(GPUREG_LIGHTING_LIGHT_PERMUTATION,int(i*3),3);
-		
-			uint GPUREG_LIGHTi_SPECULAR0 = readPicaReg(0x0140 + 0x10 * light_id);
-			uint GPUREG_LIGHTi_SPECULAR1 = readPicaReg(0x0141 + 0x10 * light_id);
-			uint GPUREG_LIGHTi_DIFFUSE = readPicaReg(0x0142 + 0x10 * light_id);
-			uint GPUREG_LIGHTi_AMBIENT = readPicaReg(0x0143 + 0x10 * light_id);
-			uint GPUREG_LIGHTi_VECTOR_LOW = readPicaReg(0x0144 + 0x10 * light_id);
-			uint GPUREG_LIGHTi_VECTOR_HIGH= readPicaReg(0x0145 + 0x10 * light_id);
-			uint GPUREG_LIGHTi_CONFIG = readPicaReg(0x0149 + 0x10 * light_id);
-
-			vec3 light_vector = normalize(vec3(
-				decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_LOW, 0, 16), 5, 10),
-				decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_LOW, 16, 16), 5, 10),
-				decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_HIGH, 0, 16), 5, 10)
-			));
-
-			// Positional Light
-			if (bitfieldExtract(GPUREG_LIGHTi_CONFIG, 0, 1) == 0)
-				error_unimpl = true;
-
-			vec3 half_vector = normalize(normalize(light_vector) + view);
-
-			for (int c = 0; c < 7; c++) {
-				if (bitfieldExtract(GPUREG_LIGHTING_CONFIG1, 16 + c, 1) == 0){
-					uint scale_id = bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_SCALE, c * 4, 3);
-					float scale = float(1u << scale_id);
-					if (scale_id >= 6u)
-						scale/=256.0;
-
-					uint input_id = bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_SELECT, c * 4, 3);
-					if (input_id == 0u) d[c] = dot(normal,half_vector);
-					else if (input_id == 1u) d[c] = dot(view,half_vector);
-					else if (input_id == 2u) d[c] = dot(normal,view);
-					else if (input_id == 3u) d[c] = dot(light_vector,normal);
-					else if (input_id == 4u){
-						uint GPUREG_LIGHTi_SPOTDIR_LOW = readPicaReg(0x0146 + 0x10 * light_id);
-						uint GPUREG_LIGHTi_SPOTDIR_HIGH= readPicaReg(0x0147 + 0x10 * light_id);
-						vec3 spot_light_vector = normalize(vec3(
-							decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_LOW, 0, 16), 1, 11),
-							decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_LOW, 16, 16), 1, 11),
-							decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_HIGH, 0, 16), 1, 11)
-						));
-						d[c] = dot(-light_vector, spot_light_vector); // -L dot P (aka Spotlight aka SP);
-					} else if (input_id == 5u) {
-						d[c] = 1.0; // TODO: cos <greek symbol> (aka CP);
-						error_unimpl = true;
-					} else {
-						d[c] = 1.0;
-					}
-
-					d[c] = lutLookup(c, light_id, d[c] * 0.5 + 0.5) * scale;
-					if (bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_ABS, 2 * c, 1) != 0u) 
-						d[c] = abs(d[c]);
-				} else {
-					d[c] = 1.0;
-				}
-			}
-			
-			uint lookup_config = bitfieldExtract(GPUREG_LIGHTi_CONFIG,4,4);
-			if (lookup_config == 0) {
-				d[D1_LUT] = 0.0;
-				d[FR_LUT] = 0.0;
-				d[RG_LUT]= d[RB_LUT] = d[RR_LUT];
-			} else if (lookup_config == 1) {
-				d[D0_LUT] = 0.0;
-				d[D1_LUT] = 0.0;
-				d[RG_LUT] = d[RB_LUT] = d[RR_LUT];
-			} else if (lookup_config == 2) {
-				d[FR_LUT] = 0.0;
-				d[SP_LUT] = 0.0;
-				d[RG_LUT] = d[RB_LUT] = d[RR_LUT];
-			} else if (lookup_config == 3) {
-				d[SP_LUT] = 0.0;
-				d[RG_LUT]= d[RB_LUT] = d[RR_LUT] = 1.0;
-			} else if (lookup_config == 4) {
-				d[FR_LUT] = 0.0;
-			} else if (lookup_config == 5) {
-				d[D1_LUT] = 0.0;
-			} else if (lookup_config == 6) {
-				d[RG_LUT] = d[RB_LUT] = d[RR_LUT];
-			}
-
-			float distance_factor = 1.0; // a
-			float indirect_factor = 1.0; // fi
-			float shadow_factor = 1.0;   // o
-
-			float NdotL = dot(normal, light_vector); //Li dot N
-
-			// Two sided diffuse
-			if (bitfieldExtract(GPUREG_LIGHTi_CONFIG, 1, 1) == 0) NdotL = max(0.0, NdotL);
-			else NdotL = abs(NdotL);
-
-			float light_factor =  distance_factor*d[SP_LUT]*indirect_factor*shadow_factor;
-
-			primary_color.rgb   += light_factor * (regToColor(GPUREG_LIGHTi_AMBIENT) + regToColor(GPUREG_LIGHTi_DIFFUSE)*NdotL);
-			secondary_color.rgb += light_factor * (
-									 regToColor(GPUREG_LIGHTi_SPECULAR0) * d[D0_LUT] +
-									 regToColor(GPUREG_LIGHTi_SPECULAR1) * d[D1_LUT] * vec3(d[RR_LUT], d[RG_LUT], d[RB_LUT])
-									);
-		}	
-		uint fresnel_output1 = bitfieldExtract(GPUREG_LIGHTING_CONFIG0, 2, 1);
-		uint fresnel_output2 = bitfieldExtract(GPUREG_LIGHTING_CONFIG0, 3, 1);
-
-		if (fresnel_output1 == 1u) primary_color.a = d[FR_LUT];
-		if (fresnel_output2 == 1u) secondary_color.a = d[FR_LUT];
-
-		if (error_unimpl) {
-			secondary_color = primary_color = vec4(1.0,0.,1.0,1.0);
-		}
-	}
-
-	void main() {
-		// TODO: what do invalid sources and disabled textures read as?
-		// And what does the "previous combiner" source read initially?
-		tevSources[0] = v_colour; // Primary/vertex color
-		calcLighting(tevSources[1],tevSources[2]);
-
-		uint textureConfig = readPicaReg(0x80);
-		vec2 tex2UV = (textureConfig & (1u << 13)) != 0u ? v_texcoord1 : v_texcoord2;
-
-		if ((textureConfig & 1u) != 0u) tevSources[3] = texture(u_tex0, v_texcoord0.xy);
-		if ((textureConfig & 2u) != 0u) tevSources[4] = texture(u_tex1, v_texcoord1);
-		if ((textureConfig & 4u) != 0u) tevSources[5] = texture(u_tex2, tex2UV);
-		tevSources[13] = vec4(0.0); // Previous buffer
-		tevSources[15] = vec4(0.0); // Previous combiner
-
-		tevNextPreviousBuffer = v_textureEnvBufferColor;
-		uint textureEnvUpdateBuffer = readPicaReg(0xE0);
-
-		for (int i = 0; i < 6; i++) {
-			tevSources[14] = v_textureEnvColor[i]; // Constant color
-			tevSources[15] = tevCalculateCombiner(i);
-			tevSources[13] = tevNextPreviousBuffer;
-
-			if (i < 4) {
-				if ((textureEnvUpdateBuffer & (0x100u << i)) != 0u) {
-					tevNextPreviousBuffer.rgb = tevSources[15].rgb;
-				}
-
-				if ((textureEnvUpdateBuffer & (0x1000u << i)) != 0u) {
-					tevNextPreviousBuffer.a = tevSources[15].a;
-				}
-			}
-		}
-
-		fragColour = tevSources[15];
-
-		if (tevUnimplementedSourceFlag) {
-			 // fragColour = vec4(1.0, 0.0, 1.0, 1.0);
-		}
-		// fragColour.rg = texture(u_tex_lighting_lut,vec2(gl_FragCoord.x/200.,float(int(gl_FragCoord.y/2)%24))).rr;
-
-
-		// Get original depth value by converting from [near, far] = [0, 1] to [-1, 1]
-		// We do this by converting to [0, 2] first and subtracting 1 to go to [-1, 1]
-		float z_over_w = gl_FragCoord.z * 2.0f - 1.0f;
-		float depth = z_over_w * u_depthScale + u_depthOffset;
-
-		if (!u_depthmapEnable) // Divide z by w if depthmap enable == 0 (ie using W-buffering)
-			depth /= gl_FragCoord.w;
-
-		// Write final fragment depth
-		gl_FragDepth = depth;
-
-		// Perform alpha test
-		uint alphaControl = readPicaReg(0x104);
-		if ((alphaControl & 1u) != 0u) { // Check if alpha test is on
-			uint func = (alphaControl >> 4u) & 7u;
-			float reference = float((alphaControl >> 8u) & 0xffu) / 255.0;
-			float alpha = fragColour.a;
-
-			switch (func) {
-				case 0: discard; // Never pass alpha test
-				case 1: break;          // Always pass alpha test
-				case 2:                 // Pass if equal
-					if (alpha != reference)
-						discard;
-					break;
-				case 3:                 // Pass if not equal
-					if (alpha == reference)
-						discard;
-					break;
-				case 4:                 // Pass if less than
-					if (alpha >= reference)
-						discard;
-					break;
-				case 5:                 // Pass if less than or equal
-					if (alpha > reference)
-						discard;
-					break;
-				case 6:                 // Pass if greater than
-					if (alpha <= reference)
-						discard;
-					break;
-				case 7:                 // Pass if greater than or equal
-					if (alpha < reference)
-						discard;
-					break;
-			}
-		}
-	}
-)";
-
-const char* displayVertexShader = R"(
-	#version 410 core
-	out vec2 UV;
-
-	void main() {
-		const vec4 positions[4] = vec4[](
-          vec4(-1.0, 1.0, 1.0, 1.0),    // Top-left
-          vec4(1.0, 1.0, 1.0, 1.0),     // Top-right
-          vec4(-1.0, -1.0, 1.0, 1.0),   // Bottom-left
-          vec4(1.0, -1.0, 1.0, 1.0)     // Bottom-right
-        );
-
-		// The 3DS displays both screens' framebuffer rotated 90 deg counter clockwise
-		// So we adjust our texcoords accordingly
-		const vec2 texcoords[4] = vec2[](
-				vec2(1.0, 1.0), // Top-right
-				vec2(1.0, 0.0), // Bottom-right
-				vec2(0.0, 1.0), // Top-left
-				vec2(0.0, 0.0)  // Bottom-left
-	);
-
-		gl_Position = positions[gl_VertexID];
-	UV = texcoords[gl_VertexID];
-	}
-)";
-
-const char* displayFragmentShader = R"(
-    #version 410 core
-    in vec2 UV;
-    out vec4 FragColor;
-
-    uniform sampler2D u_texture;
-    void main() {
-		FragColor = texture(u_texture, UV);
-    }
-)";
-
-void Renderer::reset() {
+void RendererGL::reset() {
 	depthBufferCache.reset();
 	colourBufferCache.reset();
 	textureCache.reset();
@@ -592,10 +32,10 @@ void Renderer::reset() {
 		const auto oldProgram = OpenGL::getProgram();
 
 		gl.useProgram(triangleProgram);
-		
-		oldDepthScale = -1.0; // Default depth scale to -1.0, which is what games typically use
-		oldDepthOffset = 0.0; // Default depth offset to 0
-		oldDepthmapEnable = false; // Enable w buffering
+
+		oldDepthScale = -1.0;       // Default depth scale to -1.0, which is what games typically use
+		oldDepthOffset = 0.0;       // Default depth offset to 0
+		oldDepthmapEnable = false;  // Enable w buffering
 
 		glUniform1f(depthScaleLoc, oldDepthScale);
 		glUniform1f(depthOffsetLoc, oldDepthOffset);
@@ -605,10 +45,17 @@ void Renderer::reset() {
 	}
 }
 
-void Renderer::initGraphicsContext() {
-	OpenGL::Shader vert(vertexShader, OpenGL::Vertex);
-	OpenGL::Shader frag(fragmentShader, OpenGL::Fragment);
-	triangleProgram.create({ vert, frag });
+void RendererGL::initGraphicsContext() {
+	gl.reset();
+
+	auto gl_resources = cmrc::RendererGL::get_filesystem();  // Resource filesystem for the library named in CMRC_DECLARE(RendererGL)
+
+	auto vertexShaderSource = gl_resources.open("opengl_vertex_shader.vert");      // Replaces the former in-source vertexShader string literal
+	auto fragmentShaderSource = gl_resources.open("opengl_fragment_shader.frag");  // Replaces the former in-source fragmentShader string literal
+
+	OpenGL::Shader vert({vertexShaderSource.begin(), vertexShaderSource.size()}, OpenGL::Vertex);  // Source passed as (pointer, length); presumably a string_view-like parameter - confirm
+	OpenGL::Shader frag({fragmentShaderSource.begin(), fragmentShaderSource.size()}, OpenGL::Fragment);  // Same (pointer, length) form for the fragment stage
+	triangleProgram.create({vert, frag});  // Build the triangle program from the two shader stages
 	gl.useProgram(triangleProgram);
 
 	textureEnvSourceLoc = OpenGL::uniformLocation(triangleProgram, "u_textureEnvSource");
@@ -628,12 +75,15 @@ void Renderer::initGraphicsContext() {
 	glUniform1i(OpenGL::uniformLocation(triangleProgram, "u_tex2"), 2);
 	glUniform1i(OpenGL::uniformLocation(triangleProgram, "u_tex_lighting_lut"), 3);
 
-	OpenGL::Shader vertDisplay(displayVertexShader, OpenGL::Vertex);
-	OpenGL::Shader fragDisplay(displayFragmentShader, OpenGL::Fragment);
-	displayProgram.create({ vertDisplay, fragDisplay });
+	auto displayVertexShaderSource = gl_resources.open("opengl_display.vert");
+	auto displayFragmentShaderSource = gl_resources.open("opengl_display.frag");
+
+	OpenGL::Shader vertDisplay({displayVertexShaderSource.begin(), displayVertexShaderSource.size()}, OpenGL::Vertex);
+	OpenGL::Shader fragDisplay({displayFragmentShaderSource.begin(), displayFragmentShaderSource.size()}, OpenGL::Fragment);
+	displayProgram.create({vertDisplay, fragDisplay});
 
 	gl.useProgram(displayProgram);
-	glUniform1i(OpenGL::uniformLocation(displayProgram, "u_texture"), 0); // Init sampler object
+	glUniform1i(OpenGL::uniformLocation(displayProgram, "u_texture"), 0);  // Init sampler object
 
 	vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW);
 	gl.bindVBO(vbo);
@@ -669,10 +119,10 @@ void Renderer::initGraphicsContext() {
 	dummyVAO.create();
 
 	// Create texture and framebuffer for the 3DS screen
-	const u32 screenTextureWidth = 400; // Top screen is 400 pixels wide, bottom is 320
-	const u32 screenTextureHeight = 2 * 240; // Both screens are 240 pixels tall
-	
-	glGenTextures(1,&lightLUTTextureArray);
+	const u32 screenTextureWidth = 400;       // Top screen is 400 pixels wide, bottom is 320
+	const u32 screenTextureHeight = 2 * 240;  // Both screens are 240 pixels tall
+
+	glGenTextures(1, &lightLUTTextureArray);
 
 	auto prevTexture = OpenGL::getTex2D();
 	screenTexture.create(screenTextureWidth, screenTextureHeight, GL_RGBA8);
@@ -684,8 +134,7 @@ void Renderer::initGraphicsContext() {
 	screenFramebuffer.createWithDrawTexture(screenTexture);
 	screenFramebuffer.bind(OpenGL::DrawAndReadFramebuffer);
 
-	if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE)
-		Helpers::panic("Incomplete framebuffer");
+	if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) Helpers::panic("Incomplete framebuffer");
 
 	// TODO: This should not clear the framebuffer contents. It should load them from VRAM.
 	GLint oldViewport[4];
@@ -699,19 +148,32 @@ void Renderer::initGraphicsContext() {
 }
 
 // Set up the OpenGL blending context to match the emulated PICA
-void Renderer::setupBlending() {
+void RendererGL::setupBlending() {
 	const bool blendingEnabled = (regs[PICA::InternalRegs::ColourOperation] & (1 << 8)) != 0;
-	
+
 	// Map of PICA blending equations to OpenGL blending equations. The unused blending equations are equivalent to equation 0 (add)
 	static constexpr std::array<GLenum, 8> blendingEquations = {
-		GL_FUNC_ADD, GL_FUNC_SUBTRACT, GL_FUNC_REVERSE_SUBTRACT, GL_MIN, GL_MAX, GL_FUNC_ADD, GL_FUNC_ADD, GL_FUNC_ADD
+		GL_FUNC_ADD, GL_FUNC_SUBTRACT, GL_FUNC_REVERSE_SUBTRACT, GL_MIN, GL_MAX, GL_FUNC_ADD, GL_FUNC_ADD, GL_FUNC_ADD,
 	};
-	
+
 	// Map of PICA blending funcs to OpenGL blending funcs. Func = 15 is undocumented and stubbed to GL_ONE for now
 	static constexpr std::array<GLenum, 16> blendingFuncs = {
-		GL_ZERO, GL_ONE, GL_SRC_COLOR, GL_ONE_MINUS_SRC_COLOR, GL_DST_COLOR, GL_ONE_MINUS_DST_COLOR, GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA,
-		GL_DST_ALPHA, GL_ONE_MINUS_DST_ALPHA, GL_CONSTANT_COLOR, GL_ONE_MINUS_CONSTANT_COLOR, GL_CONSTANT_ALPHA, GL_ONE_MINUS_CONSTANT_ALPHA,
-		GL_SRC_ALPHA_SATURATE, GL_ONE
+		GL_ZERO,
+		GL_ONE,
+		GL_SRC_COLOR,
+		GL_ONE_MINUS_SRC_COLOR,
+		GL_DST_COLOR,
+		GL_ONE_MINUS_DST_COLOR,
+		GL_SRC_ALPHA,
+		GL_ONE_MINUS_SRC_ALPHA,
+		GL_DST_ALPHA,
+		GL_ONE_MINUS_DST_ALPHA,
+		GL_CONSTANT_COLOR,
+		GL_ONE_MINUS_CONSTANT_COLOR,
+		GL_CONSTANT_ALPHA,
+		GL_ONE_MINUS_CONSTANT_ALPHA,
+		GL_SRC_ALPHA_SATURATE,
+		GL_ONE,
 	};
 
 	if (!blendingEnabled) {
@@ -743,13 +205,12 @@ void Renderer::setupBlending() {
 	}
 }
 
-void Renderer::setupTextureEnvState() {
+void RendererGL::setupTextureEnvState() {
 	// TODO: Only update uniforms when the TEV config changed. Use an UBO potentially.
 
 	static constexpr std::array<u32, 6> ioBases = {
-	  PICA::InternalRegs::TexEnv0Source, PICA::InternalRegs::TexEnv1Source,
-	  PICA::InternalRegs::TexEnv2Source, PICA::InternalRegs::TexEnv3Source,
-	  PICA::InternalRegs::TexEnv4Source, PICA::InternalRegs::TexEnv5Source
+		PICA::InternalRegs::TexEnv0Source, PICA::InternalRegs::TexEnv1Source, PICA::InternalRegs::TexEnv2Source,
+		PICA::InternalRegs::TexEnv3Source, PICA::InternalRegs::TexEnv4Source, PICA::InternalRegs::TexEnv5Source,
 	};
 
 	u32 textureEnvSourceRegs[6];
@@ -775,9 +236,11 @@ void Renderer::setupTextureEnvState() {
 	glUniform1uiv(textureEnvScaleLoc, 6, textureEnvScaleRegs);
 }
 
-void Renderer::bindTexturesToSlots() {
+void RendererGL::bindTexturesToSlots() {
 	static constexpr std::array<u32, 3> ioBases = {
-	  PICA::InternalRegs::Tex0BorderColor, PICA::InternalRegs::Tex1BorderColor, PICA::InternalRegs::Tex2BorderColor
+		PICA::InternalRegs::Tex0BorderColor,
+		PICA::InternalRegs::Tex1BorderColor,
+		PICA::InternalRegs::Tex2BorderColor,
 	};
 
 	for (int i = 0; i < 3; i++) {
@@ -805,13 +268,13 @@ void Renderer::bindTexturesToSlots() {
 	glActiveTexture(GL_TEXTURE0);
 }
 
-void Renderer::updateLightingLUT() {
+void RendererGL::updateLightingLUT() {
 	gpu.lightingLUTDirty = false;
-	std::array<u16, GPU::LightingLutSize> u16_lightinglut; 
-	
+	std::array<u16, GPU::LightingLutSize> u16_lightinglut;
+
 	for (int i = 0; i < gpu.lightingLUT.size(); i++) {
-		uint64_t value =  gpu.lightingLUT[i] & ((1 << 12) - 1);
-		u16_lightinglut[i] = value * 65535 / 4095; 
+		uint64_t value = gpu.lightingLUT[i] & ((1 << 12) - 1);
+		u16_lightinglut[i] = value * 65535 / 4095;
 	}
 
 	glActiveTexture(GL_TEXTURE0 + 3);
@@ -824,19 +287,22 @@ void Renderer::updateLightingLUT() {
 	glActiveTexture(GL_TEXTURE0);
 }
 
-void Renderer::drawVertices(PICA::PrimType primType, std::span<const Vertex> vertices) {
+void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> vertices) {
 	// The fourth type is meant to be "Geometry primitive". TODO: Find out what that is
 	static constexpr std::array<OpenGL::Primitives, 4> primTypes = {
-	  OpenGL::Triangle, OpenGL::TriangleStrip, OpenGL::TriangleFan, OpenGL::Triangle
+		OpenGL::Triangle,
+		OpenGL::TriangleStrip,
+		OpenGL::TriangleFan,
+		OpenGL::Triangle,
 	};
-	const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
 
+	const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
 	gl.disableScissor();
 	gl.bindVBO(vbo);
 	gl.bindVAO(vao);
 	gl.useProgram(triangleProgram);
 
-	OpenGL::enableClipPlane(0); // Clipping plane 0 is always enabled
+	OpenGL::enableClipPlane(0);  // Clipping plane 0 is always enabled
 	if (regs[PICA::InternalRegs::ClipEnable] & 1) {
 		OpenGL::enableClipPlane(1);
 	}
@@ -852,9 +318,7 @@ void Renderer::drawVertices(PICA::PrimType primType, std::span<const Vertex> ver
 	const int colourMask = getBits<8, 4>(depthControl);
 	gl.setColourMask(colourMask & 1, colourMask & 2, colourMask & 4, colourMask & 8);
 
-	static constexpr std::array<GLenum, 8> depthModes = {
-		GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL
-	};
+	static constexpr std::array<GLenum, 8> depthModes = {GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL};
 
 	const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
 	const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
@@ -865,7 +329,7 @@ void Renderer::drawVertices(PICA::PrimType primType, std::span<const Vertex> ver
 		oldDepthScale = depthScale;
 		glUniform1f(depthScaleLoc, depthScale);
 	}
-	
+
 	if (oldDepthOffset != depthOffset) {
 		oldDepthOffset = depthOffset;
 		glUniform1f(depthOffsetLoc, depthOffset);
@@ -917,7 +381,7 @@ void Renderer::drawVertices(PICA::PrimType primType, std::span<const Vertex> ver
 constexpr u32 topScreenBuffer = 0x1f000000;
 constexpr u32 bottomScreenBuffer = 0x1f05dc00;
 
-void Renderer::display() {
+void RendererGL::display() {
 	gl.disableScissor();
 
 	glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
@@ -925,7 +389,7 @@ void Renderer::display() {
 	glBlitFramebuffer(0, 0, 400, 480, 0, 0, 400, 480, GL_COLOR_BUFFER_BIT, GL_LINEAR);
 }
 
-void Renderer::clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) {
+void RendererGL::clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) {
 	return;
 	log("GPU: Clear buffer\nStart: %08X End: %08X\nValue: %08X Control: %08X\n", startAddress, endAddress, value, control);
 
@@ -947,10 +411,10 @@ void Renderer::clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 cont
 	OpenGL::clearColor();
 }
 
-OpenGL::Framebuffer Renderer::getColourFBO() {
-	//We construct a colour buffer object and see if our cache has any matching colour buffers in it
-	// If not, we allocate a texture & FBO for our framebuffer and store it in the cache 
-	ColourBuffer sampleBuffer(colourBufferLoc, colourBufferFormat, fbSize.x(), fbSize.y());
+OpenGL::Framebuffer RendererGL::getColourFBO() {
+	// We construct a colour buffer object and see if our cache has any matching colour buffers in it
+	// If not, we allocate a texture & FBO for our framebuffer and store it in the cache
+	ColourBuffer sampleBuffer(colourBufferLoc, colourBufferFormat, fbSize[0], fbSize[1]);
 	auto buffer = colourBufferCache.find(sampleBuffer);
 
 	if (buffer.has_value()) {
@@ -960,9 +424,9 @@ OpenGL::Framebuffer Renderer::getColourFBO() {
 	}
 }
 
-void Renderer::bindDepthBuffer() {
+void RendererGL::bindDepthBuffer() {
 	// Similar logic as the getColourFBO function
-	DepthBuffer sampleBuffer(depthBufferLoc, depthBufferFormat, fbSize.x(), fbSize.y());
+	DepthBuffer sampleBuffer(depthBufferLoc, depthBufferFormat, fbSize[0], fbSize[1]);
 	auto buffer = depthBufferCache.find(sampleBuffer);
 	GLuint tex;
 
@@ -979,14 +443,14 @@ void Renderer::bindDepthBuffer() {
 	glFramebufferTexture2D(GL_FRAMEBUFFER, attachment, GL_TEXTURE_2D, tex, 0);
 }
 
-OpenGL::Texture Renderer::getTexture(Texture& tex) {
+OpenGL::Texture RendererGL::getTexture(Texture& tex) {
 	// Similar logic as the getColourFBO/bindDepthBuffer functions
 	auto buffer = textureCache.find(tex);
 
 	if (buffer.has_value()) {
 		return buffer.value().get().texture;
 	} else {
-		const void* textureData = gpu.getPointerPhys<void*>(tex.location); // Get pointer to the texture data in 3DS memory
+		const auto textureData = std::span{gpu.getPointerPhys<u8>(tex.location), tex.sizeInBytes()};  // Get pointer to the texture data in 3DS memory
 		Texture& newTex = textureCache.add(tex);
 		newTex.decodeTexture(textureData);
 
@@ -994,7 +458,7 @@ OpenGL::Texture Renderer::getTexture(Texture& tex) {
 	}
 }
 
-void Renderer::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) {
+void RendererGL::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) {
 	const u32 inputWidth = inputSize & 0xffff;
 	const u32 inputGap = inputSize >> 16;
 
@@ -1022,21 +486,21 @@ void Renderer::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32
 	// Hack: Detect whether we are writing to the top or bottom screen by checking output gap and drawing to the proper part of the output texture
 	// We consider output gap == 320 to mean bottom, and anything else to mean top
 	if (outputGap == 320) {
-		OpenGL::setViewport(40, 0, 320, 240); // Bottom screen viewport
+		OpenGL::setViewport(40, 0, 320, 240);  // Bottom screen viewport
 	} else {
-		OpenGL::setViewport(0, 240, 400, 240); // Top screen viewport
+		OpenGL::setViewport(0, 240, 400, 240);  // Top screen viewport
 	}
 
-	OpenGL::draw(OpenGL::TriangleStrip, 4); // Actually draw our 3DS screen
+	OpenGL::draw(OpenGL::TriangleStrip, 4);  // Actually draw our 3DS screen
 }
 
-void Renderer::screenshot(const std::string& name) {
+void RendererGL::screenshot(const std::string& name) {
 	constexpr uint width = 400;
 	constexpr uint height = 2 * 240;
 
 	std::vector<uint8_t> pixels, flippedPixels;
-	pixels.resize(width *  height * 4);
-	flippedPixels.resize(pixels.size());;
+	pixels.resize(width * height * 4);
+	flippedPixels.resize(pixels.size());
 
 	OpenGL::bindScreenFramebuffer();
 	glReadPixels(0, 0, width, height, GL_BGRA, GL_UNSIGNED_BYTE, pixels.data());
@@ -1053,4 +517,4 @@ void Renderer::screenshot(const std::string& name) {
 	}
 
 	stbi_write_png(name.c_str(), width, height, 4, flippedPixels.data(), 0);
-}
\ No newline at end of file
+}
diff --git a/src/core/renderer_gl/textures.cpp b/src/core/renderer_gl/textures.cpp
index 819bf783..9e303fd9 100644
--- a/src/core/renderer_gl/textures.cpp
+++ b/src/core/renderer_gl/textures.cpp
@@ -112,12 +112,11 @@ u32 Texture::getSwizzledOffset_4bpp(u32 u, u32 v, u32 width) {
 // Get the texel at position (u, v)
 // fmt: format of the texture
 // data: texture data of the texture
-u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) {
+u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, std::span<const u8> data) {
     switch (fmt) {
         case PICA::TextureFmt::RGBA4: {
             u32 offset = getSwizzledOffset(u, v, size.u(), 2);
-            auto ptr = static_cast<const u8*>(data);
-            u16 texel = u16(ptr[offset]) | (u16(ptr[offset + 1]) << 8);
+            u16 texel = u16(data[offset]) | (u16(data[offset + 1]) << 8);
 
             u8 alpha = Colour::convert4To8Bit(getBits<0, 4, u8>(texel));
             u8 b = Colour::convert4To8Bit(getBits<4, 4, u8>(texel));
@@ -128,9 +127,8 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) {
         }
 
         case PICA::TextureFmt::RGBA5551: {
-            u32 offset = getSwizzledOffset(u, v, size.u(), 2);
-            auto ptr = static_cast<const u8*>(data);
-            u16 texel = u16(ptr[offset]) | (u16(ptr[offset + 1]) << 8);
+            const u32 offset = getSwizzledOffset(u, v, size.u(), 2);
+            const u16 texel = u16(data[offset]) | (u16(data[offset + 1]) << 8);
 
             u8 alpha = getBit<0>(texel) ? 0xff : 0;
             u8 b = Colour::convert5To8Bit(getBits<1, 5, u8>(texel));
@@ -141,56 +139,47 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) {
         }
 
         case PICA::TextureFmt::RGB565: {
-            u32 offset = getSwizzledOffset(u, v, size.u(), 2);
-            auto ptr = static_cast<const u8*>(data);
-            u16 texel = u16(ptr[offset]) | (u16(ptr[offset + 1]) << 8);
+            const u32 offset = getSwizzledOffset(u, v, size.u(), 2);
+            const u16 texel = u16(data[offset]) | (u16(data[offset + 1]) << 8);
 
-            u8 b = Colour::convert5To8Bit(getBits<0, 5, u8>(texel));
-            u8 g = Colour::convert6To8Bit(getBits<5, 6, u8>(texel));
-            u8 r = Colour::convert5To8Bit(getBits<11, 5, u8>(texel));
+            const u8 b = Colour::convert5To8Bit(getBits<0, 5, u8>(texel));
+            const u8 g = Colour::convert6To8Bit(getBits<5, 6, u8>(texel));
+            const u8 r = Colour::convert5To8Bit(getBits<11, 5, u8>(texel));
 
             return (0xff << 24) | (b << 16) | (g << 8) | r;
         }
 
         case PICA::TextureFmt::RG8: {
             u32 offset = getSwizzledOffset(u, v, size.u(), 2);
-            auto ptr = static_cast<const u8*>(data);
-
             constexpr u8 b = 0;
-            u8 g = ptr[offset];
-            u8 r = ptr[offset + 1];
+            const u8 g = data[offset];
+            const u8 r = data[offset + 1];
 
             return (0xff << 24) | (b << 16) | (g << 8) | r;
         }
 
         case PICA::TextureFmt::RGB8: {
-            u32 offset = getSwizzledOffset(u, v, size.u(), 3);
-            auto ptr = static_cast<const u8*>(data);
-
-            u8 b = ptr[offset];
-            u8 g = ptr[offset + 1];
-            u8 r = ptr[offset + 2];
+            const u32 offset = getSwizzledOffset(u, v, size.u(), 3);
+            const u8 b = data[offset];
+            const u8 g = data[offset + 1];
+            const u8 r = data[offset + 2];
 
             return (0xff << 24) | (b << 16) | (g << 8) | r;
         }
 
         case PICA::TextureFmt::RGBA8: {
-            u32 offset = getSwizzledOffset(u, v, size.u(), 4);
-            auto ptr = static_cast<const u8*>(data);
-
-            u8 alpha = ptr[offset];
-            u8 b = ptr[offset + 1];
-            u8 g = ptr[offset + 2];
-            u8 r = ptr[offset + 3];
+            const u32 offset = getSwizzledOffset(u, v, size.u(), 4);
+            const u8 alpha = data[offset];
+            const u8 b = data[offset + 1];
+            const u8 g = data[offset + 2];
+            const u8 r = data[offset + 3];
 
             return (alpha << 24) | (b << 16) | (g << 8) | r;
         }
 
         case PICA::TextureFmt::IA4: {
-            u32 offset = getSwizzledOffset(u, v, size.u(), 1);
-            auto ptr = static_cast<const u8*>(data);
-
-            const u8 texel = ptr[offset];
+            const u32 offset = getSwizzledOffset(u, v, size.u(), 1);
+            const u8 texel = data[offset];
             const u8 alpha = Colour::convert4To8Bit(texel & 0xf);
             const u8 intensity = Colour::convert4To8Bit(texel >> 4);
 
@@ -199,11 +188,10 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) {
         }
 
         case PICA::TextureFmt::A4: {
-            u32 offset = getSwizzledOffset_4bpp(u, v, size.u());
-            auto ptr = static_cast<const u8*>(data);
+            const u32 offset = getSwizzledOffset_4bpp(u, v, size.u());
 
             // For odd U coordinates, grab the top 4 bits, and the low 4 bits for even coordinates
-            u8 alpha = ptr[offset] >> ((u % 2) ? 4 : 0);
+            u8 alpha = data[offset] >> ((u % 2) ? 4 : 0);
             alpha = Colour::convert4To8Bit(getBits<0, 4>(alpha));
 
             // A8 sets RGB to 0
@@ -212,8 +200,7 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) {
 
         case PICA::TextureFmt::A8: {
             u32 offset = getSwizzledOffset(u, v, size.u(), 1);
-            auto ptr = static_cast<const u8*>(data);
-            const u8 alpha = ptr[offset];
+            const u8 alpha = data[offset];
 
             // A8 sets RGB to 0
             return (alpha << 24) | (0 << 16) | (0 << 8) | 0;
@@ -221,10 +208,9 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) {
 
         case PICA::TextureFmt::I4: {
             u32 offset = getSwizzledOffset_4bpp(u, v, size.u());
-            auto ptr = static_cast<const u8*>(data);
 
             // For odd U coordinates, grab the top 4 bits, and the low 4 bits for even coordinates
-            u8 intensity = ptr[offset] >> ((u % 2) ? 4 : 0);
+            u8 intensity = data[offset] >> ((u % 2) ? 4 : 0);
             intensity = Colour::convert4To8Bit(getBits<0, 4>(intensity));
 
             // Intensity formats just copy the intensity value to every colour channel
@@ -233,8 +219,7 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) {
 
         case PICA::TextureFmt::I8: {
             u32 offset = getSwizzledOffset(u, v, size.u(), 1);
-            auto ptr = static_cast<const u8*>(data);
-            const u8 intensity = ptr[offset];
+            const u8 intensity = data[offset];
 
             // Intensity formats just copy the intensity value to every colour channel
             return (0xff << 24) | (intensity << 16) | (intensity << 8) | intensity;
@@ -242,11 +227,10 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) {
 
         case PICA::TextureFmt::IA8: {
             u32 offset = getSwizzledOffset(u, v, size.u(), 2);
-            auto ptr = static_cast<const u8*>(data);
 
             // Same as I8 except each pixel gets its own alpha value too
-            const u8 alpha = ptr[offset];
-            const u8 intensity = ptr[offset + 1];
+            const u8 alpha = data[offset];
+            const u8 intensity = data[offset + 1];
             return (alpha << 24) | (intensity << 16) | (intensity << 8) | intensity;
         }
 
@@ -258,7 +242,7 @@ u32 Texture::decodeTexel(u32 u, u32 v, PICA::TextureFmt fmt, const void* data) {
     }
 }
 
-void Texture::decodeTexture(const void* data) {
+void Texture::decodeTexture(std::span<const u8> data) {
     std::vector<u32> decoded;
     decoded.reserve(u64(size.u()) * u64(size.v()));
 
@@ -272,4 +256,4 @@ void Texture::decodeTexture(const void* data) {
 
     texture.bind();
     glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, size.u(), size.v(), GL_RGBA, GL_UNSIGNED_BYTE, decoded.data());
-}
\ No newline at end of file
+}
diff --git a/src/core/renderer_null/renderer_null.cpp b/src/core/renderer_null/renderer_null.cpp
new file mode 100644
index 00000000..9df2ddeb
--- /dev/null
+++ b/src/core/renderer_null/renderer_null.cpp
@@ -0,0 +1,12 @@
+#include "renderer_null/renderer_null.hpp"
+
+RendererNull::RendererNull(GPU& gpu, const std::array<u32, regNum>& internalRegs) : Renderer(gpu, internalRegs) {}
+RendererNull::~RendererNull() {}
+// Null backend: every function below has an empty body and ignores its arguments
+void RendererNull::reset() {}
+void RendererNull::display() {}
+void RendererNull::initGraphicsContext() {}
+void RendererNull::clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) {}
+void RendererNull::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) {}
+void RendererNull::drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) {}
+void RendererNull::screenshot(const std::string& name) {}
diff --git a/src/emulator.cpp b/src/emulator.cpp
index 0ae60543..ce42d273 100644
--- a/src/emulator.cpp
+++ b/src/emulator.cpp
@@ -1,6 +1,8 @@
 #include "emulator.hpp"
 
-#include <stb_image_write.h>
+#ifdef PANDA3DS_ENABLE_OPENGL
+#include <glad/gl.h>
+#endif
 
 #ifdef _WIN32
 #include <windows.h>
@@ -12,7 +14,9 @@ __declspec(dllexport) DWORD AmdPowerXpressRequestHighPerformance = 1;
 }
 #endif
 
-Emulator::Emulator() : kernel(cpu, memory, gpu), cpu(memory, kernel), gpu(memory, gl, config), memory(cpu.getTicksRef()) {
+Emulator::Emulator()
+	: config(std::filesystem::current_path() / "config.toml"), kernel(cpu, memory, gpu), cpu(memory, kernel), gpu(memory, config),
+	  memory(cpu.getTicksRef()), cheats(memory, kernel.getServiceManager().getHID()) {
 	if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_EVENTS) < 0) {
 		Helpers::panic("Failed to initialize SDL2");
 	}
@@ -23,25 +27,29 @@ Emulator::Emulator() : kernel(cpu, memory, gpu), cpu(memory, kernel), gpu(memory
 		Helpers::warn("Failed to initialize SDL2 GameController: %s", SDL_GetError());
 	}
 
-	// Request OpenGL 4.1 Core (Max available on MacOS)
-	// MacOS gets mad if we don't explicitly demand a core profile
-	SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
-	SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
-	SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 1);
-	window = SDL_CreateWindow("Alber", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, width, height, SDL_WINDOW_OPENGL);
+#ifdef PANDA3DS_ENABLE_OPENGL
+	if (config.rendererType == RendererType::OpenGL) {
+		// Request OpenGL 4.1 Core (Max available on MacOS)
+		// MacOS gets mad if we don't explicitly demand a core profile
+		SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
+		SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
+		SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 1);
+		window = SDL_CreateWindow("Alber", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, width, height, SDL_WINDOW_OPENGL);
 
-	if (window == nullptr) {
-		Helpers::panic("Window creation failed: %s", SDL_GetError());
-	}
+		if (window == nullptr) {
+			Helpers::panic("Window creation failed: %s", SDL_GetError());
+		}
 
-	glContext = SDL_GL_CreateContext(window);
-	if (glContext == nullptr) {
-		Helpers::panic("OpenGL context creation failed: %s", SDL_GetError());
-	}
+		glContext = SDL_GL_CreateContext(window);
+		if (glContext == nullptr) {
+			Helpers::panic("OpenGL context creation failed: %s", SDL_GetError());
+		}
 
-	if (!gladLoadGL(reinterpret_cast<GLADloadfunc>(SDL_GL_GetProcAddress))) {
-		Helpers::panic("OpenGL init failed: %s", SDL_GetError());
+		if (!gladLoadGL(reinterpret_cast<GLADloadfunc>(SDL_GL_GetProcAddress))) {
+			Helpers::panic("OpenGL init failed: %s", SDL_GetError());
+		}
 	}
+#endif
 
 	if (SDL_WasInit(SDL_INIT_GAMECONTROLLER)) {
 		gameController = SDL_GameControllerOpen(0);
@@ -52,7 +60,6 @@ Emulator::Emulator() : kernel(cpu, memory, gpu), cpu(memory, kernel), gpu(memory
 		}
 	}
 
-	config.load(std::filesystem::current_path() / "config.toml");
 	reset(ReloadOption::NoReload);
 }
 
@@ -69,6 +76,12 @@ void Emulator::reset(ReloadOption reload) {
 	// Otherwise resetting the kernel or cpu might nuke them
 	cpu.setReg(13, VirtualAddrs::StackTop);  // Set initial SP
 
+	// We're resetting without reloading the ROM, so discard any loaded cheats
+	if (reload == ReloadOption::NoReload) {
+		haveCheats = false;
+		cheats.reset();
+	}
+
 	// If a ROM is active and we reset, with the reload option enabled then reload it.
 	// This is necessary to set up stack, executable memory, .data/.rodata/.bss all over again
 	if (reload == ReloadOption::Reload && romType != ROMType::None && romPath.has_value()) {
@@ -91,19 +104,8 @@ void Emulator::run() {
 #endif
 
 	while (running) {
-		ServiceManager& srv = kernel.getServiceManager();
-
-		if (romType != ROMType::None) {
-#ifdef PANDA3DS_ENABLE_HTTP_SERVER
-			pollHttpServer();
-#endif
-			runFrame();     // Run 1 frame of instructions
-			gpu.display();  // Display graphics
-
-			// Send VBlank interrupts
-			srv.sendGPUInterrupt(GPUInterrupt::VBlank0);
-			srv.sendGPUInterrupt(GPUInterrupt::VBlank1);
-		}
+		runFrame();
+		HIDService& hid = kernel.getServiceManager().getHID();
 
 		SDL_Event event;
 		while (SDL_PollEvent(&event)) {
@@ -119,41 +121,41 @@ void Emulator::run() {
 					if (romType == ROMType::None) break;
 
 					switch (event.key.keysym.sym) {
-						case SDLK_l: srv.pressKey(Keys::A); break;
-						case SDLK_k: srv.pressKey(Keys::B); break;
-						case SDLK_o: srv.pressKey(Keys::X); break;
-						case SDLK_i: srv.pressKey(Keys::Y); break;
+						case SDLK_l: hid.pressKey(Keys::A); break;
+						case SDLK_k: hid.pressKey(Keys::B); break;
+						case SDLK_o: hid.pressKey(Keys::X); break;
+						case SDLK_i: hid.pressKey(Keys::Y); break;
 
-						case SDLK_q: srv.pressKey(Keys::L); break;
-						case SDLK_p: srv.pressKey(Keys::R); break;
+						case SDLK_q: hid.pressKey(Keys::L); break;
+						case SDLK_p: hid.pressKey(Keys::R); break;
 
-						case SDLK_RIGHT: srv.pressKey(Keys::Right); break;
-						case SDLK_LEFT: srv.pressKey(Keys::Left); break;
-						case SDLK_UP: srv.pressKey(Keys::Up); break;
-						case SDLK_DOWN: srv.pressKey(Keys::Down); break;
+						case SDLK_RIGHT: hid.pressKey(Keys::Right); break;
+						case SDLK_LEFT: hid.pressKey(Keys::Left); break;
+						case SDLK_UP: hid.pressKey(Keys::Up); break;
+						case SDLK_DOWN: hid.pressKey(Keys::Down); break;
 
 						case SDLK_w:
-							srv.setCirclepadY(0x9C);
+							hid.setCirclepadY(0x9C);
 							keyboardAnalogY = true;
 							break;
 
 						case SDLK_a:
-							srv.setCirclepadX(-0x9C);
+							hid.setCirclepadX(-0x9C);
 							keyboardAnalogX = true;
 							break;
 
 						case SDLK_s:
-							srv.setCirclepadY(-0x9C);
+							hid.setCirclepadY(-0x9C);
 							keyboardAnalogY = true;
 							break;
 
 						case SDLK_d:
-							srv.setCirclepadX(0x9C);
+							hid.setCirclepadX(0x9C);
 							keyboardAnalogX = true;
 							break;
 
-						case SDLK_RETURN: srv.pressKey(Keys::Start); break;
-						case SDLK_BACKSPACE: srv.pressKey(Keys::Select); break;
+						case SDLK_RETURN: hid.pressKey(Keys::Start); break;
+						case SDLK_BACKSPACE: hid.pressKey(Keys::Select); break;
 					}
 					break;
 
@@ -161,34 +163,34 @@ void Emulator::run() {
 					if (romType == ROMType::None) break;
 
 					switch (event.key.keysym.sym) {
-						case SDLK_l: srv.releaseKey(Keys::A); break;
-						case SDLK_k: srv.releaseKey(Keys::B); break;
-						case SDLK_o: srv.releaseKey(Keys::X); break;
-						case SDLK_i: srv.releaseKey(Keys::Y); break;
+						case SDLK_l: hid.releaseKey(Keys::A); break;
+						case SDLK_k: hid.releaseKey(Keys::B); break;
+						case SDLK_o: hid.releaseKey(Keys::X); break;
+						case SDLK_i: hid.releaseKey(Keys::Y); break;
 
-						case SDLK_q: srv.releaseKey(Keys::L); break;
-						case SDLK_p: srv.releaseKey(Keys::R); break;
+						case SDLK_q: hid.releaseKey(Keys::L); break;
+						case SDLK_p: hid.releaseKey(Keys::R); break;
 
-						case SDLK_RIGHT: srv.releaseKey(Keys::Right); break;
-						case SDLK_LEFT: srv.releaseKey(Keys::Left); break;
-						case SDLK_UP: srv.releaseKey(Keys::Up); break;
-						case SDLK_DOWN: srv.releaseKey(Keys::Down); break;
+						case SDLK_RIGHT: hid.releaseKey(Keys::Right); break;
+						case SDLK_LEFT: hid.releaseKey(Keys::Left); break;
+						case SDLK_UP: hid.releaseKey(Keys::Up); break;
+						case SDLK_DOWN: hid.releaseKey(Keys::Down); break;
 
 						// Err this is probably not ideal
 						case SDLK_w:
 						case SDLK_s:
-							srv.setCirclepadY(0);
+							hid.setCirclepadY(0);
 							keyboardAnalogY = false;
 							break;
 
 						case SDLK_a:
 						case SDLK_d:
-							srv.setCirclepadX(0);
+							hid.setCirclepadX(0);
 							keyboardAnalogX = false;
 							break;
 
-						case SDLK_RETURN: srv.releaseKey(Keys::Start); break;
-						case SDLK_BACKSPACE: srv.releaseKey(Keys::Select); break;
+						case SDLK_RETURN: hid.releaseKey(Keys::Start); break;
+						case SDLK_BACKSPACE: hid.releaseKey(Keys::Select); break;
 					}
 					break;
 
@@ -205,9 +207,9 @@ void Emulator::run() {
 							u16 x_converted = static_cast<u16>(x) - 40;
 							u16 y_converted = static_cast<u16>(y) - 240;
 
-							srv.setTouchScreenPress(x_converted, y_converted);
+							hid.setTouchScreenPress(x_converted, y_converted);
 						} else {
-							srv.releaseTouchScreen();
+							hid.releaseTouchScreen();
 						}
 					} else if (event.button.button == SDL_BUTTON_RIGHT) {
 						holdingRightClick = true;
@@ -219,7 +221,7 @@ void Emulator::run() {
 					if (romType == ROMType::None) break;
 
 					if (event.button.button == SDL_BUTTON_LEFT) {
-						srv.releaseTouchScreen();
+						hid.releaseTouchScreen();
 					} else if (event.button.button == SDL_BUTTON_RIGHT) {
 						holdingRightClick = false;
 					}
@@ -262,9 +264,9 @@ void Emulator::run() {
 
 					if (key != 0) {
 						if (event.cbutton.state == SDL_PRESSED) {
-							srv.pressKey(key);
+							hid.pressKey(key);
 						} else {
-							srv.releaseKey(key);
+							hid.releaseKey(key);
 						}
 					}
 					break;
@@ -283,8 +285,8 @@ void Emulator::run() {
 					// So up until then, we will set the gyroscope euler angles to fixed values based on the direction of the relative motion
 					const s32 roll = motionX > 0 ? 0x7f : -0x7f;
 					const s32 pitch = motionY > 0 ? 0x7f : -0x7f;
-					srv.setRoll(roll);
-					srv.setPitch(pitch);
+					hid.setRoll(roll);
+					hid.setPitch(pitch);
 					break;
 				}
 
@@ -311,19 +313,19 @@ void Emulator::run() {
 
 				// Avoid overriding the keyboard's circlepad input
 				if (abs(stickX) < deadzone && !keyboardAnalogX) {
-					srv.setCirclepadX(0);
+					hid.setCirclepadX(0);
 				} else {
-					srv.setCirclepadX(stickX / div);
+					hid.setCirclepadX(stickX / div);
 				}
 
 				if (abs(stickY) < deadzone && !keyboardAnalogY) {
-					srv.setCirclepadY(0);
+					hid.setCirclepadY(0);
 				} else {
-					srv.setCirclepadY(-(stickY / div));
+					hid.setCirclepadY(-(stickY / div));
 				}
 			}
 
-			srv.updateInputs(cpu.getTicks());
+			hid.updateInputs(cpu.getTicks());
 		}
 
 		// Update inputs in the HID module
@@ -331,7 +333,24 @@ void Emulator::run() {
 	}
 }
 
-void Emulator::runFrame() { cpu.runFrame(); }
+void Emulator::runFrame() {
+	// Without a loaded ROM there is nothing to execute, draw or signal
+	if (romType == ROMType::None) return;
+
+#ifdef PANDA3DS_ENABLE_HTTP_SERVER
+	pollHttpServer();
+#endif
+	cpu.runFrame();  // Execute one frame's worth of CPU instructions
+	gpu.display();   // Present the rendered output
+
+	// Raise both VBlank interrupts once the frame has been displayed
+	ServiceManager& serviceManager = kernel.getServiceManager();
+	serviceManager.sendGPUInterrupt(GPUInterrupt::VBlank0);
+	serviceManager.sendGPUInterrupt(GPUInterrupt::VBlank1);
+
+	// Cheats run last in the frame; the branch is hinted as unlikely to be taken
+	if (haveCheats) [[unlikely]] cheats.run();
+}
 
 bool Emulator::loadROM(const std::filesystem::path& path) {
 	// Reset the emulator if we've already loaded a ROM
@@ -427,15 +446,13 @@ bool Emulator::loadELF(std::ifstream& file) {
 }
 
 // Reset our graphics context and initialize the GPU's graphics context
-void Emulator::initGraphicsContext() {
-	gl.reset();  // TODO (For when we have multiple backends): Only do this if we are using OpenGL
-	gpu.initGraphicsContext();
-}
+void Emulator::initGraphicsContext() { gpu.initGraphicsContext(); }
 
 #ifdef PANDA3DS_ENABLE_HTTP_SERVER
 void Emulator::pollHttpServer() {
 	std::scoped_lock lock(httpServer.actionMutex);
-	ServiceManager& srv = kernel.getServiceManager();
+
+	HIDService& hid = kernel.getServiceManager().getHID();
 
 	if (httpServer.pendingAction) {
 		switch (httpServer.action) {
@@ -443,14 +460,14 @@ void Emulator::pollHttpServer() {
 
 			case HttpAction::PressKey:
 				if (httpServer.pendingKey != 0) {
-					srv.pressKey(httpServer.pendingKey);
+					hid.pressKey(httpServer.pendingKey);
 					httpServer.pendingKey = 0;
 				}
 				break;
 
 			case HttpAction::ReleaseKey:
 				if (httpServer.pendingKey != 0) {
-					srv.releaseKey(httpServer.pendingKey);
+					hid.releaseKey(httpServer.pendingKey);
 					httpServer.pendingKey = 0;
 				}
 				break;
diff --git a/src/host_shaders/opengl_display.frag b/src/host_shaders/opengl_display.frag
new file mode 100644
index 00000000..612671c8
--- /dev/null
+++ b/src/host_shaders/opengl_display.frag
@@ -0,0 +1,8 @@
+#version 410 core
+in vec2 UV;  // Interpolated texture coordinate for this fragment
+out vec4 FragColor;
+
+uniform sampler2D u_texture;  // Texture sampled to produce the output colour
+void main() {
+	FragColor = texture(u_texture, UV);  // Plain copy: no tinting or post-processing
+}
diff --git a/src/host_shaders/opengl_display.vert b/src/host_shaders/opengl_display.vert
new file mode 100644
index 00000000..990e2f80
--- /dev/null
+++ b/src/host_shaders/opengl_display.vert
@@ -0,0 +1,23 @@
+#version 410 core
+out vec2 UV;
+// Builds a quad spanning all of clip space from gl_VertexID alone; no vertex attributes are read
+void main() {
+	const vec4 positions[4] = vec4[](
+		vec4(-1.0, 1.0, 1.0, 1.0),   // Top-left
+		vec4(1.0, 1.0, 1.0, 1.0),    // Top-right
+		vec4(-1.0, -1.0, 1.0, 1.0),  // Bottom-left
+		vec4(1.0, -1.0, 1.0, 1.0)    // Bottom-right
+	);
+
+	// The 3DS displays both screens' framebuffer rotated 90 deg counter clockwise
+	// So we adjust our texcoords accordingly
+	const vec2 texcoords[4] = vec2[](
+		vec2(1.0, 1.0),  // Top-right
+		vec2(1.0, 0.0),  // Bottom-right
+		vec2(0.0, 1.0),  // Top-left
+		vec2(0.0, 0.0)   // Bottom-left
+	);
+
+	gl_Position = positions[gl_VertexID];
+	UV = texcoords[gl_VertexID];
+}
diff --git a/src/host_shaders/opengl_fragment_shader.frag b/src/host_shaders/opengl_fragment_shader.frag
new file mode 100644
index 00000000..f6461094
--- /dev/null
+++ b/src/host_shaders/opengl_fragment_shader.frag
@@ -0,0 +1,409 @@
+#version 410 core
+
+in vec3 v_tangent;
+in vec3 v_normal;
+in vec3 v_bitangent;
+in vec4 v_colour;
+in vec3 v_texcoord0;  // Only .xy is sampled in main()
+in vec2 v_texcoord1;
+in vec3 v_view;
+in vec2 v_texcoord2;
+flat in vec4 v_textureEnvColor[6];     // One constant colour per TEV stage
+flat in vec4 v_textureEnvBufferColor;  // Starting value of tevNextPreviousBuffer
+
+out vec4 fragColour;
+
+// TEV uniforms
+uniform uint u_textureEnvSource[6];    // Per stage: three 4-bit colour source IDs from bit 0, three 4-bit alpha source IDs from bit 16
+uniform uint u_textureEnvOperand[6];   // Per stage: three colour operands (4-bit stride from bit 0), three 3-bit alpha operands (4-bit stride from bit 12)
+uniform uint u_textureEnvCombiner[6];  // Per stage: colour combine mode in bits 0-3, alpha combine mode in bits 16-19
+uniform uint u_textureEnvScale[6];     // Per stage: colour shift in bits 0-1, alpha shift in bits 16-17
+
+// Depth control uniforms
+uniform float u_depthScale;
+uniform float u_depthOffset;
+uniform bool u_depthmapEnable;  // When false, main() additionally divides the depth by gl_FragCoord.w
+
+uniform sampler2D u_tex0;
+uniform sampler2D u_tex1;
+uniform sampler2D u_tex2;
+uniform sampler1DArray u_tex_lighting_lut;  // Sampled by lutLookup(): x is the input value, the layer selects the table
+
+uniform uint u_picaRegs[0x200 - 0x48];  // Read through readPicaReg()
+
+// Helper so that the implementation of u_picaRegs can be changed later
+uint readPicaReg(uint reg_addr) { return u_picaRegs[reg_addr - 0x48]; }  // u_picaRegs[0] holds register 0x48
+
+vec4 tevSources[16];  // Indexed by 4-bit source ID; main() writes slots 0-2 and 13-15, and 3-5 when the matching texture unit is enabled
+vec4 tevNextPreviousBuffer;  // Value that slot 13 takes once the current stage has run
+bool tevUnimplementedSourceFlag = false;  // Set by tevFetchSource() when a source ID in 6-12 is requested
+
+// OpenGL ES 1.1 reference pages for TEVs (this is what the PICA200 implements):
+// https://registry.khronos.org/OpenGL-Refpages/es1.1/xhtml/glTexEnv.xml
+
+vec4 tevFetchSource(uint src_id) {
+	// Source IDs 6 through 12 have no slot written by main(); remember that one of them was requested
+	bool sourceIsUnimplemented = (src_id > 5u) && (src_id <= 12u);
+	if (sourceIsUnimplemented) tevUnimplementedSourceFlag = true;
+
+	return tevSources[src_id];
+}
+
+vec4 tevGetColorAndAlphaSource(int tev_id, int src_id) {
+	// Resolves source slot src_id (0-2) of TEV stage tev_id: fetches the selected colour and alpha sources, then applies their operands
+	vec4 result = vec4(0.0);  // Defined fallback: colour operands 6, 7, 10, 11, 14 and 15 have no case below and used to return uninitialized rgb
+	vec4 colorSource = tevFetchSource((u_textureEnvSource[tev_id] >> (src_id * 4)) & 15u);
+	vec4 alphaSource = tevFetchSource((u_textureEnvSource[tev_id] >> (src_id * 4 + 16)) & 15u);
+
+	uint colorOperand = (u_textureEnvOperand[tev_id] >> (src_id * 4)) & 15u;
+	uint alphaOperand = (u_textureEnvOperand[tev_id] >> (12 + src_id * 4)) & 7u;
+
+	// TODO: figure out what the undocumented values do
+	switch (colorOperand) {
+		case 0u: result.rgb = colorSource.rgb; break;             // Source color
+		case 1u: result.rgb = 1.0 - colorSource.rgb; break;       // One minus source color
+		case 2u: result.rgb = vec3(colorSource.a); break;         // Source alpha
+		case 3u: result.rgb = vec3(1.0 - colorSource.a); break;   // One minus source alpha
+		case 4u: result.rgb = vec3(colorSource.r); break;         // Source red
+		case 5u: result.rgb = vec3(1.0 - colorSource.r); break;   // One minus source red
+		case 8u: result.rgb = vec3(colorSource.g); break;         // Source green
+		case 9u: result.rgb = vec3(1.0 - colorSource.g); break;   // One minus source green
+		case 12u: result.rgb = vec3(colorSource.b); break;        // Source blue
+		case 13u: result.rgb = vec3(1.0 - colorSource.b); break;  // One minus source blue
+		default: break;
+	}
+
+	// The alpha operand is masked to 3 bits, so every possible value has a case here
+	switch (alphaOperand) {
+		case 0u: result.a = alphaSource.a; break;        // Source alpha
+		case 1u: result.a = 1.0 - alphaSource.a; break;  // One minus source alpha
+		case 2u: result.a = alphaSource.r; break;        // Source red
+		case 3u: result.a = 1.0 - alphaSource.r; break;  // One minus source red
+		case 4u: result.a = alphaSource.g; break;        // Source green
+		case 5u: result.a = 1.0 - alphaSource.g; break;  // One minus source green
+		case 6u: result.a = alphaSource.b; break;        // Source blue
+		case 7u: result.a = 1.0 - alphaSource.b; break;  // One minus source blue
+		default: break;
+	}
+
+	return result;
+}
+
+vec4 tevCalculateCombiner(int tev_id) {
+	// Evaluates TEV stage tev_id: resolves its three operands, combines colour and alpha, then applies the per-channel scale
+	vec4 argA = tevGetColorAndAlphaSource(tev_id, 0);
+	vec4 argB = tevGetColorAndAlphaSource(tev_id, 1);
+	vec4 argC = tevGetColorAndAlphaSource(tev_id, 2);
+	uint combinerWord = u_textureEnvCombiner[tev_id];
+	uint rgbMode = combinerWord & 15u;
+	uint alphaMode = (combinerWord >> 16) & 15u;
+	vec4 combined = vec4(1.0);  // Channels stay at 1.0 when their mode has no case below
+
+	// TODO: figure out what the undocumented values do
+	switch (rgbMode) {
+		case 0u: combined.rgb = argA.rgb; break;  // Replace
+		case 1u: combined.rgb = argA.rgb * argB.rgb; break;  // Modulate
+		case 2u: combined.rgb = min(vec3(1.0), argA.rgb + argB.rgb); break;  // Add
+		case 3u: combined.rgb = clamp(argA.rgb + argB.rgb - 0.5, 0.0, 1.0); break;  // Add signed
+		case 4u: combined.rgb = mix(argB.rgb, argA.rgb, argC.rgb); break;  // Interpolate
+		case 5u: combined.rgb = max(argA.rgb - argB.rgb, 0.0); break;  // Subtract
+		case 6u: combined.rgb = vec3(4.0 * dot(argA.rgb - 0.5, argB.rgb - 0.5)); break;  // Dot3 RGB
+		case 7u: combined = vec4(4.0 * dot(argA.rgb - 0.5, argB.rgb - 0.5)); break;  // Dot3 RGBA
+		case 8u: combined.rgb = min(argA.rgb * argB.rgb + argC.rgb, 1.0); break;  // Multiply then add
+		case 9u: combined.rgb = min((argA.rgb + argB.rgb) * argC.rgb, 1.0); break;  // Add then multiply
+		default: break;
+	}
+
+	if (rgbMode != 7u) {  // "Dot3 RGBA" has already written alpha through the colour combiner
+		// TODO: figure out what the undocumented values do
+		// TODO: test if the alpha combiner supports all the same modes as the color combiner.
+		switch (alphaMode) {
+			case 0u: combined.a = argA.a; break;  // Replace
+			case 1u: combined.a = argA.a * argB.a; break;  // Modulate
+			case 2u: combined.a = min(1.0, argA.a + argB.a); break;  // Add
+			case 3u: combined.a = clamp(argA.a + argB.a - 0.5, 0.0, 1.0); break;  // Add signed
+			case 4u: combined.a = mix(argB.a, argA.a, argC.a); break;  // Interpolate
+			case 5u: combined.a = max(0.0, argA.a - argB.a); break;  // Subtract
+			case 8u: combined.a = min(1.0, argA.a * argB.a + argC.a); break;  // Multiply then add
+			case 9u: combined.a = min(1.0, (argA.a + argB.a) * argC.a); break;  // Add then multiply
+			default: break;
+		}
+	}
+	// Scale fields are 2-bit shift amounts, so each multiplier is 1, 2, 4 or 8
+	combined.rgb *= float(1 << (u_textureEnvScale[tev_id] & 3u));
+	combined.a *= float(1 << ((u_textureEnvScale[tev_id] >> 16) & 3u));
+
+	return combined;
+}
+
+#define D0_LUT 0u  // These seven values index the d[] array in calcLighting() and are passed to lutLookup() as `lut`
+#define D1_LUT 1u
+#define SP_LUT 2u  // lutLookup() replaces this one with a per-light table index
+#define FR_LUT 3u
+#define RB_LUT 4u
+#define RG_LUT 5u
+#define RR_LUT 6u
+
+float lutLookup(uint lut, uint light, float value) {  // Samples lighting table `lut` (a *_LUT id) for light `light` at coordinate `value`
+	if (lut == SP_LUT) lut = light + 8;  // Spotlight tables are per light. Checked first so that only a genuine SP request is redirected
+	else if (lut >= FR_LUT && lut <= RR_LUT) lut -= 1;  // FR..RR sit one layer lower than their id. Previously FR became 2 (== SP_LUT) and was then sent to the spotlight table
+	return texture(u_tex_lighting_lut, vec2(value, lut)).r;  // NOTE(review): assumes FR/RB/RG/RR occupy layers 2-5 of the array - confirm against the upload code
+}
+
+vec3 regToColor(uint reg) {
+	// Red, green and blue are 8-bit fields starting at bits 20, 10 and 0 respectively; all other bits are ignored
+	uvec3 rgb8 = uvec3(bitfieldExtract(reg, 20, 8), bitfieldExtract(reg, 10, 8), bitfieldExtract(reg, 0, 8));
+
+	return (1.0 / 255.0) * vec3(rgb8);  // Rescale from [0...255] to [0.0...1.0]
+}
+
+// Convert a float with 1 sign bit, E exponent bits and M mantissa bits (held in the low bits of hex) to an f32
+float decodeFP(uint hex, uint E, uint M) {
+	uint width = M + E + 1u;  // Total bit count of the source format
+	uint bias = 128u - (1u << (E - 1u));  // Amount added to a source exponent to re-bias it for f32, i.e. 127 - (2^(E-1) - 1)
+	uint exponent = (hex >> M) & ((1u << E) - 1u);
+	uint mantissa = hex & ((1u << M) - 1u);
+	uint sign = (hex >> (E + M)) << 31u;  // Sign bit moved up to bit 31
+
+	if ((hex & ((1u << (width - 1u)) - 1u)) != 0) {  // Some exponent or mantissa bit is set, so the value is not +/-0
+		if (exponent == (1u << E) - 1u)
+			exponent = 255u;  // An all-ones exponent maps to f32's all-ones exponent
+		else
+			exponent += bias;  // Also taken when the exponent field is 0 but the mantissa is not
+		hex = sign | (mantissa << (23u - M)) | (exponent << 23u);  // Mantissa is left-aligned within the 23-bit f32 field
+	} else {
+		hex = sign;  // Signed zero
+	}
+
+	return uintBitsToFloat(hex);
+}
+
+// Implements the following algorithm: https://mathb.in/26766
+void calcLighting(out vec4 primary_color, out vec4 secondary_color) {
+	// Quaternions describe a transformation from surface-local space to eye space.
+	// In surface-local space, by definition (and up to permutation) the normal vector is (0,0,1),
+	// the tangent vector is (1,0,0), and the bitangent vector is (0,1,0).
+	vec3 normal = normalize(v_normal);
+	vec3 tangent = normalize(v_tangent);  // tangent and bitangent are not referenced again in this function
+	vec3 bitangent = normalize(v_bitangent);
+	vec3 view = normalize(v_view);
+
+	uint GPUREG_LIGHTING_ENABLE = readPicaReg(0x008F);
+	if (bitfieldExtract(GPUREG_LIGHTING_ENABLE, 0, 1) == 0) {  // Lighting off: both outputs become opaque white
+		primary_color = secondary_color = vec4(1.0);
+		return;
+	}
+
+	uint GPUREG_LIGHTING_AMBIENT = readPicaReg(0x01C0);
+	uint GPUREG_LIGHTING_NUM_LIGHTS = (readPicaReg(0x01C2) & 0x7u) + 1;  // The 3-bit field stores the light count minus one
+	uint GPUREG_LIGHTING_LIGHT_PERMUTATION = readPicaReg(0x01D9);
+
+	primary_color = vec4(vec3(0.0), 1.0);
+	secondary_color = vec4(vec3(0.0), 1.0);
+
+	primary_color.rgb += regToColor(GPUREG_LIGHTING_AMBIENT);
+
+	uint GPUREG_LIGHTING_LUTINPUT_ABS = readPicaReg(0x01D0);
+	uint GPUREG_LIGHTING_LUTINPUT_SELECT = readPicaReg(0x01D1);
+	uint GPUREG_LIGHTING_CONFIG0 = readPicaReg(0x01C3);
+	uint GPUREG_LIGHTING_CONFIG1 = readPicaReg(0x01C4);
+	uint GPUREG_LIGHTING_LUTINPUT_SCALE = readPicaReg(0x01D2);
+	float d[7];  // Per-LUT factors indexed by the *_LUT defines; rewritten for every light
+
+	bool error_unimpl = false;
+
+	for (uint i = 0; i < GPUREG_LIGHTING_NUM_LIGHTS; i++) {
+		uint light_id = bitfieldExtract(GPUREG_LIGHTING_LIGHT_PERMUTATION, int(i * 3), 3);  // 3 bits per slot
+
+		uint GPUREG_LIGHTi_SPECULAR0 = readPicaReg(0x0140 + 0x10 * light_id);  // Each light's register block is 0x10 wide
+		uint GPUREG_LIGHTi_SPECULAR1 = readPicaReg(0x0141 + 0x10 * light_id);
+		uint GPUREG_LIGHTi_DIFFUSE = readPicaReg(0x0142 + 0x10 * light_id);
+		uint GPUREG_LIGHTi_AMBIENT = readPicaReg(0x0143 + 0x10 * light_id);
+		uint GPUREG_LIGHTi_VECTOR_LOW = readPicaReg(0x0144 + 0x10 * light_id);
+		uint GPUREG_LIGHTi_VECTOR_HIGH = readPicaReg(0x0145 + 0x10 * light_id);
+		uint GPUREG_LIGHTi_CONFIG = readPicaReg(0x0149 + 0x10 * light_id);
+
+		vec3 light_vector = normalize(vec3(
+			decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_LOW, 0, 16), 5, 10), decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_LOW, 16, 16), 5, 10),
+			decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_HIGH, 0, 16), 5, 10)
+		));
+
+		// Positional Light
+		if (bitfieldExtract(GPUREG_LIGHTi_CONFIG, 0, 1) == 0) error_unimpl = true;
+
+		vec3 half_vector = normalize(normalize(light_vector) + view);  // light_vector was already normalized above
+
+		for (int c = 0; c < 7; c++) {
+			if (bitfieldExtract(GPUREG_LIGHTING_CONFIG1, 16 + c, 1) == 0) {  // Bit clear: LUT c is looked up; otherwise d[c] is 1.0
+				uint scale_id = bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_SCALE, c * 4, 3);
+				float scale = float(1u << scale_id);
+				if (scale_id >= 6u) scale /= 256.0;  // IDs 6 and 7 give 64/256 = 0.25 and 128/256 = 0.5
+
+				uint input_id = bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_SELECT, c * 4, 3);
+				if (input_id == 0u)
+					d[c] = dot(normal, half_vector);
+				else if (input_id == 1u)
+					d[c] = dot(view, half_vector);
+				else if (input_id == 2u)
+					d[c] = dot(normal, view);
+				else if (input_id == 3u)
+					d[c] = dot(light_vector, normal);
+				else if (input_id == 4u) {
+					uint GPUREG_LIGHTi_SPOTDIR_LOW = readPicaReg(0x0146 + 0x10 * light_id);
+					uint GPUREG_LIGHTi_SPOTDIR_HIGH = readPicaReg(0x0147 + 0x10 * light_id);
+					vec3 spot_light_vector = normalize(vec3(  // NOTE(review): decoded as a float with E=1, M=11 - confirm this is not a fixed-point field
+						decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_LOW, 0, 16), 1, 11),
+						decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_LOW, 16, 16), 1, 11),
+						decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_HIGH, 0, 16), 1, 11)
+					));
+					d[c] = dot(-light_vector, spot_light_vector);  // -L dot P (aka Spotlight aka SP);
+				} else if (input_id == 5u) {
+					d[c] = 1.0;  // TODO: cos <greek symbol> (aka CP);
+					error_unimpl = true;
+				} else {
+					d[c] = 1.0;
+				}
+
+				d[c] = lutLookup(c, light_id, d[c] * 0.5 + 0.5) * scale;  // Input remapped from [-1, 1] to [0, 1] for the lookup
+				if (bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_ABS, 2 * c, 1) != 0u) d[c] = abs(d[c]);  // NOTE(review): the other per-LUT fields use a 4-bit stride (c * 4) - confirm the 2 * c here
+			} else {
+				d[c] = 1.0;
+			}
+		}
+
+		uint lookup_config = bitfieldExtract(GPUREG_LIGHTi_CONFIG, 4, 4);  // NOTE(review): confirm this field lives in the per-light config and not in GPUREG_LIGHTING_CONFIG0
+		if (lookup_config == 0) {
+			d[D1_LUT] = 0.0;
+			d[FR_LUT] = 0.0;
+			d[RG_LUT] = d[RB_LUT] = d[RR_LUT];
+		} else if (lookup_config == 1) {
+			d[D0_LUT] = 0.0;
+			d[D1_LUT] = 0.0;
+			d[RG_LUT] = d[RB_LUT] = d[RR_LUT];
+		} else if (lookup_config == 2) {
+			d[FR_LUT] = 0.0;
+			d[SP_LUT] = 0.0;
+			d[RG_LUT] = d[RB_LUT] = d[RR_LUT];
+		} else if (lookup_config == 3) {
+			d[SP_LUT] = 0.0;
+			d[RG_LUT] = d[RB_LUT] = d[RR_LUT] = 1.0;
+		} else if (lookup_config == 4) {
+			d[FR_LUT] = 0.0;
+		} else if (lookup_config == 5) {
+			d[D1_LUT] = 0.0;
+		} else if (lookup_config == 6) {
+			d[RG_LUT] = d[RB_LUT] = d[RR_LUT];
+		}
+
+		float distance_factor = 1.0;  // a
+		float indirect_factor = 1.0;  // fi
+		float shadow_factor = 1.0;    // o
+
+		float NdotL = dot(normal, light_vector);  // Li dot N
+
+		// Two sided diffuse
+		if (bitfieldExtract(GPUREG_LIGHTi_CONFIG, 1, 1) == 0)
+			NdotL = max(0.0, NdotL);
+		else
+			NdotL = abs(NdotL);
+
+		float light_factor = distance_factor * d[SP_LUT] * indirect_factor * shadow_factor;  // Only the spotlight term varies; the others are fixed at 1.0 above
+
+		primary_color.rgb += light_factor * (regToColor(GPUREG_LIGHTi_AMBIENT) + regToColor(GPUREG_LIGHTi_DIFFUSE) * NdotL);
+		secondary_color.rgb += light_factor * (regToColor(GPUREG_LIGHTi_SPECULAR0) * d[D0_LUT] +
+											   regToColor(GPUREG_LIGHTi_SPECULAR1) * d[D1_LUT] * vec3(d[RR_LUT], d[RG_LUT], d[RB_LUT]));
+	}
+	uint fresnel_output1 = bitfieldExtract(GPUREG_LIGHTING_CONFIG0, 2, 1);
+	uint fresnel_output2 = bitfieldExtract(GPUREG_LIGHTING_CONFIG0, 3, 1);
+
+	if (fresnel_output1 == 1u) primary_color.a = d[FR_LUT];  // d[] still holds the values from the last light processed
+	if (fresnel_output2 == 1u) secondary_color.a = d[FR_LUT];
+
+	if (error_unimpl) {
+		secondary_color = primary_color = vec4(1.0, 0., 1.0, 1.0);  // Opaque magenta marks a path flagged as unimplemented above
+	}
+}
+
+void main() {
+	// TODO: what do invalid sources and disabled textures read as?
+	// And what does the "previous combiner" source read initially?
+	tevSources[0] = v_colour;  // Primary/vertex color
+	calcLighting(tevSources[1], tevSources[2]);
+
+	uint textureConfig = readPicaReg(0x80);
+	vec2 tex2UV = (textureConfig & (1u << 13)) != 0u ? v_texcoord1 : v_texcoord2;  // Bit 13 set: texture unit 2 reuses texcoord 1
+
+	if ((textureConfig & 1u) != 0u) tevSources[3] = texture(u_tex0, v_texcoord0.xy);  // Bits 0-2 enable units 0-2; a disabled unit leaves its slot unwritten
+	if ((textureConfig & 2u) != 0u) tevSources[4] = texture(u_tex1, v_texcoord1);
+	if ((textureConfig & 4u) != 0u) tevSources[5] = texture(u_tex2, tex2UV);
+	tevSources[13] = vec4(0.0);  // Previous buffer
+	tevSources[15] = vec4(0.0);  // Previous combiner
+
+	tevNextPreviousBuffer = v_textureEnvBufferColor;
+	uint textureEnvUpdateBuffer = readPicaReg(0xE0);
+	// Run the six TEV stages in order; each stage reads the previous stage's result from slot 15
+	for (int i = 0; i < 6; i++) {
+		tevSources[14] = v_textureEnvColor[i];  // Constant color
+		tevSources[15] = tevCalculateCombiner(i);
+		tevSources[13] = tevNextPreviousBuffer;  // Buffer value the next stage will see
+
+		if (i < 4) {  // Only stages 0-3 have update bits: 8-11 for rgb, 12-15 for alpha
+			if ((textureEnvUpdateBuffer & (0x100u << i)) != 0u) {
+				tevNextPreviousBuffer.rgb = tevSources[15].rgb;
+			}
+
+			if ((textureEnvUpdateBuffer & (0x1000u << i)) != 0u) {
+				tevNextPreviousBuffer.a = tevSources[15].a;
+			}
+		}
+	}
+
+	fragColour = tevSources[15];  // Output of the last stage
+
+	if (tevUnimplementedSourceFlag) {
+		// fragColour = vec4(1.0, 0.0, 1.0, 1.0);
+	}
+	// fragColour.rg = texture(u_tex_lighting_lut,vec2(gl_FragCoord.x/200.,float(int(gl_FragCoord.y/2)%24))).rr;
+
+	// Get original depth value by converting from [near, far] = [0, 1] to [-1, 1]
+	// We do this by converting to [0, 2] first and subtracting 1 to go to [-1, 1]
+	float z_over_w = gl_FragCoord.z * 2.0f - 1.0f;
+	float depth = z_over_w * u_depthScale + u_depthOffset;
+
+	if (!u_depthmapEnable)  // Divide z by w if depthmap enable == 0 (ie using W-buffering)
+		depth /= gl_FragCoord.w;
+
+	// Write final fragment depth
+	gl_FragDepth = depth;
+
+	// Perform alpha test
+	uint alphaControl = readPicaReg(0x104);
+	if ((alphaControl & 1u) != 0u) {  // Check if alpha test is on
+		uint func = (alphaControl >> 4u) & 7u;  // Bits 4-6: comparison function
+		float reference = float((alphaControl >> 8u) & 0xffu) / 255.0;  // Bits 8-15: reference value scaled to [0, 1]
+		float alpha = fragColour.a;
+
+		switch (func) {
+			case 0: discard;  // Never pass alpha test
+			case 1: break;    // Always pass alpha test
+			case 2:           // Pass if equal
+				if (alpha != reference) discard;
+				break;
+			case 3:  // Pass if not equal
+				if (alpha == reference) discard;
+				break;
+			case 4:  // Pass if less than
+				if (alpha >= reference) discard;
+				break;
+			case 5:  // Pass if less than or equal
+				if (alpha > reference) discard;
+				break;
+			case 6:  // Pass if greater than
+				if (alpha <= reference) discard;
+				break;
+			case 7:  // Pass if greater than or equal
+				if (alpha < reference) discard;
+				break;
+		}
+	}
+}
\ No newline at end of file
diff --git a/src/host_shaders/opengl_vertex_shader.vert b/src/host_shaders/opengl_vertex_shader.vert
new file mode 100644
index 00000000..cbf992c4
--- /dev/null
+++ b/src/host_shaders/opengl_vertex_shader.vert
@@ -0,0 +1,97 @@
+#version 410 core
+
+layout(location = 0) in vec4 a_coords;  // Written to gl_Position unchanged
+layout(location = 1) in vec4 a_quaternion;  // Rotates the surface-local basis vectors in main()
+layout(location = 2) in vec4 a_vertexColour;
+layout(location = 3) in vec2 a_texcoord0;
+layout(location = 4) in vec2 a_texcoord1;
+layout(location = 5) in float a_texcoord0_w;  // Forwarded as v_texcoord0.z
+layout(location = 6) in vec3 a_view;
+layout(location = 7) in vec2 a_texcoord2;
+
+out vec3 v_normal;
+out vec3 v_tangent;
+out vec3 v_bitangent;
+out vec4 v_colour;
+out vec3 v_texcoord0;
+out vec2 v_texcoord1;
+out vec3 v_view;
+out vec2 v_texcoord2;
+flat out vec4 v_textureEnvColor[6];     // u_textureEnvColor unpacked to floats, one entry per TEV stage
+flat out vec4 v_textureEnvBufferColor;  // Register 0xFD unpacked to floats
+
+out float gl_ClipDistance[2];  // Two clip distances, both written in main()
+
+// TEV uniforms
+uniform uint u_textureEnvColor[6];  // Packed 8-bit-per-channel colours, unpacked by abgr8888ToVec4()
+uniform uint u_picaRegs[0x200 - 0x48];  // Read through readPicaReg()
+
+// Helper so that the implementation of u_picaRegs can be changed later
+uint readPicaReg(uint reg_addr) { return u_picaRegs[reg_addr - 0x48]; }  // u_picaRegs[0] holds register 0x48
+
+vec4 abgr8888ToVec4(uint abgr) {
+	// Split the word into its four bytes: the lowest byte lands in .x and the highest in .w
+	uvec4 bytes = (uvec4(abgr) >> uvec4(0u, 8u, 16u, 24u)) & 0xffu;
+
+	return (1.0 / 255.0) * vec4(bytes);  // Rescale each byte from 0..255 to 0.0..1.0
+}
+
+vec3 rotateVec3ByQuaternion(vec3 v, vec4 q) {
+	vec3 alongAxis = 2.0 * dot(q.xyz, v) * q.xyz;  // q.xyz is the vector part of the quaternion, q.w the scalar part
+	vec3 scaledInput = (q.w * q.w - dot(q.xyz, q.xyz)) * v;
+	return alongAxis + scaledInput + 2.0 * q.w * cross(q.xyz, v);  // Terms are summed in the same left-to-right order as before
+}
+
+// Convert a float with 1 sign bit, E exponent bits and M mantissa bits (held in the low bits of hex) to an f32
+float decodeFP(uint hex, uint E, uint M) {
+	uint width = M + E + 1u;  // Total bit count of the source format
+	uint bias = 128u - (1u << (E - 1u));  // Amount added to a source exponent to re-bias it for f32, i.e. 127 - (2^(E-1) - 1)
+	uint exponent = (hex >> M) & ((1u << E) - 1u);
+	uint mantissa = hex & ((1u << M) - 1u);
+	uint sign = (hex >> (E + M)) << 31u;  // Sign bit moved up to bit 31
+
+	if ((hex & ((1u << (width - 1u)) - 1u)) != 0) {  // Some exponent or mantissa bit is set, so the value is not +/-0
+		if (exponent == (1u << E) - 1u)
+			exponent = 255u;  // An all-ones exponent maps to f32's all-ones exponent
+		else
+			exponent += bias;  // Also taken when the exponent field is 0 but the mantissa is not
+		hex = sign | (mantissa << (23u - M)) | (exponent << 23u);  // Mantissa is left-aligned within the 23-bit f32 field
+	} else {
+		hex = sign;  // Signed zero
+	}
+
+	return uintBitsToFloat(hex);
+}
+
+void main() {
+	gl_Position = a_coords;  // Used as-is; no matrix is applied in this shader
+	v_colour = a_vertexColour;
+
+	// Flip y axis of UVs because OpenGL uses an inverted y for texture sampling compared to the PICA
+	v_texcoord0 = vec3(a_texcoord0.x, 1.0 - a_texcoord0.y, a_texcoord0_w);
+	v_texcoord1 = vec2(a_texcoord1.x, 1.0 - a_texcoord1.y);
+	v_texcoord2 = vec2(a_texcoord2.x, 1.0 - a_texcoord2.y);
+	v_view = a_view;
+	// Rotate the surface-local basis vectors (normal = +z, tangent = +x, bitangent = +y) by the vertex quaternion
+	v_normal = normalize(rotateVec3ByQuaternion(vec3(0.0, 0.0, 1.0), a_quaternion));
+	v_tangent = normalize(rotateVec3ByQuaternion(vec3(1.0, 0.0, 0.0), a_quaternion));
+	v_bitangent = normalize(rotateVec3ByQuaternion(vec3(0.0, 1.0, 0.0), a_quaternion));
+
+	for (int i = 0; i < 6; i++) {
+		v_textureEnvColor[i] = abgr8888ToVec4(u_textureEnvColor[i]);
+	}
+
+	v_textureEnvBufferColor = abgr8888ToVec4(readPicaReg(0xFD));
+
+	// Parse clipping plane registers
+	// The plane registers describe a clipping plane in the form of Ax + By + Cz + D = 0
+	// With n = (A, B, C) being the normal vector and D being the origin point distance
+	// Therefore, for the second clipping plane, we can just pass the dot product of the clip vector and the input coordinates to gl_ClipDistance[1]
+	vec4 clipData = vec4(
+		decodeFP(readPicaReg(0x48) & 0xffffffu, 7, 16), decodeFP(readPicaReg(0x49) & 0xffffffu, 7, 16),
+		decodeFP(readPicaReg(0x4A) & 0xffffffu, 7, 16), decodeFP(readPicaReg(0x4B) & 0xffffffu, 7, 16)
+	);  // Each coefficient is the low 24 bits of its register, decoded with 7 exponent and 16 mantissa bits
+
+	// There's also another, always-on clipping plane based on vertex z
+	gl_ClipDistance[0] = -a_coords.z;
+	gl_ClipDistance[1] = dot(clipData, a_coords);
+}
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index 1559565a..66a04b9e 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,9 +1,9 @@
 #include "emulator.hpp"
 
-int main (int argc, char *argv[]) {
-    Emulator emu;
+int main(int argc, char *argv[]) {
+	Emulator emu;
 
-    emu.initGraphicsContext();
+	emu.initGraphicsContext();
 
 	if (argc > 1) {
 		auto romPath = std::filesystem::current_path() / argv[1];
diff --git a/src/renderer.cpp b/src/renderer.cpp
new file mode 100644
index 00000000..3ba29aea
--- /dev/null
+++ b/src/renderer.cpp
@@ -0,0 +1,36 @@
+#include "renderer.hpp"
+
+#include <algorithm>
+#include <cctype>
+#include <unordered_map>
+
+Renderer::Renderer(GPU& gpu, const std::array<u32, regNum>& internalRegs) : gpu(gpu), regs(internalRegs) {}  // Initializes the gpu and regs members; the body itself is empty
+Renderer::~Renderer() {}  // Empty out-of-line destructor
+
+std::optional<RendererType> Renderer::typeFromString(std::string inString) {
+	// Lower-case the by-value copy so matching is case-insensitive; std::tolower returns int, so narrow it back to char explicitly
+	std::transform(inString.begin(), inString.end(), inString.begin(), [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+
+	// Huge table of possible names and misspellings
+	// Please stop misspelling Vulkan as Vulcan
+	static const std::unordered_map<std::string, RendererType> map = {
+		{"null", RendererType::Null}, {"nil", RendererType::Null},      {"none", RendererType::Null},
+		{"gl", RendererType::OpenGL}, {"ogl", RendererType::OpenGL},    {"opengl", RendererType::OpenGL},
+		{"vk", RendererType::Vulkan}, {"vulkan", RendererType::Vulkan}, {"vulcan", RendererType::Vulkan},
+	};
+	// Yields the renderer type for a known name, or std::nullopt for anything not in the table
+	if (auto search = map.find(inString); search != map.end()) {
+		return search->second;
+	}
+
+	return std::nullopt;
+}
+
+const char* Renderer::typeToString(RendererType rendererType) {
+	// Maps each known renderer type to its lower-case name; anything else yields "Invalid"
+	if (rendererType == RendererType::Null) return "null";
+	if (rendererType == RendererType::OpenGL) return "opengl";
+	if (rendererType == RendererType::Vulkan) return "vulkan";
+
+	return "Invalid";
+}
\ No newline at end of file
diff --git a/third_party/cmrc b/third_party/cmrc
new file mode 160000
index 00000000..9a339644
--- /dev/null
+++ b/third_party/cmrc
@@ -0,0 +1 @@
+Subproject commit 9a3396444e0478bd6f261075e74d1ecf70964029