diff --git a/.github/workflows/Android_Build.yml b/.github/workflows/Android_Build.yml
index 11811f8b..b7e64f5f 100644
--- a/.github/workflows/Android_Build.yml
+++ b/.github/workflows/Android_Build.yml
@@ -8,7 +8,7 @@ on:
 
 jobs:
   x64:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
 
     strategy:
       matrix:
@@ -73,7 +73,7 @@ jobs:
           ./src/pandroid/app/build/outputs/apk/${{ env.BUILD_TYPE }}/app-${{ env.BUILD_TYPE }}.apk
 
   arm64:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
 
     strategy:
       matrix:
diff --git a/.github/workflows/HTTP_Build.yml b/.github/workflows/HTTP_Build.yml
index 0bdaa4f7..c4f7cfee 100644
--- a/.github/workflows/HTTP_Build.yml
+++ b/.github/workflows/HTTP_Build.yml
@@ -16,7 +16,7 @@ jobs:
     # well on Windows or Mac.  You can convert this to a matrix build if you need
     # cross-platform coverage.
     # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
 
     steps:
     - uses: actions/checkout@v4
diff --git a/.github/workflows/Hydra_Build.yml b/.github/workflows/Hydra_Build.yml
index e2c2004b..785e0e4a 100644
--- a/.github/workflows/Hydra_Build.yml
+++ b/.github/workflows/Hydra_Build.yml
@@ -98,7 +98,7 @@ jobs:
           ${{github.workspace}}/docs/libretro/panda3ds_libretro.info
 
   Linux:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
 
     steps:
     - uses: actions/checkout@v4
@@ -107,7 +107,7 @@ jobs:
 
     - name: Install misc packages
       run: |
-       sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev libfuse2 libwayland-dev
+       sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev libfuse2 libwayland-dev
       
     - name: Install newer Clang
       run: |
@@ -151,7 +151,7 @@ jobs:
           ${{github.workspace}}/docs/libretro/panda3ds_libretro.info
 
   Android-x64:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
 
     steps:
     - uses: actions/checkout@v4
@@ -160,7 +160,7 @@ jobs:
 
     - name: Install misc packages
       run: |
-       sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev libfuse2 libwayland-dev
+       sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev libfuse2 libwayland-dev
        
     - name: Setup Vulkan SDK
       uses: humbletim/setup-vulkan-sdk@v1.2.0
diff --git a/.github/workflows/Linux_AppImage_Build.yml b/.github/workflows/Linux_AppImage_Build.yml
index f32a7d38..51c4a933 100644
--- a/.github/workflows/Linux_AppImage_Build.yml
+++ b/.github/workflows/Linux_AppImage_Build.yml
@@ -16,7 +16,7 @@ jobs:
     # well on Windows or Mac.  You can convert this to a matrix build if you need
     # cross-platform coverage.
     # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
 
     steps:
     - uses: actions/checkout@v4
@@ -24,7 +24,7 @@ jobs:
       run: git submodule update --init --recursive
 
     - name: Install misc packages
-      run: sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev libfuse2
+      run: sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev libfuse2
 
     - name: Install newer Clang
       run: |
diff --git a/.github/workflows/Linux_Build.yml b/.github/workflows/Linux_Build.yml
index 9cb05303..dfcb6954 100644
--- a/.github/workflows/Linux_Build.yml
+++ b/.github/workflows/Linux_Build.yml
@@ -16,7 +16,7 @@ jobs:
     # well on Windows or Mac.  You can convert this to a matrix build if you need
     # cross-platform coverage.
     # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
 
     steps:
     - uses: actions/checkout@v4
@@ -24,7 +24,7 @@ jobs:
       run: git submodule update --init --recursive
 
     - name: Install misc packages
-      run: sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev
+      run: sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev
 
     - name: Install newer Clang
       run: |
diff --git a/.github/workflows/Qt_Build.yml b/.github/workflows/Qt_Build.yml
index d3a09866..3b846a27 100644
--- a/.github/workflows/Qt_Build.yml
+++ b/.github/workflows/Qt_Build.yml
@@ -96,7 +96,7 @@ jobs:
         path: 'Alber.zip'
 
   Linux:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
 
     steps:
     - uses: actions/checkout@v4
@@ -105,8 +105,7 @@ jobs:
 
     - name: Install misc packages
       run: |
-       sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev libfuse2 libwayland-dev libgl1-mesa-dev
-       sudo add-apt-repository -y ppa:savoury1/qt-6-2
+       sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev libfuse2 libwayland-dev libgl1-mesa-dev
        sudo apt update
        sudo apt install qt6-base-dev qt6-base-private-dev
 
diff --git a/.gitmodules b/.gitmodules
index 97bc129c..f1a70f41 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -76,6 +76,9 @@
 [submodule "third_party/metal-cpp"]
 	path = third_party/metal-cpp
 	url = https://github.com/Panda3DS-emu/metal-cpp
+[submodule "third_party/fmt"]
+	path = third_party/fmt
+	url = https://github.com/fmtlib/fmt
 [submodule "third_party/fdk-aac"]
 	path = third_party/fdk-aac
 	url = https://github.com/Panda3DS-emu/fdk-aac/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d25973c8..f4808366 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,7 +26,7 @@ if(APPLE)
 endif()
 
 if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format-nonliteral -Wno-format-security")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format-nonliteral -Wno-format-security -Wno-invalid-offsetof")
 endif()
 
 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
@@ -55,6 +55,7 @@ option(ENABLE_GIT_VERSIONING "Enables querying git for the emulator version" ON)
 option(BUILD_HYDRA_CORE "Build a Hydra core" OFF)
 option(BUILD_LIBRETRO_CORE "Build a Libretro core" OFF)
 option(ENABLE_RENDERDOC_API "Build with support for Renderdoc's capture API for graphics debugging" ON)
+option(DISABLE_SSE4 "Build with SSE4 instructions disabled, may reduce performance" OFF)
 
 set(OPENGL_PROFILE ${DEFAULT_OPENGL_PROFILE} CACHE STRING "OpenGL profile to use if OpenGL is enabled. Valid values are 'OpenGL' and 'OpenGLES'.")
 set_property(CACHE OPENGL_PROFILE PROPERTY STRINGS OpenGL OpenGLES)
@@ -147,11 +148,13 @@ if (NOT ANDROID)
     target_link_libraries(AlberCore PUBLIC SDL2-static)
 endif()
 
+add_subdirectory(third_party/fmt)
 add_subdirectory(third_party/toml11)
 include_directories(${SDL2_INCLUDE_DIR})
 include_directories(third_party/toml11)
 include_directories(third_party/glm)
 include_directories(third_party/renderdoc)
+include_directories(third_party/duckstation)
 
 add_subdirectory(third_party/cmrc)
 
@@ -210,6 +213,13 @@ else()
     set(HOST_ARM64 FALSE)
 endif()
 
+# Enable SSE4.1 if it's not explicitly disabled
+# Annoyingly, we can't easily do this if we're using MSVC cause there's no SSE4.1 flag, only SSE4.1
+if(NOT MSVC OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND NOT DISABLE_SSE4 AND HOST_X64)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1")
+endif()
+
 if(ENABLE_RENDERDOC_API)
     find_package(RenderDoc 1.6.0 MODULE REQUIRED)
     add_compile_definitions(PANDA3DS_ENABLE_RENDERDOC)
@@ -258,13 +268,13 @@ set(SERVICE_SOURCE_FILES src/core/services/service_manager.cpp src/core/services
                          src/core/services/act.cpp src/core/services/nfc.cpp src/core/services/dlp_srvr.cpp
                          src/core/services/ir_user.cpp src/core/services/http.cpp src/core/services/soc.cpp
                          src/core/services/ssl.cpp src/core/services/news_u.cpp src/core/services/amiibo_device.cpp
-                         src/core/services/csnd.cpp src/core/services/nwm_uds.cpp
+                         src/core/services/csnd.cpp src/core/services/nwm_uds.cpp src/core/services/fonts.cpp
 )
 set(PICA_SOURCE_FILES src/core/PICA/gpu.cpp src/core/PICA/regs.cpp src/core/PICA/shader_unit.cpp
                       src/core/PICA/shader_interpreter.cpp src/core/PICA/dynapica/shader_rec.cpp
                       src/core/PICA/dynapica/shader_rec_emitter_x64.cpp src/core/PICA/pica_hash.cpp
                       src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp src/core/PICA/shader_gen_glsl.cpp
-                      src/core/PICA/shader_decompiler.cpp
+                      src/core/PICA/shader_decompiler.cpp src/core/PICA/draw_acceleration.cpp
 )
 
 set(LOADER_SOURCE_FILES src/core/loader/elf.cpp src/core/loader/ncsd.cpp src/core/loader/ncch.cpp src/core/loader/3dsx.cpp src/core/loader/lz77.cpp)
@@ -316,14 +326,15 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
                  include/audio/miniaudio_device.hpp include/ring_buffer.hpp include/bitfield.hpp include/audio/dsp_shared_mem.hpp
                  include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp
                  include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
-                 include/sdl_sensors.hpp include/renderdoc.hpp include/audio/aac_decoder.hpp
+                 include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp include/renderdoc.hpp
+                 include/align.hpp include/audio/aac_decoder.hpp include/PICA/pica_simd.hpp include/services/fonts.hpp
 )
 
 cmrc_add_resource_library(
     resources_console_fonts
     NAMESPACE ConsoleFonts
     WHENCE "src/core/services/fonts/"
-    "src/core/services/fonts/CitraSharedFontUSRelocated.bin"
+    "src/core/services/fonts/SharedFontReplacement.bin"
 )
 
 set(THIRD_PARTY_SOURCE_FILES third_party/imgui/imgui.cpp
@@ -349,7 +360,6 @@ if(ENABLE_LUAJIT AND NOT ANDROID)
 endif()
 
 if(ENABLE_QT_GUI)
-    include_directories(third_party/duckstation)
     set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/window_info.cpp third_party/duckstation/gl/context.cpp)
 
     if(APPLE)
@@ -382,7 +392,7 @@ if(ENABLE_OPENGL)
     set(RENDERER_GL_INCLUDE_FILES third_party/opengl/opengl.hpp
         include/renderer_gl/renderer_gl.hpp include/renderer_gl/textures.hpp
         include/renderer_gl/surfaces.hpp include/renderer_gl/surface_cache.hpp
-        include/renderer_gl/gl_state.hpp
+        include/renderer_gl/gl_state.hpp include/renderer_gl/gl_driver.hpp
     )
 
     set(RENDERER_GL_SOURCE_FILES src/core/renderer_gl/renderer_gl.cpp
@@ -392,6 +402,8 @@ if(ENABLE_OPENGL)
         src/host_shaders/opengl_fragment_shader.frag
     )
 
+    set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/gl/stream_buffer.cpp)
+
     set(HEADER_FILES ${HEADER_FILES} ${RENDERER_GL_INCLUDE_FILES})
     source_group("Source Files\\Core\\OpenGL Renderer" FILES ${RENDERER_GL_SOURCE_FILES})
 
@@ -555,7 +567,7 @@ set(ALL_SOURCES ${SOURCE_FILES} ${FS_SOURCE_FILES} ${CRYPTO_SOURCE_FILES} ${KERN
 target_sources(AlberCore PRIVATE ${ALL_SOURCES})
 
 target_link_libraries(AlberCore PRIVATE dynarmic cryptopp glad resources_console_fonts teakra fdk-aac)
-target_link_libraries(AlberCore PUBLIC glad capstone)
+target_link_libraries(AlberCore PUBLIC glad capstone fmt::fmt)
 
 if(ENABLE_DISCORD_RPC AND NOT ANDROID)
     target_compile_definitions(AlberCore PUBLIC "PANDA3DS_ENABLE_DISCORD_RPC=1")
diff --git a/docs/img/KirbyAndroid.png b/docs/img/KirbyAndroid.png
new file mode 100644
index 00000000..05e8b466
Binary files /dev/null and b/docs/img/KirbyAndroid.png differ
diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp
new file mode 100644
index 00000000..6a66cdc1
--- /dev/null
+++ b/include/PICA/draw_acceleration.hpp
@@ -0,0 +1,45 @@
+#pragma once
+
+#include <array>
+
+#include "helpers.hpp"
+
+namespace PICA {
+	struct DrawAcceleration {
+		static constexpr u32 maxAttribCount = 16;
+		static constexpr u32 maxLoaderCount = 12;
+
+		struct AttributeInfo {
+			u32 offset;
+			u32 stride;
+
+			u8 type;
+			u8 componentCount;
+
+			std::array<float, 4> fixedValue;  // For fixed attributes
+		};
+
+		struct Loader {
+			// Data to upload for this loader
+			u8* data;
+			usize size;
+		};
+
+		u8* indexBuffer;
+
+		// Minimum and maximum index in the index buffer for a draw call
+		u16 minimumIndex, maximumIndex;
+		u32 totalAttribCount;
+		u32 totalLoaderCount;
+		u32 enabledAttributeMask;
+		u32 fixedAttributes;
+		u32 vertexDataSize;
+
+		std::array<AttributeInfo, maxAttribCount> attributeInfo;
+		std::array<Loader, maxLoaderCount> loaders;
+
+		bool canBeAccelerated;
+		bool indexed;
+		bool useShortIndices;
+	};
+}  // namespace PICA
\ No newline at end of file
diff --git a/include/PICA/dynapica/pica_recs.hpp b/include/PICA/dynapica/pica_recs.hpp
index acfd226e..eb0cf404 100644
--- a/include/PICA/dynapica/pica_recs.hpp
+++ b/include/PICA/dynapica/pica_recs.hpp
@@ -2,7 +2,7 @@
 #include "helpers.hpp"
 #include "vertex_loader_rec.hpp"
 
-// Common file for our PICA JITs (From vertex config -> CPU assembly and from PICA shader -> CPU assembly)
+// Common file for our PICA JITs (From PICA shader -> CPU assembly)
 
 namespace Dynapica {
 #ifdef PANDA3DS_DYNAPICA_SUPPORTED
diff --git a/include/PICA/gpu.hpp b/include/PICA/gpu.hpp
index ac2a49e6..c168a9bf 100644
--- a/include/PICA/gpu.hpp
+++ b/include/PICA/gpu.hpp
@@ -1,6 +1,7 @@
 #pragma once
 #include <array>
 
+#include "PICA/draw_acceleration.hpp"
 #include "PICA/dynapica/shader_rec.hpp"
 #include "PICA/float_types.hpp"
 #include "PICA/pica_vertex.hpp"
@@ -13,6 +14,12 @@
 #include "memory.hpp"
 #include "renderer.hpp"
 
+enum class ShaderExecMode {
+	Interpreter,  // Interpret shaders on the CPU
+	JIT,          // Recompile shaders to CPU machine code
+	Hardware,     // Recompiler shaders to host shaders and run them on the GPU
+};
+
 class GPU {
 	static constexpr u32 regNum = 0x300;
 	static constexpr u32 extRegNum = 0x1000;
@@ -45,7 +52,7 @@ class GPU {
 	uint immediateModeVertIndex;
 	uint immediateModeAttrIndex;  // Index of the immediate mode attribute we're uploading
 
-	template <bool indexed, bool useShaderJIT>
+	template <bool indexed, ShaderExecMode mode>
 	void drawArrays();
 
 	// Silly method of avoiding linking problems. TODO: Change to something less silly
@@ -81,6 +88,7 @@ class GPU {
 	std::unique_ptr<Renderer> renderer;
 	PICA::Vertex getImmediateModeVertex();
 
+	void getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed);
   public:
 	// 256 entries per LUT with each LUT as its own row forming a 2D image 256 * LUT_COUNT
 	// Encoded in PICA native format
diff --git a/include/PICA/pica_frag_config.hpp b/include/PICA/pica_frag_config.hpp
index 5d5f8420..7b63a7b5 100644
--- a/include/PICA/pica_frag_config.hpp
+++ b/include/PICA/pica_frag_config.hpp
@@ -17,6 +17,7 @@ namespace PICA {
 			// enable == off means a CompareFunction of Always
 			BitField<0, 3, CompareFunction> alphaTestFunction;
 			BitField<3, 1, u32> depthMapEnable;
+			BitField<4, 4, LogicOpMode> logicOpMode;
 		};
 	};
 
@@ -214,6 +215,10 @@ namespace PICA {
 				(alphaTestConfig & 1) ? static_cast<PICA::CompareFunction>(alphaTestFunction) : PICA::CompareFunction::Always;
 			outConfig.depthMapEnable = regs[InternalRegs::DepthmapEnable] & 1;
 
+			// Shows if blending is enabled. If it is not enabled, then logic ops are enabled instead
+			const bool blendingEnabled = (regs[InternalRegs::ColourOperation] & (1 << 8)) != 0;
+			outConfig.logicOpMode = blendingEnabled ? LogicOpMode::Copy : LogicOpMode(Helpers::getBits<0, 4>(regs[InternalRegs::LogicOp]));
+
 			texConfig.texUnitConfig = regs[InternalRegs::TexUnitCfg];
 			texConfig.texEnvUpdateBuffer = regs[InternalRegs::TexEnvUpdateBuffer];
 
diff --git a/include/PICA/pica_simd.hpp b/include/PICA/pica_simd.hpp
new file mode 100644
index 00000000..ae7d04eb
--- /dev/null
+++ b/include/PICA/pica_simd.hpp
@@ -0,0 +1,274 @@
+#pragma once
+#include <algorithm>
+#include <limits>
+#include <utility>
+
+#include "helpers.hpp"
+
+#if defined(_M_AMD64) || defined(__x86_64__)
+#define PICA_SIMD_X64
+#include <immintrin.h>
+#elif defined(_M_ARM64) || defined(__aarch64__)
+#define PICA_SIMD_ARM64
+#include <arm_neon.h>
+#endif
+
+// Optimized functions for analyzing PICA index buffers (Finding minimum and maximum index values inside them)
+namespace PICA::IndexBuffer {
+	// Non-SIMD, portable algorithm
+	template <bool useShortIndices>
+	std::pair<u16, u16> analyzePortable(u8* indexBuffer, u32 vertexCount) {
+		u16 minimumIndex = std::numeric_limits<u16>::max();
+		u16 maximumIndex = 0;
+
+		// Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them
+		if constexpr (useShortIndices) {
+			u16* indexBuffer16 = reinterpret_cast<u16*>(indexBuffer);
+
+			for (u32 i = 0; i < vertexCount; i++) {
+				u16 index = indexBuffer16[i];
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+		} else {
+			for (u32 i = 0; i < vertexCount; i++) {
+				u16 index = u16(indexBuffer[i]);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+		}
+
+		return {minimumIndex, maximumIndex};
+	}
+
+#ifdef PICA_SIMD_ARM64
+	template <bool useShortIndices>
+	std::pair<u16, u16> analyzeNEON(u8* indexBuffer, u32 vertexCount) {
+		// We process 16 bytes per iteration, which is 8 vertices if we're using u16 indices or 16 vertices if we're using u8 indices
+		constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16;
+
+		if (vertexCount < vertsPerLoop) {
+			return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
+		}
+
+		u16 minimumIndex, maximumIndex;
+
+		if constexpr (useShortIndices) {
+			// 16-bit indices
+			uint16x8_t minima = vdupq_n_u16(0xffff);
+			uint16x8_t maxima = vdupq_n_u16(0);
+
+			while (vertexCount >= vertsPerLoop) {
+				const uint16x8_t data = vld1q_u16(reinterpret_cast<u16*>(indexBuffer));
+				minima = vminq_u16(data, minima);
+				maxima = vmaxq_u16(data, maxima);
+
+				indexBuffer += 16;
+				vertexCount -= vertsPerLoop;
+			}
+
+			// Do horizontal min/max operations to get the actual minimum and maximum from all the vertices we processed with SIMD
+			// We want to gather the actual minimum and maximum in the line bottom lane of the minima/maxima vectors
+			// uint16x4_t foldedMinima1 = vmin_u16(vget_high_u16(minima), vget_low_u16(minima));
+			// uint16x4_t foldedMaxima1 = vmax_u16(vget_high_u16(maxima), vget_low_u16(maxima));
+
+			uint16x8_t foldedMinima1 = vpminq_u16(minima, minima);
+			uint16x8_t foldedMinima2 = vpminq_u16(foldedMinima1, foldedMinima1);
+			uint16x8_t foldedMinima3 = vpminq_u16(foldedMinima2, foldedMinima2);
+
+			uint16x8_t foldedMaxima1 = vpmaxq_u16(maxima, maxima);
+			uint16x8_t foldedMaxima2 = vpmaxq_u16(foldedMaxima1, foldedMaxima1);
+			uint16x8_t foldedMaxima3 = vpmaxq_u16(foldedMaxima2, foldedMaxima2);
+
+			minimumIndex = vgetq_lane_u16(foldedMinima3, 0);
+			maximumIndex = vgetq_lane_u16(foldedMaxima3, 0);
+		} else {
+			// 8-bit indices
+			uint8x16_t minima = vdupq_n_u8(0xff);
+			uint8x16_t maxima = vdupq_n_u8(0);
+
+			while (vertexCount >= vertsPerLoop) {
+				uint8x16_t data = vld1q_u8(indexBuffer);
+				minima = vminq_u8(data, minima);
+				maxima = vmaxq_u8(data, maxima);
+
+				indexBuffer += 16;
+				vertexCount -= vertsPerLoop;
+			}
+
+			// Do a similar horizontal min/max as in the u16 case, except now we're working uint8x16 instead of uint16x4 so we need 4 folds
+			uint8x16_t foldedMinima1 = vpminq_u8(minima, minima);
+			uint8x16_t foldedMinima2 = vpminq_u8(foldedMinima1, foldedMinima1);
+			uint8x16_t foldedMinima3 = vpminq_u8(foldedMinima2, foldedMinima2);
+			uint8x16_t foldedMinima4 = vpminq_u8(foldedMinima3, foldedMinima3);
+
+			uint8x16_t foldedMaxima1 = vpmaxq_u8(maxima, maxima);
+			uint8x16_t foldedMaxima2 = vpmaxq_u8(foldedMaxima1, foldedMaxima1);
+			uint8x16_t foldedMaxima3 = vpmaxq_u8(foldedMaxima2, foldedMaxima2);
+			uint8x16_t foldedMaxima4 = vpmaxq_u8(foldedMaxima3, foldedMaxima3);
+
+			minimumIndex = u16(vgetq_lane_u8(foldedMinima4, 0));
+			maximumIndex = u16(vgetq_lane_u8(foldedMaxima4, 0));
+		}
+
+		// If any indices could not be processed cause the buffer size is not 16-byte aligned, process them the naive way
+		// Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them
+		while (vertexCount > 0) {
+			if constexpr (useShortIndices) {
+				u16 index = *reinterpret_cast<u16*>(indexBuffer);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+				indexBuffer += 2;
+			} else {
+				u16 index = u16(*indexBuffer++);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+
+			vertexCount -= 1;
+		}
+
+		return {minimumIndex, maximumIndex};
+	}
+#endif
+
+#if defined(PICA_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
+	template <bool useShortIndices>
+	std::pair<u16, u16> analyzeSSE4_1(u8* indexBuffer, u32 vertexCount) {
+		// We process 16 bytes per iteration, which is 8 vertices if we're using u16
+		// indices or 16 vertices if we're using u8 indices
+		constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16;
+
+		if (vertexCount < vertsPerLoop) {
+			return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
+		}
+
+		u16 minimumIndex, maximumIndex;
+
+		if constexpr (useShortIndices) {
+			// Calculate the horizontal minimum/maximum value across an SSE vector of 16-bit unsigned integers.
+			// Based on https://stackoverflow.com/a/22259607
+			auto horizontalMin16 = [](__m128i vector) -> u16 { return u16(_mm_cvtsi128_si32(_mm_minpos_epu16(vector))); };
+
+			auto horizontalMax16 = [](__m128i vector) -> u16 {
+				// We have an instruction to compute horizontal minimum but not maximum, so we use it.
+				// To use it, we have to subtract each value from 0xFFFF (which we do with an xor), then execute a horizontal minimum
+				__m128i flipped = _mm_xor_si128(vector, _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu));
+				u16 min = u16(_mm_cvtsi128_si32(_mm_minpos_epu16(flipped)));
+				return u16(min ^ 0xffff);
+			};
+
+			// 16-bit indices
+			// Initialize the minima vector to all FFs (So 0xFFFF for each 16-bit lane)
+			// And the maxima vector to all 0s (0 for each 16-bit lane)
+			__m128i minima = _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu);
+			__m128i maxima = _mm_set_epi32(0, 0, 0, 0);
+
+			while (vertexCount >= vertsPerLoop) {
+				const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(indexBuffer));
+				minima = _mm_min_epu16(data, minima);
+				maxima = _mm_max_epu16(data, maxima);
+
+				indexBuffer += 16;
+				vertexCount -= vertsPerLoop;
+			}
+
+			minimumIndex = u16(horizontalMin16(minima));
+			maximumIndex = u16(horizontalMax16(maxima));
+		} else {
+			// Calculate the horizontal minimum/maximum value across an SSE vector of 8-bit unsigned integers.
+			// Based on https://stackoverflow.com/a/22259607
+			auto horizontalMin8 = [](__m128i vector) -> u8 {
+				vector = _mm_min_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(3, 2, 3, 2)));
+				vector = _mm_min_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(1, 1, 1, 1)));
+				vector = _mm_min_epu8(vector, _mm_shufflelo_epi16(vector, _MM_SHUFFLE(1, 1, 1, 1)));
+				vector = _mm_min_epu8(vector, _mm_srli_epi16(vector, 8));
+				return u8(_mm_cvtsi128_si32(vector));
+			};
+
+			auto horizontalMax8 = [](__m128i vector) -> u8 {
+				vector = _mm_max_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(3, 2, 3, 2)));
+				vector = _mm_max_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(1, 1, 1, 1)));
+				vector = _mm_max_epu8(vector, _mm_shufflelo_epi16(vector, _MM_SHUFFLE(1, 1, 1, 1)));
+				vector = _mm_max_epu8(vector, _mm_srli_epi16(vector, 8));
+				return u8(_mm_cvtsi128_si32(vector));
+			};
+
+			// 8-bit indices
+			// Initialize the minima vector to all FFs (So 0xFF for each 8-bit lane)
+			// And the maxima vector to all 0s (0 for each 8-bit lane)
+			__m128i minima = _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu);
+			__m128i maxima = _mm_set_epi32(0, 0, 0, 0);
+
+			while (vertexCount >= vertsPerLoop) {
+				const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(indexBuffer));
+				minima = _mm_min_epu8(data, minima);
+				maxima = _mm_max_epu8(data, maxima);
+
+				indexBuffer += 16;
+				vertexCount -= vertsPerLoop;
+			}
+
+			minimumIndex = u16(horizontalMin8(minima));
+			maximumIndex = u16(horizontalMax8(maxima));
+		}
+
+		// If any indices could not be processed cause the buffer size
+		// is not 16-byte aligned, process them the naive way
+		// Calculate the minimum and maximum indices used in the index
+		// buffer, so we'll only upload them
+		while (vertexCount > 0) {
+			if constexpr (useShortIndices) {
+				u16 index = *reinterpret_cast<u16*>(indexBuffer);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+				indexBuffer += 2;
+			} else {
+				u16 index = u16(*indexBuffer++);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+
+			vertexCount -= 1;
+		}
+
+		return {minimumIndex, maximumIndex};
+	}
+#endif
+
+	// Analyzes a PICA index buffer to get the minimum and maximum indices in the
+	// buffer, and returns them in a pair in the form [min, max]. Takes a template
+	// parameter to decide whether the indices in the buffer are u8 or u16
+	template <bool useShortIndices>
+	std::pair<u16, u16> analyze(u8* indexBuffer, u32 vertexCount) {
+#if defined(PICA_SIMD_ARM64)
+		return analyzeNEON<useShortIndices>(indexBuffer, vertexCount);
+#elif defined(PICA_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
+		// Annoyingly, MSVC refuses to define __SSE4_1__ even when we're building with AVX
+		return analyzeSSE4_1<useShortIndices>(indexBuffer, vertexCount);
+#else
+		return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
+#endif
+	}
+
+	// In some really unfortunate scenarios (eg Android Studio emulator), we don't have access to glDrawRangeElementsBaseVertex
+	// So we need to subtract the base vertex index from every index in the index buffer ourselves
+	// This is not really common, so we do it without SIMD for the moment, just to be able to run on Android Studio
+	template <bool useShortIndices>
+	void subtractBaseIndex(u8* indexBuffer, u32 indexCount, u16 baseIndex) {
+		// Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them
+		if constexpr (useShortIndices) {
+			u16* indexBuffer16 = reinterpret_cast<u16*>(indexBuffer);
+
+			for (u32 i = 0; i < indexCount; i++) {
+				indexBuffer16[i] -= baseIndex;
+			}
+		} else {
+			u8 baseIndex8 = u8(baseIndex);
+
+			for (u32 i = 0; i < indexCount; i++) {
+				indexBuffer[i] -= baseIndex8;
+			}
+		}
+	}
+}  // namespace PICA::IndexBuffer
diff --git a/include/PICA/pica_vert_config.hpp b/include/PICA/pica_vert_config.hpp
new file mode 100644
index 00000000..4300e454
--- /dev/null
+++ b/include/PICA/pica_vert_config.hpp
@@ -0,0 +1,57 @@
+#pragma once
+#include <array>
+#include <cassert>
+#include <cstring>
+#include <type_traits>
+#include <unordered_map>
+
+#include "PICA/pica_hash.hpp"
+#include "PICA/regs.hpp"
+#include "PICA/shader.hpp"
+#include "bitfield.hpp"
+#include "helpers.hpp"
+
+namespace PICA {
+	// Configuration struct used
+	struct VertConfig {
+		PICAHash::HashType shaderHash;
+		PICAHash::HashType opdescHash;
+		u32 entrypoint;
+
+		// PICA registers for configuring shader output->fragment semantic mapping
+		std::array<u32, 7> outmaps{};
+		u16 outputMask;
+		u8 outputCount;
+		bool usingUbershader;
+
+		// Pad to 56 bytes so that the compiler won't insert unnecessary padding, which in turn will affect our unordered_map lookup
+		// As the padding will get hashed and memcmp'd...
+		u32 pad{};
+
+		bool operator==(const VertConfig& config) const {
+			// Hash function and equality operator required by std::unordered_map
+			return std::memcmp(this, &config, sizeof(VertConfig)) == 0;
+		}
+
+		VertConfig(PICAShader& shader, const std::array<u32, 0x300>& regs, bool usingUbershader) : usingUbershader(usingUbershader) {
+			shaderHash = shader.getCodeHash();
+			opdescHash = shader.getOpdescHash();
+			entrypoint = shader.entrypoint;
+
+			outputCount = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
+			outputMask = regs[PICA::InternalRegs::VertexShaderOutputMask];
+			for (int i = 0; i < outputCount; i++) {
+				// Mask out unused bits
+				outmaps[i] = regs[PICA::InternalRegs::ShaderOutmap0 + i] & 0x1F1F1F1F;
+			}
+		}
+	};
+}  // namespace PICA
+
+static_assert(sizeof(PICA::VertConfig) == 56);
+
+// Override std::hash for our vertex config class
+template <>
+struct std::hash<PICA::VertConfig> {
+	std::size_t operator()(const PICA::VertConfig& config) const noexcept { return PICAHash::computeHash((const char*)&config, sizeof(config)); }
+};
\ No newline at end of file
diff --git a/include/PICA/regs.hpp b/include/PICA/regs.hpp
index 636e8f7c..3185d350 100644
--- a/include/PICA/regs.hpp
+++ b/include/PICA/regs.hpp
@@ -396,6 +396,25 @@ namespace PICA {
 		GreaterOrEqual = 7,
 	};
 
+	enum class LogicOpMode : u32 {
+		Clear = 0,
+		And = 1,
+		ReverseAnd = 2,
+		Copy = 3,
+		Set = 4,
+		InvertedCopy = 5,
+		Nop = 6,
+		Invert = 7,
+		Nand = 8,
+		Or = 9,
+		Nor = 10,
+		Xor = 11,
+		Equiv = 12,
+		InvertedAnd = 13,
+		ReverseOr = 14,
+		InvertedOr = 15,
+	};
+
 	enum class FogMode : u32 {
 		Disabled = 0,
 		Fog = 5,
diff --git a/include/PICA/shader.hpp b/include/PICA/shader.hpp
index e5f57c72..1040d2ff 100644
--- a/include/PICA/shader.hpp
+++ b/include/PICA/shader.hpp
@@ -107,6 +107,11 @@ class PICAShader {
 	alignas(16) std::array<vec4f, 16> inputs;           // Attributes passed to the shader
 	alignas(16) std::array<vec4f, 16> outputs;
 	alignas(16) vec4f dummy = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()});  // Dummy register used by the JIT
+	
+	// We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
+	// We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
+	// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
+	using Hash = PICAHash::HashType;
 
   protected:
 	std::array<u32, 128> operandDescriptors;
@@ -125,14 +130,13 @@ class PICAShader {
 	std::array<CallInfo, 4> callInfo;
 	ShaderType type;
 
-	// We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
-	// We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
-	// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
-	using Hash = PICAHash::HashType;
-
 	Hash lastCodeHash = 0;    // Last hash computed for the shader code (Used for the JIT caching mechanism)
 	Hash lastOpdescHash = 0;  // Last hash computed for the operand descriptors (Also used for the JIT)
 
+  public:
+	bool uniformsDirty = false;
+
+  protected:
 	bool codeHashDirty = false;
 	bool opdescHashDirty = false;
 
@@ -284,6 +288,7 @@ class PICAShader {
 				uniform[2] = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16));
 				uniform[3] = f24::fromRaw(floatUniformBuffer[0] >> 8);
 			}
+			uniformsDirty = true;
 		}
 	}
 
@@ -295,6 +300,12 @@ class PICAShader {
 		u[1] = getBits<8, 8>(word);
 		u[2] = getBits<16, 8>(word);
 		u[3] = getBits<24, 8>(word);
+		uniformsDirty = true;
+	}
+
+	void uploadBoolUniform(u32 value) {
+		boolUniform = value;
+		uniformsDirty = true;
 	}
 
 	void run();
@@ -302,6 +313,10 @@ class PICAShader {
 
 	Hash getCodeHash();
 	Hash getOpdescHash();
+
+	// Returns how big the PICA uniforms are combined. Used for hw accelerated shaders where we upload the uniforms to our GPU.
+	static constexpr usize totalUniformSize() { return sizeof(floatUniforms) + sizeof(intUniforms) + sizeof(boolUniform); }
+	void* getUniformPointer() { return static_cast<void*>(&floatUniforms); }
 };
 
 static_assert(
diff --git a/include/PICA/shader_decompiler.hpp b/include/PICA/shader_decompiler.hpp
index 1253226f..4a5cdc13 100644
--- a/include/PICA/shader_decompiler.hpp
+++ b/include/PICA/shader_decompiler.hpp
@@ -1,8 +1,11 @@
 #pragma once
+#include <fmt/format.h>
+
+#include <map>
 #include <set>
 #include <string>
 #include <tuple>
-#include <map>
+#include <utility>
 #include <vector>
 
 #include "PICA/shader.hpp"
@@ -41,9 +44,12 @@ namespace PICA::ShaderGen {
 			explicit Function(u32 start, u32 end) : start(start), end(end) {}
 			bool operator<(const Function& other) const { return AddressRange(start, end) < AddressRange(other.start, other.end); }
 
-			std::string getIdentifier() const { return "func_" + std::to_string(start) + "_to_" + std::to_string(end); }
-			std::string getForwardDecl() const { return "void " + getIdentifier() + "();\n"; }
-			std::string getCallStatement() const { return getIdentifier() + "()"; }
+			std::string getIdentifier() const { return fmt::format("fn_{}_{}", start, end); }
+			// To handle weird control flow, we have to return from each function a bool that indicates whether or not the shader reached an end
+			// instruction and should thus terminate. This is necessary for games like Rayman and Gravity Falls, which have "END" instructions called
+			// from within functions deep in the callstack
+			std::string getForwardDecl() const { return fmt::format("bool fn_{}_{}();\n", start, end); }
+			std::string getCallStatement() const { return fmt::format("fn_{}_{}()", start, end); }
 		};
 
 		std::set<Function> functions{};
@@ -93,9 +99,11 @@ namespace PICA::ShaderGen {
 
 		API api;
 		Language language;
+		bool compilationError = false;
 
 		void compileInstruction(u32& pc, bool& finished);
-		void compileRange(const AddressRange& range);
+		// Compile range "range" and returns the end PC or if we're "finished" with the program (called an END instruction)
+		std::pair<u32, bool> compileRange(const AddressRange& range);
 		void callFunction(const Function& function);
 		const Function* findFunction(const AddressRange& range);
 
@@ -105,6 +113,7 @@ namespace PICA::ShaderGen {
 		std::string getDest(u32 dest) const;
 		std::string getSwizzlePattern(u32 swizzle) const;
 		std::string getDestSwizzle(u32 destinationMask) const;
+		const char* getCondition(u32 cond, u32 refX, u32 refY);
 
 		void setDest(u32 operandDescriptor, const std::string& dest, const std::string& value);
 		// Returns if the instruction uses the typical register encodings most instructions use
diff --git a/include/PICA/shader_gen.hpp b/include/PICA/shader_gen.hpp
index 215e5adb..b6751e05 100644
--- a/include/PICA/shader_gen.hpp
+++ b/include/PICA/shader_gen.hpp
@@ -3,6 +3,7 @@
 
 #include "PICA/gpu.hpp"
 #include "PICA/pica_frag_config.hpp"
+#include "PICA/pica_vert_config.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen_types.hpp"
 #include "helpers.hpp"
@@ -25,11 +26,14 @@ namespace PICA::ShaderGen {
 		bool isSamplerEnabled(u32 environmentID, u32 lutID);
 
 		void compileFog(std::string& shader, const PICA::FragmentConfig& config);
+		void compileLogicOps(std::string& shader, const PICA::FragmentConfig& config);
 
 	  public:
 		FragmentGenerator(API api, Language language) : api(api), language(language) {}
-		std::string generate(const PICA::FragmentConfig& config);
+		std::string generate(const PICA::FragmentConfig& config, void* driverInfo = nullptr);
 		std::string getDefaultVertexShader();
+		// For when PICA shader is acceleration is enabled. Turn the PICA shader source into a proper vertex shader
+		std::string getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader);
 
 		void setTarget(API api, Language language) {
 			this->api = api;
diff --git a/include/PICA/shader_unit.hpp b/include/PICA/shader_unit.hpp
index d8d93160..80e01346 100644
--- a/include/PICA/shader_unit.hpp
+++ b/include/PICA/shader_unit.hpp
@@ -2,10 +2,9 @@
 #include "PICA/shader.hpp"
 
 class ShaderUnit {
-
-public:
-	PICAShader vs; // Vertex shader
-	PICAShader gs; // Geometry shader
+  public:
+	PICAShader vs;  // Vertex shader
+	PICAShader gs;  // Geometry shader
 
 	ShaderUnit() : vs(ShaderType::Vertex), gs(ShaderType::Geometry) {}
 	void reset();
diff --git a/include/align.hpp b/include/align.hpp
new file mode 100644
index 00000000..2f9a33db
--- /dev/null
+++ b/include/align.hpp
@@ -0,0 +1,99 @@
+// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#pragma once
+
+#include <cstdlib>
+
+#include "helpers.hpp"
+
+#ifdef _WIN32
+#include <malloc.h>
+#endif
+
+namespace Common {
+	template <typename T>
+	constexpr bool isAligned(T value, unsigned int alignment) {
+		return (value % static_cast<T>(alignment)) == 0;
+	}
+
+	template <typename T>
+	constexpr T alignUp(T value, unsigned int alignment) {
+		return (value + static_cast<T>(alignment - 1)) / static_cast<T>(alignment) * static_cast<T>(alignment);
+	}
+
+	template <typename T>
+	constexpr T alignDown(T value, unsigned int alignment) {
+		return value / static_cast<T>(alignment) * static_cast<T>(alignment);
+	}
+    
+	template <typename T>
+	constexpr bool isAlignedPow2(T value, unsigned int alignment) {
+		return (value & static_cast<T>(alignment - 1)) == 0;
+	}
+
+	template <typename T>
+	constexpr T alignUpPow2(T value, unsigned int alignment) {
+		return (value + static_cast<T>(alignment - 1)) & static_cast<T>(~static_cast<T>(alignment - 1));
+	}
+
+	template <typename T>
+	constexpr T alignDownPow2(T value, unsigned int alignment) {
+		return value & static_cast<T>(~static_cast<T>(alignment - 1));
+	}
+
+	template <typename T>
+	constexpr bool isPow2(T value) {
+		return (value & (value - 1)) == 0;
+	}
+
+	template <typename T>
+	constexpr T previousPow2(T value) {
+		if (value == static_cast<T>(0)) return 0;
+
+		value |= (value >> 1);
+		value |= (value >> 2);
+		value |= (value >> 4);
+		if constexpr (sizeof(T) >= 16) value |= (value >> 8);
+		if constexpr (sizeof(T) >= 32) value |= (value >> 16);
+		if constexpr (sizeof(T) >= 64) value |= (value >> 32);
+		return value - (value >> 1);
+	}
+    
+	template <typename T>
+	constexpr T nextPow2(T value) {
+		// https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+		if (value == static_cast<T>(0)) return 0;
+
+		value--;
+		value |= (value >> 1);
+		value |= (value >> 2);
+		value |= (value >> 4);
+		if constexpr (sizeof(T) >= 16) value |= (value >> 8);
+		if constexpr (sizeof(T) >= 32) value |= (value >> 16);
+		if constexpr (sizeof(T) >= 64) value |= (value >> 32);
+		value++;
+		return value;
+	}
+
+	ALWAYS_INLINE static void* alignedMalloc(size_t size, size_t alignment) {
+#ifdef _WIN32
+		return _aligned_malloc(size, alignment);
+#else
+		// Unaligned sizes are slow on macOS.
+#ifdef __APPLE__
+		if (isPow2(alignment)) size = (size + alignment - 1) & ~(alignment - 1);
+#endif
+		void* ret = nullptr;
+		return (posix_memalign(&ret, alignment, size) == 0) ? ret : nullptr;
+#endif
+	}
+
+	ALWAYS_INLINE static void alignedFree(void* ptr) {
+#ifdef _MSC_VER
+		_aligned_free(ptr);
+#else
+		free(ptr);
+#endif
+	}
+}  // namespace Common
diff --git a/include/audio/dsp_shared_mem.hpp b/include/audio/dsp_shared_mem.hpp
index e776211d..272edf7e 100644
--- a/include/audio/dsp_shared_mem.hpp
+++ b/include/audio/dsp_shared_mem.hpp
@@ -324,8 +324,8 @@ namespace Audio::HLE {
 			BitField<15, 1, u32> outputBufferCountDirty;
 			BitField<16, 1, u32> masterVolumeDirty;
 
-			BitField<24, 1, u32> auxReturnVolume0Dirty;
-			BitField<25, 1, u32> auxReturnVolume1Dirty;
+			BitField<24, 1, u32> auxVolume0Dirty;
+			BitField<25, 1, u32> auxVolume1Dirty;
 			BitField<26, 1, u32> outputFormatDirty;
 			BitField<27, 1, u32> clippingModeDirty;
 			BitField<28, 1, u32> headphonesConnectedDirty;
@@ -337,7 +337,7 @@ namespace Audio::HLE {
 		/// The DSP has three intermediate audio mixers. This controls the volume level (0.0-1.0) for
 		/// each at the final mixer.
 		float_le masterVolume;
-		std::array<float_le, 2> auxReturnVolume;
+		std::array<float_le, 2> auxVolumes;
 
 		u16_le outputBufferCount;
 		u16 pad1[2];
@@ -422,7 +422,7 @@ namespace Audio::HLE {
 
 	struct DspStatus {
 		u16_le unknown;
-		u16_le dropped_frames;
+		u16_le droppedFrames;
 		u16 pad0[0xE];
 	};
 	ASSERT_DSP_STRUCT(DspStatus, 32);
diff --git a/include/audio/hle_core.hpp b/include/audio/hle_core.hpp
index c36f0500..bd717237 100644
--- a/include/audio/hle_core.hpp
+++ b/include/audio/hle_core.hpp
@@ -95,8 +95,7 @@ namespace Audio {
 		DSPSource() { reset(); }
 	};
 
-	class HLE_DSP : public DSPCore {
-		// The audio frame types are public in case we want to use them for unit tests
+	class DSPMixer {
 	  public:
 		template <typename T, usize channelCount = 1>
 		using Sample = std::array<T, channelCount>;
@@ -113,6 +112,43 @@ namespace Audio {
 		template <typename T>
 		using QuadFrame = Frame<T, 4>;
 
+	  private:
+		using ChannelFormat = HLE::DspConfiguration::OutputFormat;
+		// The audio from each DSP voice is converted to quadraphonic and then fed into 3 intermediate mixing stages
+		// Two of these intermediate mixers (second and third) are used for effects, including custom effects done on the CPU
+		static constexpr usize mixerStageCount = 3;
+
+	  public:
+		ChannelFormat channelFormat = ChannelFormat::Stereo;
+		std::array<float, mixerStageCount> volumes;
+		std::array<bool, 2> enableAuxStages;
+
+		void reset() {
+			channelFormat = ChannelFormat::Stereo;
+
+			volumes.fill(0.0);
+			enableAuxStages.fill(false);
+		}
+	};
+
+	class HLE_DSP : public DSPCore {
+		// The audio frame types are public in case we want to use them for unit tests
+	  public:
+		template <typename T, usize channelCount = 1>
+		using Sample = DSPMixer::Sample<T, channelCount>;
+
+		template <typename T, usize channelCount>
+		using Frame = DSPMixer::Frame<T, channelCount>;
+
+		template <typename T>
+		using MonoFrame = DSPMixer::MonoFrame<T>;
+
+		template <typename T>
+		using StereoFrame = DSPMixer::StereoFrame<T>;
+
+		template <typename T>
+		using QuadFrame = DSPMixer::QuadFrame<T>;
+
 		using Source = Audio::DSPSource;
 		using SampleBuffer = Source::SampleBuffer;
 
@@ -131,6 +167,7 @@ namespace Audio {
 		std::array<Source, Audio::HLE::sourceCount> sources;  // DSP voices
 		Audio::HLE::DspMemory dspRam;
 
+		Audio::DSPMixer mixer;
 		std::unique_ptr<Audio::AAC::Decoder> aacDecoder;
 
 		void resetAudioPipe();
@@ -175,10 +212,13 @@ namespace Audio {
 
 		void handleAACRequest(const AAC::Message& request);
 		void updateSourceConfig(Source& source, HLE::SourceConfiguration::Configuration& config, s16_le* adpcmCoefficients);
+		void updateMixerConfig(HLE::SharedMemory& sharedMem);
 		void generateFrame(StereoFrame<s16>& frame);
 		void generateFrame(DSPSource& source);
 		void outputFrame();
-
+		// Perform the final mix, mixing the quadraphonic samples from all voices into the output audio frame
+		void performMix(Audio::HLE::SharedMemory& readRegion, Audio::HLE::SharedMemory& writeRegion);
+		
 		// Decode an entire buffer worth of audio
 		void decodeBuffer(DSPSource& source);
 
diff --git a/include/config.hpp b/include/config.hpp
index 459f0907..0cffbf93 100644
--- a/include/config.hpp
+++ b/include/config.hpp
@@ -20,18 +20,20 @@ struct EmulatorConfig {
 #else
 	static constexpr bool ubershaderDefault = true;
 #endif
-
+	static constexpr bool accelerateShadersDefault = true;
+	
 	bool shaderJitEnabled = shaderJitDefault;
-	bool discordRpcEnabled = false;
 	bool useUbershaders = ubershaderDefault;
+	bool accelerateShaders = accelerateShadersDefault;
 	bool accurateShaderMul = false;
+	bool discordRpcEnabled = false;
 
 	// Toggles whether to force shadergen when there's more than N lights active and we're using the ubershader, for better performance
 	bool forceShadergenForLights = true;
 	int lightShadergenThreshold = 1;
 
 	RendererType rendererType = RendererType::OpenGL;
-	Audio::DSPCore::Type dspType = Audio::DSPCore::Type::Null;
+	Audio::DSPCore::Type dspType = Audio::DSPCore::Type::HLE;
 
 	bool sdCardInserted = true;
 	bool sdWriteProtected = false;
diff --git a/include/memory.hpp b/include/memory.hpp
index 2f01aa35..bd002c54 100644
--- a/include/memory.hpp
+++ b/include/memory.hpp
@@ -298,5 +298,5 @@ private:
 
 	bool allocateMainThreadStack(u32 size);
 	Regions getConsoleRegion();
-	void copySharedFont(u8* ptr);
+	void copySharedFont(u8* ptr, u32 vaddr);
 };
diff --git a/include/renderdoc.hpp b/include/renderdoc.hpp
index 94a0f494..ea2c8a3d 100644
--- a/include/renderdoc.hpp
+++ b/include/renderdoc.hpp
@@ -35,4 +35,35 @@ namespace Renderdoc {
 	static void setOutputDir(const std::string& path, const std::string& prefix) {}
 	static constexpr bool isSupported() { return false; }
 }  // namespace Renderdoc
-#endif
\ No newline at end of file
+#endif
+
+namespace Renderdoc {
+	// RAII scope class that encloses a Renderdoc capture, as long as it's triggered by triggerCapture
+	struct Scope {
+		Scope() { Renderdoc::startCapture(); }
+		~Scope() { Renderdoc::endCapture(); }
+
+		Scope(const Scope&) = delete;
+		Scope& operator=(const Scope&) = delete;
+
+		Scope(Scope&&) = delete;
+		Scope& operator=(const Scope&&) = delete;
+	};
+
+	// RAII scope class that encloses a Renderdoc capture. Unlike regular Scope it doesn't wait for a trigger, it will always issue the capture
+	// trigger on its own and take a capture
+	struct InstantScope {
+		InstantScope() {
+			Renderdoc::triggerCapture();
+			Renderdoc::startCapture();
+		}
+
+		~InstantScope() { Renderdoc::endCapture(); }
+		
+		InstantScope(const InstantScope&) = delete;
+		InstantScope& operator=(const InstantScope&) = delete;
+
+		InstantScope(InstantScope&&) = delete;
+		InstantScope& operator=(const InstantScope&&) = delete;
+	};
+}  // namespace Renderdoc
\ No newline at end of file
diff --git a/include/renderer.hpp b/include/renderer.hpp
index 4eacf0b1..bc5dfac6 100644
--- a/include/renderer.hpp
+++ b/include/renderer.hpp
@@ -1,9 +1,10 @@
 #pragma once
 #include <array>
+#include <optional>
 #include <span>
 #include <string>
-#include <optional>
 
+#include "PICA/draw_acceleration.hpp"
 #include "PICA/pica_vertex.hpp"
 #include "PICA/regs.hpp"
 #include "helpers.hpp"
@@ -22,9 +23,11 @@ enum class RendererType : s8 {
 };
 
 struct EmulatorConfig;
-class GPU;
 struct SDL_Window;
 
+class GPU;
+class ShaderUnit;
+
 class Renderer {
   protected:
 	GPU& gpu;
@@ -78,7 +81,11 @@ class Renderer {
 	virtual std::string getUbershader() { return ""; }
 	virtual void setUbershader(const std::string& shader) {}
 
-	virtual void setUbershaderSetting(bool value) {}
+	// This function is called on every draw call before parsing vertex data.
+	// It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between
+	// ubershaders and shadergen, and so on.
+	// Returns whether this draw is eligible for using hardware-accelerated shaders or if shaders should run on the CPU
+	virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) { return false; }
 
 	// Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window
 #ifdef PANDA3DS_FRONTEND_QT
diff --git a/include/renderer_gl/gl_driver.hpp b/include/renderer_gl/gl_driver.hpp
new file mode 100644
index 00000000..a15c061f
--- /dev/null
+++ b/include/renderer_gl/gl_driver.hpp
@@ -0,0 +1,12 @@
+#pragma once
+
+// Information about our OpenGL/OpenGL ES driver that we should keep track of
+// Stuff like whether specific extensions are supported, and potentially things like OpenGL context information
+namespace OpenGL {
+	struct Driver {
+		bool supportsExtFbFetch = false;
+		bool supportsArmFbFetch = false;
+
+		bool supportFbFetch() const { return supportsExtFbFetch || supportsArmFbFetch; }
+	};
+}  // namespace OpenGL
\ No newline at end of file
diff --git a/include/renderer_gl/gl_state.hpp b/include/renderer_gl/gl_state.hpp
index e5591ea0..4085cabc 100644
--- a/include/renderer_gl/gl_state.hpp
+++ b/include/renderer_gl/gl_state.hpp
@@ -38,7 +38,6 @@ struct GLStateManager {
 	
 	GLuint stencilMask;
 	GLuint boundVAO;
-	GLuint boundVBO;
 	GLuint currentProgram;
 	GLuint boundUBO;
 
@@ -173,13 +172,6 @@ struct GLStateManager {
 		}
 	}
 
-	void bindVBO(GLuint handle) {
-		if (boundVBO != handle) {
-			boundVBO = handle;
-			glBindBuffer(GL_ARRAY_BUFFER, handle);
-		}
-	}
-
 	void useProgram(GLuint handle) {
 		if (currentProgram != handle) {
 			currentProgram = handle;
@@ -195,7 +187,6 @@ struct GLStateManager {
 	}
 
 	void bindVAO(const OpenGL::VertexArray& vao) { bindVAO(vao.handle()); }
-	void bindVBO(const OpenGL::VertexBuffer& vbo) { bindVBO(vbo.handle()); }
 	void useProgram(const OpenGL::Program& program) { useProgram(program.handle()); }
 
 	void setColourMask(bool r, bool g, bool b, bool a) {
diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp
index 42b8bba1..fab239f2 100644
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@@ -3,15 +3,21 @@
 #include <array>
 #include <cstring>
 #include <functional>
+#include <memory>
+#include <optional>
 #include <span>
 #include <unordered_map>
+#include <utility>
 
 #include "PICA/float_types.hpp"
 #include "PICA/pica_frag_config.hpp"
 #include "PICA/pica_hash.hpp"
+#include "PICA/pica_vert_config.hpp"
 #include "PICA/pica_vertex.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen.hpp"
+#include "gl/stream_buffer.h"
+#include "gl_driver.hpp"
 #include "gl_state.hpp"
 #include "helpers.hpp"
 #include "logger.hpp"
@@ -28,9 +34,11 @@ class RendererGL final : public Renderer {
 	OpenGL::Program triangleProgram;
 	OpenGL::Program displayProgram;
 
-	OpenGL::VertexArray vao;
+	// VAO for when not using accelerated vertex shaders. Contains attribute declarations matching to the PICA fixed function fragment attributes
+	OpenGL::VertexArray defaultVAO;
+	// VAO for when using accelerated vertex shaders. The PICA vertex shader inputs are passed as attributes without CPU processing.
+	OpenGL::VertexArray hwShaderVAO;
 	OpenGL::VertexBuffer vbo;
-	bool enableUbershader = true;
 
 	// Data 
 	struct {
@@ -53,6 +61,21 @@ class RendererGL final : public Renderer {
 	float oldDepthScale = -1.0;
 	float oldDepthOffset = 0.0;
 	bool oldDepthmapEnable = false;
+	// Set by prepareForDraw, tells us whether the current draw is using hw-accelerated shader
+	bool usingAcceleratedShader = false;
+	bool performIndexedRender = false;
+	bool usingShortIndices = false;
+
+	// Set by prepareForDraw, metadata for indexed renders
+	GLuint minimumIndex = 0;
+	GLuint maximumIndex = 0;
+	void* hwIndexBufferOffset = nullptr;
+
+	// When doing hw shaders, we cache which attributes are enabled in our VAO to avoid having to enable/disable all attributes on each draw
+	u32 previousAttributeMask = 0;
+
+	// Cached pointer to the current vertex shader when using HW accelerated shaders
+	OpenGL::Shader* generatedVertexShader = nullptr;
 
 	SurfaceCache<DepthBuffer, 16, true> depthBufferCache;
 	SurfaceCache<ColourBuffer, 16, true> colourBufferCache;
@@ -70,18 +93,58 @@ class RendererGL final : public Renderer {
 	// We can compile this once and then link it with all other generated fragment shaders
 	OpenGL::Shader defaultShadergenVs;
 	GLuint shadergenFragmentUBO;
+	// UBO for uploading the PICA uniforms when using hw shaders
+	GLuint hwShaderUniformUBO;
+
+	using StreamBuffer = OpenGLStreamBuffer;
+	std::unique_ptr<StreamBuffer> hwVertexBuffer;
+	std::unique_ptr<StreamBuffer> hwIndexBuffer;
+
+	// Cache of fixed attribute values so that we don't do any duplicate updates
+	std::array<std::array<float, 4>, 16> fixedAttrValues;
 
 	// Cached recompiled fragment shader
 	struct CachedProgram {
 		OpenGL::Program program;
 	};
-	std::unordered_map<PICA::FragmentConfig, CachedProgram> shaderCache;
+
+	struct ShaderCache {
+		std::unordered_map<PICA::VertConfig, std::optional<OpenGL::Shader>> vertexShaderCache;
+		std::unordered_map<PICA::FragmentConfig, OpenGL::Shader> fragmentShaderCache;
+
+		// Program cache indexed by GLuints for the vertex and fragment shader to use
+		// Top 32 bits are the vertex shader GLuint, bottom 32 bits are the fs GLuint
+		std::unordered_map<u64, CachedProgram> programCache;
+
+		void clear() {
+			for (auto& it : programCache) {
+				CachedProgram& cachedProgram = it.second;
+				cachedProgram.program.free();
+			}
+
+			for (auto& it : vertexShaderCache) {
+				if (it.second.has_value()) {
+					it.second->free();
+				}
+			}
+
+			for (auto& it : fragmentShaderCache) {
+				it.second.free();
+			}
+
+			programCache.clear();
+			vertexShaderCache.clear();
+			fragmentShaderCache.clear();
+		}
+	};
+	ShaderCache shaderCache;
 
 	OpenGL::Framebuffer getColourFBO();
 	OpenGL::Texture getTexture(Texture& tex);
 	OpenGL::Program& getSpecializedShader();
 
 	PICA::ShaderGen::FragmentGenerator fragShaderGen;
+	OpenGL::Driver driverInfo;
 
 	MAKE_LOG_FUNCTION(log, rendererLogger)
 	void setupBlending();
@@ -93,6 +156,8 @@ class RendererGL final : public Renderer {
 	void updateFogLUT();
 	void initGraphicsContextInternal();
 
+	void accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel);
+
   public:
 	RendererGL(GPU& gpu, const std::array<u32, regNum>& internalRegs, const std::array<u32, extRegNum>& externalRegs)
 		: Renderer(gpu, internalRegs, externalRegs), fragShaderGen(PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL) {}
@@ -110,15 +175,13 @@ class RendererGL final : public Renderer {
 	virtual bool supportsShaderReload() override { return true; }
 	virtual std::string getUbershader() override;
 	virtual void setUbershader(const std::string& shader) override;
-
-	virtual void setUbershaderSetting(bool value) override { enableUbershader = value; }
+	virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) override;
 	
 	std::optional<ColourBuffer> getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true);
 
 	// Note: The caller is responsible for deleting the currently bound FBO before calling this
 	void setFBO(uint handle) { screenFramebuffer.m_handle = handle; }
 	void resetStateManager() { gl.reset(); }
-	void clearShaderCache();
 	void initUbershader(OpenGL::Program& program);
 
 #ifdef PANDA3DS_FRONTEND_QT
diff --git a/include/sdl_sensors.hpp b/include/sdl_sensors.hpp
index 6de040ec..e34721af 100644
--- a/include/sdl_sensors.hpp
+++ b/include/sdl_sensors.hpp
@@ -2,31 +2,37 @@
 
 #include <cmath>
 #include <glm/glm.hpp>
-#include <numbers>
 
 #include "helpers.hpp"
 #include "services/hid.hpp"
 
+// Convert SDL sensor readings to 3DS format
+// We use the same code for Android as well, since the values we get from Android are in the same format as SDL (m/s^2 for acceleration, rad/s for
+// rotation)
 namespace Sensors::SDL {
-	// Convert the rotation data we get from SDL sensor events to rotation data we can feed right to HID
-	// Returns [pitch, roll, yaw]
-	static glm::vec3 convertRotation(glm::vec3 rotation) {
-		// Convert the rotation from rad/s to deg/s and scale by the gyroscope coefficient in HID
-		constexpr float scale = 180.f / std::numbers::pi * HIDService::gyroscopeCoeff;
-		// The axes are also inverted, so invert scale before the multiplication.
-		return rotation * -scale;
-	}
+    // Convert the rotation data we get from SDL sensor events to rotation data we can feed right to HID
+    // Returns [pitch, roll, yaw]
+    static glm::vec3 convertRotation(glm::vec3 rotation) {
+        // Annoyingly, Android doesn't support the <numbers> header yet so we define pi ourselves
+        static constexpr double pi = 3.141592653589793;
+        // Convert the rotation from rad/s to deg/s and scale by the gyroscope coefficient in HID
+        constexpr float scale = 180.f / pi * HIDService::gyroscopeCoeff;
+        // The axes are also inverted, so invert scale before the multiplication.
+        return rotation * -scale;
+    }
 
-	static glm::vec3 convertAcceleration(float* data) {
-		// Set our cap to ~9 m/s^2. The 3DS sensors cap at -930 and +930, so values above this value will get clamped to 930
-		// At rest (3DS laid flat on table), hardware reads around ~0 for x and z axis, and around ~480 for y axis due to gravity.
-		// This code tries to mimic this approximately, with offsets based on measurements from my DualShock 4.
-		static constexpr float accelMax = 9.f;
+    static glm::vec3 convertAcceleration(float* data) {
+        // Set our cap to ~9 m/s^2. The 3DS sensors cap at -930 and +930, so values above this value will get clamped to 930
+        // At rest (3DS laid flat on table), hardware reads around ~0 for x and z axis, and around ~480 for y axis due to gravity.
+        // This code tries to mimic this approximately, with offsets based on measurements from my DualShock 4.
+        static constexpr float accelMax = 9.f;
+        // We define standard gravity(g) ourself instead of using the SDL one in order for the code to work on Android too.
+        static constexpr float standardGravity = 9.80665f;
 
-		s16 x = std::clamp<s16>(s16(data[0] / accelMax * 930.f), -930, +930);
-		s16 y = std::clamp<s16>(s16(data[1] / (SDL_STANDARD_GRAVITY * accelMax) * 930.f - 350.f), -930, +930);
-		s16 z = std::clamp<s16>(s16((data[2] - 2.1f) / accelMax * 930.f), -930, +930);
+        s16 x = std::clamp<s16>(s16(data[0] / accelMax * 930.f), -930, +930);
+        s16 y = std::clamp<s16>(s16(data[1] / (standardGravity * accelMax) * 930.f - 350.f), -930, +930);
+        s16 z = std::clamp<s16>(s16((data[2] - 2.1f) / accelMax * 930.f), -930, +930);
 
-		return glm::vec3(x, y, z);
-	}
+        return glm::vec3(x, y, z);
+    }
 }  // namespace Sensors::SDL
diff --git a/include/services/fonts.hpp b/include/services/fonts.hpp
new file mode 100644
index 00000000..9fa84be1
--- /dev/null
+++ b/include/services/fonts.hpp
@@ -0,0 +1,84 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+// Adapted from https://github.com/PabloMK7/citra/blob/master/src/core/hle/service/apt/bcfnt/bcfnt.h
+
+#pragma once
+
+#include <memory>
+
+#include "helpers.hpp"
+#include "swap.hpp"
+
+namespace HLE::Fonts {
+	struct CFNT {
+		u8 magic[4];
+		u16_le endianness;
+		u16_le headerSize;
+		u32_le version;
+		u32_le fileSize;
+		u32_le numBlocks;
+	};
+
+	struct SectionHeader {
+		u8 magic[4];
+		u32_le sectionSize;
+	};
+
+	struct FINF {
+		u8 magic[4];
+		u32_le sectionSize;
+		u8 fontType;
+		u8 lineFeed;
+		u16_le alterCharIndex;
+		u8 default_width[3];
+		u8 encoding;
+		u32_le tglpOffset;
+		u32_le cwdhOffset;
+		u32_le cmapOffset;
+		u8 height;
+		u8 width;
+		u8 ascent;
+		u8 reserved;
+	};
+
+	struct TGLP {
+		u8 magic[4];
+		u32_le sectionSize;
+		u8 cellWidth;
+		u8 cellHeight;
+		u8 baselinePosition;
+		u8 maxCharacterWidth;
+		u32_le sheetSize;
+		u16_le numSheets;
+		u16_le sheetImageFormat;
+		u16_le numColumns;
+		u16_le numRows;
+		u16_le sheetWidth;
+		u16_le sheetHeight;
+		u32_le sheetDataOffset;
+	};
+
+	struct CMAP {
+		u8 magic[4];
+		u32_le sectionSize;
+		u16_le codeBegin;
+		u16_le codeEnd;
+		u16_le mappingMethod;
+		u16_le reserved;
+		u32_le nextCmapOffset;
+	};
+
+	struct CWDH {
+		u8 magic[4];
+		u32_le sectionSize;
+		u16_le startIndex;
+		u16_le endIndex;
+		u32_le nextCwdhOffset;
+	};
+
+	// Relocates the internal addresses of the BCFNT Shared Font to the new base. The current base will
+	// be auto-detected based on the file headers.
+	void relocateSharedFont(u8* sharedFont, u32 newAddress);
+}  // namespace HLE::Fonts
\ No newline at end of file
diff --git a/readme.md b/readme.md
index 5f803bde..3a33fc71 100644
--- a/readme.md
+++ b/readme.md
@@ -1,5 +1,5 @@
 # Panda3DS
-[![Windows Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Windows_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Windows_Build.yml) [![MacOS Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml) [![Linux Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml) [![AUR Package](https://img.shields.io/aur/version/panda3ds-git)](https://aur.archlinux.org/packages/panda3ds-git)
+[![Windows Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Windows_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Windows_Build.yml) [![MacOS Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml) [![Android Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Android_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Android_Build.yml) [![Linux Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml) [![AUR Package](https://img.shields.io/aur/version/panda3ds-git)](https://aur.archlinux.org/packages/panda3ds-git)
 
 Panda3DS is an HLE, red-panda-themed Nintendo 3DS emulator written in C++ which started out as a fun project out of curiosity, but evolved into something that can sort of play games!
 
@@ -10,7 +10,7 @@ Join our Discord server by pressing on the banner below, or find us on other pla
 
 [![Discord Banner 2](https://discord.com/api/guilds/1118695732958994532/widget.png?style=banner2)](https://discord.gg/ZYbugsEmsw)
 
-![screenshot1](docs/img/KirbyRobobot.png) ![screenshot2](docs/img/OoT_Title.png) ![screenshot3](docs/img/pokegang.png)
+![screenshot1](docs/img/KirbyRobobot.png) ![screenshot2](docs/img/OoT_Title.png) ![screenshot3](docs/img/pokegang.png) ![screenshot4](docs/img/KirbyAndroid.png)
 
 # Download
 You can download stable builds from the Releases tab, or you can download the latest build from the tables below. Additionally, Panda3DS comes in 2 flavours on PC: A minimal SDL frontend, which does not have a GUI, and an experimental Qt 6 frontend with a proper user interface.
@@ -22,16 +22,16 @@ SDL builds (No GUI):
 |MacOS build|[![MacOS Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml)|[MacOS App Bundle](https://nightly.link/wheremyfoodat/Panda3DS/workflows/MacOS_Build/master/MacOS%20Alber%20App%20Bundle.zip)|
 |Linux build|[![Linux Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml)|[Linux AppImage](https://nightly.link/wheremyfoodat/Panda3DS/workflows/Linux_AppImage_Build/master/Linux%20executable.zip)|
 
-Qt builds:
+Qt and Android builds:
 |Platform|Status|Download|
 |--------|------------|--------|
 |Windows build|[![Qt Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml/badge.svg)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml)|[Windows Executable](https://nightly.link/wheremyfoodat/Panda3DS/workflows/Qt_Build/master/Windows%20executable.zip)|
 |MacOS build|[![Qt Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml/badge.svg)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml)|[MacOS App Bundle](https://nightly.link/wheremyfoodat/Panda3DS/workflows/Qt_Build/master/MacOS%20Alber%20App%20Bundle.zip)|
 |Linux build|[![Qt Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml/badge.svg)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml)|[Linux AppImage](https://nightly.link/wheremyfoodat/Panda3DS/workflows/Qt_Build/master/Linux%20executable.zip)|
-
+|Android build (arm64)|[![Android Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Android_Build.yml/badge.svg)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Android_Build.yml)|[Android APK](https://nightly.link/wheremyfoodat/Panda3DS/workflows/Android_Build/master/Android%20APKs%20(arm64).zip)|
 
 # Compatibility
-Panda3DS is still in the early stages of development. Many games boot, many don't. Lots of games have at least some hilariously broken graphics, audio is not supported, and some QoL features (including a GUI) are missing. However, even more things are implemented, such as most of the 3DS core required to play games, and various neat features, such as Lua scripting, discord bot support, support for some system apps, cheats, controller support, WIP amiibo support and many more! The emulator is constantly evolving, so make sure to take a peek every now and then!
+Panda3DS is still in the early stages of development. Many games boot, many don't. Lots of games have at least some hilariously broken graphics, audio is WIP, and some QoL features are missing. However, even more things are implemented, such as most of the 3DS core required to play games, and various neat features, such as Lua scripting, discord bot support, support for some system apps, cheats, controller support, WIP amiibo support and many more! The emulator is constantly evolving, so make sure to take a peek every now and then!
 
 For documenting game compatibility, make sure to visit the [games list repository](https://github.com/Panda3DS-emu/Panda3DS-Games-List). For miscellaneous issues or more technical issues, feel free to use this repo's issues tab.
 # Why?
@@ -116,7 +116,7 @@ Panda3DS also supports controller input using the SDL2 GameController API.
 - [MelonDS](https://github.com/melonDS-emu/melonDS): "DS emulator, sorta" - Arisotura
 - [Kaizen](https://github.com/SimoneN64/Kaizen): Experimental work-in-progress low-level N64 emulator
 - [ChonkyStation](https://github.com/liuk7071/ChonkyStation): Work-in-progress PlayStation emulator
-- [shadPS4](https://github.com/georgemoralis/shadPS4): Work-in-progress PS4 emulator by the founder of PCSX, PCSX2 and more
+- [shadPS4](https://github.com/shadps4-emu/shadPS4): Work-in-progress PS4 emulator by the founder of PCSX, PCSX2 and more
 - [Hydra](https://github.com/hydra-emu/hydra): Cross-platform GameBoy, NES, N64 and Chip-8 emulator
 
 # Support
diff --git a/src/config.cpp b/src/config.cpp
index 70f2189c..25fded6c 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -67,6 +67,7 @@ void EmulatorConfig::load() {
 			vsyncEnabled = toml::find_or<toml::boolean>(gpu, "EnableVSync", true);
 			useUbershaders = toml::find_or<toml::boolean>(gpu, "UseUbershaders", ubershaderDefault);
 			accurateShaderMul = toml::find_or<toml::boolean>(gpu, "AccurateShaderMultiplication", false);
+			accelerateShaders = toml::find_or<toml::boolean>(gpu, "AccelerateShaders", accelerateShadersDefault);
 
 			forceShadergenForLights = toml::find_or<toml::boolean>(gpu, "ForceShadergenForLighting", true);
 			lightShadergenThreshold = toml::find_or<toml::integer>(gpu, "ShadergenLightThreshold", 1);
@@ -79,7 +80,7 @@ void EmulatorConfig::load() {
 		if (audioResult.is_ok()) {
 			auto audio = audioResult.unwrap();
 
-			auto dspCoreName = toml::find_or<std::string>(audio, "DSPEmulation", "Null");
+			auto dspCoreName = toml::find_or<std::string>(audio, "DSPEmulation", "HLE");
 			dspType = Audio::DSPCore::typeFromString(dspCoreName);
 			audioEnabled = toml::find_or<toml::boolean>(audio, "EnableAudio", false);
 		}
@@ -141,6 +142,7 @@ void EmulatorConfig::save() {
 	data["GPU"]["UseUbershaders"] = useUbershaders;
 	data["GPU"]["ForceShadergenForLighting"] = forceShadergenForLights;
 	data["GPU"]["ShadergenLightThreshold"] = lightShadergenThreshold;
+	data["GPU"]["AccelerateShaders"] = accelerateShaders;
 	data["GPU"]["EnableRenderdoc"] = enableRenderdoc;
 
 	data["Audio"]["DSPEmulation"] = std::string(Audio::DSPCore::typeToString(dspType));
diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp
new file mode 100644
index 00000000..d7df3b77
--- /dev/null
+++ b/src/core/PICA/draw_acceleration.cpp
@@ -0,0 +1,137 @@
+#include "PICA/draw_acceleration.hpp"
+
+#include <bit>
+#include <tuple>
+
+#include "PICA/gpu.hpp"
+#include "PICA/pica_simd.hpp"
+#include "PICA/regs.hpp"
+
+void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
+	accel.indexed = indexed;
+	accel.totalAttribCount = totalAttribCount;
+	accel.enabledAttributeMask = 0;
+
+	const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16;
+	const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];  // Total # of vertices to transfer
+
+	if (indexed) {
+		u32 indexBufferConfig = regs[PICA::InternalRegs::IndexBufferConfig];
+		u32 indexBufferPointer = vertexBase + (indexBufferConfig & 0xfffffff);
+
+		u8* indexBuffer = getPointerPhys<u8>(indexBufferPointer);
+		u16 minimumIndex = std::numeric_limits<u16>::max();
+		u16 maximumIndex = 0;
+
+		// Check whether the index buffer uses u16 indices or u8
+		accel.useShortIndices = Helpers::getBit<31>(indexBufferConfig);  // Indicates whether vert indices are 16-bit or 8-bit
+
+		// Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them
+		if (accel.useShortIndices) {
+			std::tie(accel.minimumIndex, accel.maximumIndex) = PICA::IndexBuffer::analyze<true>(indexBuffer, vertexCount);
+		} else {
+			std::tie(accel.minimumIndex, accel.maximumIndex) = PICA::IndexBuffer::analyze<false>(indexBuffer, vertexCount);
+		}
+
+		accel.indexBuffer = indexBuffer;
+	} else {
+		accel.indexBuffer = nullptr;
+		accel.minimumIndex = regs[PICA::InternalRegs::VertexOffsetReg];
+		accel.maximumIndex = accel.minimumIndex + vertexCount - 1;
+	}
+
+	const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32);
+	const u64 inputAttrCfg = getVertexShaderInputConfig();
+
+	u32 attrCount = 0;
+	u32 loaderOffset = 0;
+	accel.vertexDataSize = 0;
+	accel.totalLoaderCount = 0;
+
+	for (int i = 0; i < PICA::DrawAcceleration::maxLoaderCount; i++) {
+		auto& loaderData = attributeInfo[i];  // Get information for this attribute loader
+
+		// This loader is empty, skip it
+		if (loaderData.componentCount == 0 || loaderData.size == 0) {
+			continue;
+		}
+
+		auto& loader = accel.loaders[accel.totalLoaderCount++];
+
+		// The size of the loader in bytes is equal to the bytes supplied for 1 vertex, multiplied by the number of vertices we'll be uploading
+		// Which is equal to maximumIndex - minimumIndex + 1
+		const u32 bytes = loaderData.size * (accel.maximumIndex - accel.minimumIndex + 1);
+		loader.size = bytes;
+
+		// Add it to the total vertex data size, aligned to 4 bytes.
+		accel.vertexDataSize += (bytes + 3) & ~3;
+
+		// Get a pointer to the data where this loader's data is stored
+		const u32 loaderAddress = vertexBase + loaderData.offset + (accel.minimumIndex * loaderData.size);
+		loader.data = getPointerPhys<u8>(loaderAddress);
+
+		u64 attrCfg = loaderData.getConfigFull();  // Get config1 | (config2 << 32)
+		u32 attributeOffset = 0;
+
+		for (int component = 0; component < loaderData.componentCount; component++) {
+			uint attributeIndex = (attrCfg >> (component * 4)) & 0xf;  // Get index of attribute in vertexCfg
+
+			// Vertex attributes used as padding
+			// 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively
+			if (attributeIndex >= 12) [[unlikely]] {
+				// Align attribute address up to a 4 byte boundary
+				attributeOffset = (attributeOffset + 3) & -4;
+				attributeOffset += (attributeIndex - 11) << 2;
+				continue;
+			}
+
+			const u32 attribInfo = (vertexCfg >> (attributeIndex * 4)) & 0xf;
+			const u32 attribType = attribInfo & 0x3;  //  Type of attribute (sbyte/ubyte/short/float)
+			const u32 size = (attribInfo >> 2) + 1;   // Total number of components
+
+			// Size of each component based on the attribute type
+			static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
+			const u32 inputReg = (inputAttrCfg >> (attributeIndex * 4)) & 0xf;
+			// Mark the attribute as enabled
+			accel.enabledAttributeMask |= 1 << inputReg;
+
+			auto& attr = accel.attributeInfo[inputReg];
+			attr.componentCount = size;
+			attr.offset = attributeOffset + loaderOffset;
+			attr.stride = loaderData.size;
+			attr.type = attribType;
+			attributeOffset += size * sizePerComponent[attribType];
+		}
+
+		loaderOffset += loader.size;
+	}
+
+	u32 fixedAttributes = fixedAttribMask;
+	accel.fixedAttributes = 0;
+
+	// Fetch values for all fixed attributes using CLZ on the fixed attribute mask to find the attributes that are actually fixed
+	while (fixedAttributes != 0) {
+		// Get index of next fixed attribute and turn it off
+		const u32 index = std::countr_zero<u32>(fixedAttributes);
+		const u32 mask = 1u << index;
+		fixedAttributes ^= mask;
+
+		// PICA register this fixed attribute is meant to go to
+		const u32 inputReg = (inputAttrCfg >> (index * 4)) & 0xf;
+		const u32 inputRegMask = 1u << inputReg;
+
+		// If this input reg is already used for a non-fixed attribute then it will not be replaced by a fixed attribute
+		if ((accel.enabledAttributeMask & inputRegMask) == 0) {
+			vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[index];
+			auto& attr = accel.attributeInfo[inputReg];
+
+			accel.fixedAttributes |= inputRegMask;
+
+			for (int i = 0; i < 4; i++) {
+				attr.fixedValue[i] = fixedAttr[i].toFloat32();
+			}
+		}
+	}
+
+	accel.canBeAccelerated = true;
+}
\ No newline at end of file
diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp
index 71d5e2e7..838d3fb3 100644
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@@ -126,37 +126,62 @@ void GPU::reset() {
 	externalRegs[Framebuffer1Config] = static_cast<u32>(PICA::ColorFmt::RGB8);
 	externalRegs[Framebuffer1Select] = 0;
 
-	renderer->setUbershaderSetting(config.useUbershaders);
 	renderer->reset();
 }
 
-// Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
-// And whether we are going to use the shader JIT (second template parameter)
-void GPU::drawArrays(bool indexed) {
-	const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
-
-	if (indexed) {
-		if (shaderJITEnabled)
-			drawArrays<true, true>();
-		else
-			drawArrays<true, false>();
-	} else {
-		if (shaderJITEnabled)
-			drawArrays<false, true>();
-		else
-			drawArrays<false, false>();
-	}
-}
-
 static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
 
-template <bool indexed, bool useShaderJIT>
-void GPU::drawArrays() {
-	if constexpr (useShaderJIT) {
-		shaderJIT.prepare(shaderUnit.vs);
+// Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
+// And whether we are going to use the shader JIT (second template parameter)
+void GPU::drawArrays(bool indexed) {
+	PICA::DrawAcceleration accel;
+
+	if (config.accelerateShaders) {
+		// If we are potentially going to use hw shaders, gather necessary to do vertex fetch, index buffering, etc on the GPU
+		// This includes parsing which vertices to upload, getting pointers to the index buffer data & vertex data, and so on 
+		getAcceleratedDrawInfo(accel, indexed);
 	}
 
-	setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]);
+	const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel);
+
+	if (hwShaders) {
+		// Hardware shaders have their own accelerated code path for draws, so they skip everything here
+		const PICA::PrimType primType = static_cast<PICA::PrimType>(Helpers::getBits<8, 2>(regs[PICA::InternalRegs::PrimitiveConfig]));
+		// Total # of vertices to render
+		const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];
+
+		// Note: In the hardware shader path the vertices span shouldn't actually be used as the renderer will perform its own attribute fetching
+		renderer->drawVertices(primType, std::span(vertices).first(vertexCount));
+	} else {
+		const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
+
+		if (indexed) {
+			if (shaderJITEnabled) {
+				drawArrays<true, ShaderExecMode::JIT>();
+			} else {
+				drawArrays<true, ShaderExecMode::Interpreter>();
+			}
+		} else {
+			if (shaderJITEnabled) {
+				drawArrays<false, ShaderExecMode::JIT>();
+			} else {
+				drawArrays<false, ShaderExecMode::Interpreter>();
+			}
+		}
+	}
+}
+
+template <bool indexed, ShaderExecMode mode>
+void GPU::drawArrays() {
+	if constexpr (mode == ShaderExecMode::JIT) {
+		shaderJIT.prepare(shaderUnit.vs);
+	} else if constexpr (mode == ShaderExecMode::Hardware) {
+		// Hardware shaders have their own accelerated code path for draws, so they're not meant to take this path
+		Helpers::panic("GPU::DrawArrays: Hardware shaders shouldn't take this path!");
+	}
+
+	// We can have up to 16 attributes, each one consisting of 4 floats
+	constexpr u32 maxAttrSizeInFloats = 16 * 4;
 
 	// Base address for vertex attributes
 	// The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible
@@ -321,8 +346,6 @@ void GPU::drawArrays() {
 					}
 
 					// Fill the remaining attribute lanes with default parameters (1.0 for alpha/w, 0.0) for everything else
-					// Corgi does this although I'm not sure if it's actually needed for anything.
-					// TODO: Find out
 					while (component < 4) {
 						attribute[component] = (component == 3) ? f24::fromFloat32(1.0) : f24::fromFloat32(0.0);
 						component++;
@@ -336,13 +359,13 @@ void GPU::drawArrays() {
 
 		// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
 		// Based on the SH_ATTRIBUTES_PERMUTATION registers.
-		// Ie it might attribute #0 to v2, #1 to v7, etc
+		// Ie it might map attribute #0 to v2, #1 to v7, etc
 		for (int j = 0; j < totalAttribCount; j++) {
 			const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
 			std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
 		}
 
-		if constexpr (useShaderJIT) {
+		if constexpr (mode == ShaderExecMode::JIT) {
 			shaderJIT.run(shaderUnit.vs);
 		} else {
 			shaderUnit.vs.run();
diff --git a/src/core/PICA/regs.cpp b/src/core/PICA/regs.cpp
index f805de60..4c865d12 100644
--- a/src/core/PICA/regs.cpp
+++ b/src/core/PICA/regs.cpp
@@ -249,6 +249,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 						// If we've reached 3 verts, issue a draw call
 						// Handle rendering depending on the primitive type
 						if (immediateModeVertIndex == 3) {
+							renderer->prepareForDraw(shaderUnit, nullptr);
 							renderer->drawVertices(PICA::PrimType::TriangleList, immediateModeVertices);
 
 							switch (primType) {
@@ -300,7 +301,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 		}
 
 		case VertexBoolUniform: {
-			shaderUnit.vs.boolUniform = value & 0xffff;
+			shaderUnit.vs.uploadBoolUniform(value & 0xffff);
 			break;
 		}
 
diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 482aa36c..467c4727 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -1,5 +1,10 @@
 #include "PICA/shader_decompiler.hpp"
 
+#include <fmt/format.h>
+
+#include <array>
+#include <cassert>
+
 #include "config.hpp"
 
 using namespace PICA;
@@ -13,11 +18,45 @@ void ControlFlow::analyze(const PICAShader& shader, u32 entrypoint) {
 	analysisFailed = false;
 
 	const Function* function = addFunction(shader, entrypoint, PICAShader::maxInstructionCount);
-	if (function == nullptr) {
+	if (function == nullptr || function->exitMode != ExitMode::AlwaysEnd) {
 		analysisFailed = true;
 	}
 }
 
+// Helpers for merging parallel/series exit methods from Citra
+// Merges exit method of two parallel branches.
+static ExitMode exitParallel(ExitMode a, ExitMode b) {
+	if (a == ExitMode::Unknown) {
+		return b;
+	}
+	else if (b == ExitMode::Unknown) {
+		return a;
+	}
+	else if (a == b) {
+		return a;
+	}
+	return ExitMode::Conditional;
+}
+
+// Cascades exit method of two blocks of code.
+static ExitMode exitSeries(ExitMode a, ExitMode b) {
+	assert(a != ExitMode::AlwaysEnd);
+
+	if (a == ExitMode::Unknown) {
+		return ExitMode::Unknown;
+	}
+
+	if (a == ExitMode::AlwaysReturn) {
+		return b;
+	}
+
+	if (b == ExitMode::Unknown || b == ExitMode::AlwaysEnd) {
+		return ExitMode::AlwaysEnd;
+	}
+
+	return ExitMode::Conditional;
+}
+
 ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 end, Function::Labels& labels) {
 	// Initialize exit mode to unknown by default, in order to detect things like unending loops
 	auto [it, inserted] = exitMap.emplace(AddressRange(start, end), ExitMode::Unknown);
@@ -32,25 +71,132 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 		const u32 opcode = instruction >> 26;
 
 		switch (opcode) {
-			case ShaderOpcodes::JMPC: Helpers::panic("Unimplemented control flow operation (JMPC)");
-			case ShaderOpcodes::JMPU: Helpers::panic("Unimplemented control flow operation (JMPU)");
-			case ShaderOpcodes::IFU: Helpers::panic("Unimplemented control flow operation (IFU)");
-			case ShaderOpcodes::IFC: Helpers::panic("Unimplemented control flow operation (IFC)");
-			case ShaderOpcodes::CALL: Helpers::panic("Unimplemented control flow operation (CALL)");
-			case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)");
-			case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)");
-			case ShaderOpcodes::LOOP: Helpers::panic("Unimplemented control flow operation (LOOP)");
-			case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second;
+			case ShaderOpcodes::JMPC:
+			case ShaderOpcodes::JMPU: {
+				const u32 dest = getBits<10, 12>(instruction);
+				// Register this jump address to our outLabels set
+				labels.insert(dest);
 
+				// This opens up 2 parallel paths of execution
+				auto branchTakenExit = analyzeFunction(shader, dest, end, labels);
+				auto branchNotTakenExit = analyzeFunction(shader, pc + 1, end, labels);
+				it->second = exitParallel(branchTakenExit, branchNotTakenExit);
+				return it->second;
+			}
+
+			case ShaderOpcodes::IFU:
+			case ShaderOpcodes::IFC: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+
+				const Function* branchTakenFunc = addFunction(shader, pc + 1, dest);
+				// Check if analysis of the branch taken func failed and return unknown if it did
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				// Next analyze the not taken func
+				ExitMode branchNotTakenExitMode = ExitMode::AlwaysReturn;
+				if (num != 0) {
+					const Function* branchNotTakenFunc = addFunction(shader, dest, dest + num);
+					// Check if analysis failed and return unknown if it did
+					if (analysisFailed) {
+						it->second = ExitMode::Unknown;
+						return it->second;
+					}
+
+					branchNotTakenExitMode = branchNotTakenFunc->exitMode;
+				}
+
+				auto parallel = exitParallel(branchTakenFunc->exitMode, branchNotTakenExitMode);
+				// Both branches of the if/else end, so there's nothing after the call
+				if (parallel == ExitMode::AlwaysEnd) {
+					it->second = parallel;
+					return it->second;
+				} else {
+					ExitMode afterConditional = analyzeFunction(shader, dest + num, end, labels);
+					ExitMode conditionalExitMode = exitSeries(parallel, afterConditional);
+					it->second = conditionalExitMode;
+					return it->second;
+				}
+				break;
+			}
+
+			case ShaderOpcodes::CALL: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* calledFunction = addFunction(shader, dest, dest + num);
+
+				// Check if analysis of the branch taken func failed and return unknown if it did
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				if (calledFunction->exitMode == ExitMode::AlwaysEnd) {
+					it->second = ExitMode::AlwaysEnd;
+					return it->second;
+				}
+
+				// Exit mode of the remainder of this function, after we return from the callee
+				const ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels);
+				const ExitMode exitMode = exitSeries(calledFunction->exitMode, postCallExitMode);
+
+				it->second = exitMode;
+				return exitMode;
+			}
+
+			case ShaderOpcodes::CALLC:
+			case ShaderOpcodes::CALLU: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* calledFunction = addFunction(shader, dest, dest + num);
+
+				// Check if analysis of the branch taken func failed and return unknown if it did
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				// Exit mode of the remainder of this function, after we return from the callee
+				const ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels);
+				const ExitMode exitMode = exitSeries(exitParallel(calledFunction->exitMode, ExitMode::AlwaysReturn), postCallExitMode);
+
+				it->second = exitMode;
+				return exitMode;
+			}
+
+			case ShaderOpcodes::LOOP: {
+				u32 dest = getBits<10, 12>(instruction);
+				const Function* loopFunction = addFunction(shader, pc + 1, dest + 1);
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				if (loopFunction->exitMode == ExitMode::AlwaysEnd) {
+					it->second = ExitMode::AlwaysEnd;
+					return it->second;
+				}
+
+				const ExitMode afterLoop = analyzeFunction(shader, dest + 1, end, labels);
+				const ExitMode exitMode = exitSeries(loopFunction->exitMode, afterLoop);
+				it->second = exitMode;
+				return it->second;
+			}
+
+			case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second;
 			default: break;
 		}
 	}
 
 	// A function without control flow instructions will always reach its "return point" and return
-	return ExitMode::AlwaysReturn;
+	it->second = ExitMode::AlwaysReturn;
+	return it->second;
 }
 
-void ShaderDecompiler::compileRange(const AddressRange& range) {
+std::pair<u32, bool> ShaderDecompiler::compileRange(const AddressRange& range) {
 	u32 pc = range.start;
 	const u32 end = range.end >= range.start ? range.end : PICAShader::maxInstructionCount;
 	bool finished = false;
@@ -58,6 +204,8 @@ void ShaderDecompiler::compileRange(const AddressRange& range) {
 	while (pc < end && !finished) {
 		compileInstruction(pc, finished);
 	}
+
+	return std::make_pair(pc, finished);
 }
 
 const Function* ShaderDecompiler::findFunction(const AddressRange& range) {
@@ -71,20 +219,43 @@ const Function* ShaderDecompiler::findFunction(const AddressRange& range) {
 }
 
 void ShaderDecompiler::writeAttributes() {
+	// Annoyingly, GLES does not support having an array as an input attribute, so declare each attribute separately for now
 	decompiledShader += R"(
-		layout(location = 0) in vec4 inputs[8];
+	layout(location = 0) in vec4 attr0;
+	layout(location = 1) in vec4 attr1;
+	layout(location = 2) in vec4 attr2;
+	layout(location = 3) in vec4 attr3;
+	layout(location = 4) in vec4 attr4;
+	layout(location = 5) in vec4 attr5;
+	layout(location = 6) in vec4 attr6;
+	layout(location = 7) in vec4 attr7;
+	layout(location = 8) in vec4 attr8;
+	layout(location = 9) in vec4 attr9;
+	layout(location = 10) in vec4 attr10;
+	layout(location = 11) in vec4 attr11;
+	layout(location = 12) in vec4 attr12;
+	layout(location = 13) in vec4 attr13;
+	layout(location = 14) in vec4 attr14;
+	layout(location = 15) in vec4 attr15;
 
-		layout(std140) uniform PICAShaderUniforms {
-			vec4 uniform_float[96];
-			uvec4 uniform_int;
-			uint uniform_bool;
-		};
-	
-		vec4 temp_registers[16];
-		vec4 dummy_vec = vec4(0.0);
+	layout(std140) uniform PICAShaderUniforms {
+		vec4 uniform_f[96];
+		uvec4 uniform_i;
+		uint uniform_bool;
+	};
+
+	vec4 temp[16];
+	vec4 out_regs[16];
+	vec4 dummy_vec = vec4(0.0);
+	ivec3 addr_reg = ivec3(0);
+	bvec2 cmp_reg = bvec2(false);
+
+	vec4 uniform_indexed(int source, int offset) {
+		int clipped_offs = (offset >= -128 && offset <= 127) ? offset : 0;
+		uint index = uint(clipped_offs + source) & 127u;
+		return (index < 96u) ? uniform_f[index] : vec4(1.0);
+	}
 )";
-
-	decompiledShader += "\n";
 }
 
 std::string ShaderDecompiler::decompile() {
@@ -94,11 +265,14 @@ std::string ShaderDecompiler::decompile() {
 		return "";
 	}
 
-	decompiledShader = "";
+	compilationError = false;
+	decompiledShader.clear();
+	// Reserve some memory for the shader string to avoid memory allocations
+	decompiledShader.reserve(256 * 1024);
 
 	switch (api) {
 		case API::GL: decompiledShader += "#version 410 core\n"; break;
-		case API::GLES: decompiledShader += "#version 300 es\n"; break;
+		case API::GLES: decompiledShader += "#version 300 es\nprecision mediump float;\nprecision mediump int;\n"; break;
 		default: break;
 	}
 
@@ -109,7 +283,7 @@ std::string ShaderDecompiler::decompile() {
 		decompiledShader += R"(
 			vec4 safe_mul(vec4 a, vec4 b) {
 				vec4 res = a * b;
-				return mix(res, mix(mix(vec4(0.0), res, isnan(rhs)), product, isnan(lhs)), isnan(res));
+				return mix(res, mix(mix(vec4(0.0), res, isnan(b)), res, isnan(a)), isnan(res));
 			}
 		)";
 	}
@@ -121,17 +295,61 @@ std::string ShaderDecompiler::decompile() {
 
 	decompiledShader += "void pica_shader_main() {\n";
 	AddressRange mainFunctionRange(entrypoint, PICAShader::maxInstructionCount);
-	callFunction(*findFunction(mainFunctionRange));
-	decompiledShader += "}\n";
+	auto mainFunc = findFunction(mainFunctionRange);
 
-	for (auto& func : controlFlow.functions) {
-		if (func.outLabels.size() > 0) {
-			Helpers::panic("Function with out labels");
+	decompiledShader += mainFunc->getCallStatement() + ";\n}\n";
+
+	for (const Function& func : controlFlow.functions) {
+		if (func.outLabels.empty()) {
+			decompiledShader += fmt::format("bool {}() {{\n", func.getIdentifier());
+
+			auto [pc, finished] = compileRange(AddressRange(func.start, func.end));
+			if (!finished) {
+				decompiledShader += "return false;";
+			}
+
+			decompiledShader += "}\n";
+		} else {
+			auto labels = func.outLabels;
+			labels.insert(func.start);
+
+			// If a function has jumps and "labels", this needs to be emulated using a switch-case, with the variable being switched on being the
+			// current PC
+			decompiledShader += fmt::format("bool {}() {{\n", func.getIdentifier());
+			decompiledShader += fmt::format("uint pc = {}u;\n", func.start);
+			decompiledShader += "while(true){\nswitch(pc){\n";
+
+			for (u32 label : labels) {
+				decompiledShader += fmt::format("case {}u: {{", label);
+				// Fetch the next label whose address > label
+				auto it = labels.lower_bound(label + 1);
+				u32 next = (it == labels.end()) ? func.end : *it;
+
+				auto [endPC, finished] = compileRange(AddressRange(label, next));
+				if (endPC > next && !finished) {
+					labels.insert(endPC);
+					decompiledShader += fmt::format("pc = {}u; break;", endPC);
+				}
+
+				// Fallthrough to next label
+				decompiledShader += "}\n";
+			}
+
+			decompiledShader += "default: return false;\n";
+			// Exit the switch and loop
+			decompiledShader += "} }\n";
+
+			// Exit the function
+			decompiledShader += "return false;\n";
+			decompiledShader += "}\n";
 		}
+	}
 
-		decompiledShader += "void " + func.getIdentifier() + "() {\n";
-		compileRange(AddressRange(func.start, func.end));
-		decompiledShader += "}\n";
+	// We allow some leeway for "compilation errors" in addition to control flow errors, in cases where eg an unimplemented instruction
+	// or an instruction that we can't emulate in GLSL is found in the instruction stream. Just like control flow errors, these return an empty string
+	// and the renderer core will decide to use CPU shaders instead
+	if (compilationError) [[unlikely]] {
+		return "";
 	}
 
 	return decompiledShader;
@@ -139,30 +357,41 @@ std::string ShaderDecompiler::decompile() {
 
 std::string ShaderDecompiler::getSource(u32 source, [[maybe_unused]] u32 index) const {
 	if (source < 0x10) {
-		return "inputs[" + std::to_string(source) + "]";
+		return "attr" + std::to_string(source);
 	} else if (source < 0x20) {
-		return "temp_registers[" + std::to_string(source - 0x10) + "]";
+		return "temp[" + std::to_string(source - 0x10) + "]";
 	} else {
 		const usize floatIndex = (source - 0x20) & 0x7f;
 
-		if (floatIndex >= 96) [[unlikely]] {
-			return "dummy_vec";
+		if (index == 0) {
+			if (floatIndex >= 96) [[unlikely]] {
+				return "dummy_vec";
+			}
+			return "uniform_f[" + std::to_string(floatIndex) + "]";
+		} else {
+			static constexpr std::array<const char*, 4> offsets = {"0", "addr_reg.x", "addr_reg.y", "addr_reg.z"};
+			return fmt::format("uniform_indexed({}, {})", floatIndex, offsets[index]);
 		}
-		return "uniform_float[" + std::to_string(floatIndex) + "]";
 	}
 }
 
 std::string ShaderDecompiler::getDest(u32 dest) const {
 	if (dest < 0x10) {
-		return "output_registers[" + std::to_string(dest) + "]";
+		return "out_regs[" + std::to_string(dest) + "]";
 	} else if (dest < 0x20) {
-		return "temp_registers[" + std::to_string(dest - 0x10) + "]";
+		return "temp[" + std::to_string(dest - 0x10) + "]";
 	} else {
 		return "dummy_vec";
 	}
 }
 
 std::string ShaderDecompiler::getSwizzlePattern(u32 swizzle) const {
+	// If the swizzle field is this value then the swizzle pattern is .xyzw so we don't need a shuffle
+	static constexpr uint noSwizzle = 0x1B;
+	if (swizzle == noSwizzle) {
+		return "";
+	}
+
 	static constexpr std::array<char, 4> names = {'x', 'y', 'z', 'w'};
 	std::string ret(".    ");
 	
@@ -176,7 +405,6 @@ std::string ShaderDecompiler::getSwizzlePattern(u32 swizzle) const {
 
 std::string ShaderDecompiler::getDestSwizzle(u32 destinationMask) const {
 	std::string ret = ".";
-	
 	if (destinationMask & 0b1000) {
 		ret += "x";
 	}
@@ -208,11 +436,12 @@ void ShaderDecompiler::setDest(u32 operandDescriptor, const std::string& dest, c
 		return;
 	}
 
-	decompiledShader += dest + destSwizzle + " = ";
-	if (writtenLaneCount == 1) {
-		decompiledShader += "float(" + value + ");\n";
-	} else {
-		decompiledShader += "vec" + std::to_string(writtenLaneCount) + "(" + value + ");\n";
+	// Don't write destination swizzle if all lanes are getting written to
+	decompiledShader += fmt::format("{}{} = ", dest, writtenLaneCount == 4 ? "" : destSwizzle);
+	if (writtenLaneCount <= 3) {
+		decompiledShader += fmt::format("({}){};\n", value, destSwizzle);
+	} else if (writtenLaneCount == 4) {
+		decompiledShader += fmt::format("{};\n", value);
 	}
 }
 
@@ -246,26 +475,101 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 
 		std::string dest = getDest(destIndex);
 
-		if (idx != 0) {
-			Helpers::panic("GLSL recompiler: Indexed instruction");
-		}
-
-		if (invertSources) {
-			Helpers::panic("GLSL recompiler: Inverted instruction");
-		}
-
 		switch (opcode) {
 			case ShaderOpcodes::MOV: setDest(operandDescriptor, dest, src1); break;
-			case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, src1 + " + " + src2); break;
-			case ShaderOpcodes::MUL: setDest(operandDescriptor, dest, src1 + " * " + src2); break;
-			case ShaderOpcodes::MAX: setDest(operandDescriptor, dest, "max(" + src1 + ", " + src2 + ")"); break;
-			case ShaderOpcodes::MIN: setDest(operandDescriptor, dest, "min(" + src1 + ", " + src2 + ")"); break;
+			case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, fmt::format("{} + {}", src1, src2)); break;
+			case ShaderOpcodes::MUL:
+				if (!config.accurateShaderMul) {
+					setDest(operandDescriptor, dest, fmt::format("{} * {}", src1, src2));
+				} else {
+					setDest(operandDescriptor, dest, fmt::format("safe_mul({}, {})", src1, src2));
+				}
+				break;
+			case ShaderOpcodes::MAX: setDest(operandDescriptor, dest, fmt::format("max({}, {})", src1, src2)); break;
+			case ShaderOpcodes::MIN: setDest(operandDescriptor, dest, fmt::format("min({}, {})", src1, src2)); break;
 
-			case ShaderOpcodes::DP3: setDest(operandDescriptor, dest, "vec4(dot(" + src1 + ".xyz, " + src2 + ".xyz))"); break;
-			case ShaderOpcodes::DP4: setDest(operandDescriptor, dest, "vec4(dot(" + src1 + ", " + src2 + "))"); break;
-			case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, "vec4(inversesqrt(" + src1 + ".x))"); break;
+			case ShaderOpcodes::DP3:
+				if (!config.accurateShaderMul) {
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot({}.xyz, {}.xyz))", src1, src2));
+				} else {
+					// A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec3(1.0)
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul({}, {}).xyz, vec3(1.0)))", src1, src2));
+				}
+				break;
+			case ShaderOpcodes::DP4:
+				if (!config.accurateShaderMul) {
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot({}, {}))", src1, src2));
+				} else {
+					// A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec4(1.0)
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul({}, {}), vec4(1.0)))", src1, src2));
+				}
+				break;
+			case ShaderOpcodes::FLR: setDest(operandDescriptor, dest, fmt::format("floor({})", src1)); break;
+			case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, fmt::format("vec4(inversesqrt({}.x))", src1)); break;
+			case ShaderOpcodes::RCP: setDest(operandDescriptor, dest, fmt::format("vec4(1.0 / {}.x)", src1)); break;
+			case ShaderOpcodes::LG2: setDest(operandDescriptor, dest, fmt::format("vec4(log2({}.x))", src1)); break;
+			case ShaderOpcodes::EX2: setDest(operandDescriptor, dest, fmt::format("vec4(exp2({}.x))", src1)); break;
 
-			default: Helpers::panic("GLSL recompiler: Unknown common opcode: %X", opcode); break;
+			case ShaderOpcodes::SLT:
+			case ShaderOpcodes::SLTI: setDest(operandDescriptor, dest, fmt::format("vec4(lessThan({}, {}))", src1, src2)); break;
+
+			case ShaderOpcodes::SGE:
+			case ShaderOpcodes::SGEI: setDest(operandDescriptor, dest, fmt::format("vec4(greaterThanEqual({}, {}))", src1, src2)); break;
+
+			case ShaderOpcodes::DPH:
+			case ShaderOpcodes::DPHI:
+				if (!config.accurateShaderMul) {
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot(vec4({}.xyz, 1.0), {}))", src1, src2));
+				} else {
+					// A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec4(1.0)
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul(vec4({}.xyz, 1.0), {}), vec4(1.0)))", src1, src2));
+				}
+				break;
+
+			case ShaderOpcodes::CMP1:
+			case ShaderOpcodes::CMP2: {
+				static constexpr std::array<const char*, 8> operators = {
+					// The last 2 operators always return true and are handled specially
+					"==", "!=", "<", "<=", ">", ">=", "", "",
+				};
+
+				const u32 cmpY = getBits<21, 3>(instruction);
+				const u32 cmpX = getBits<24, 3>(instruction);
+
+				// Compare x first
+				if (cmpX >= 6) {
+					decompiledShader += "cmp_reg.x = true;\n";
+				} else {
+					decompiledShader += fmt::format("cmp_reg.x = {}.x {} {}.x;\n", src1, operators[cmpX], src2);
+				}
+
+				// Then compare Y
+				if (cmpY >= 6) {
+					decompiledShader += "cmp_reg.y = true;\n";
+				} else {
+					decompiledShader += fmt::format("cmp_reg.y = {}.y {} {}.y;\n", src1, operators[cmpY], src2);
+				}
+				break;
+			}
+
+			case ShaderOpcodes::MOVA: {
+				const bool writeX = getBit<3>(operandDescriptor);  // Should we write the x component of the address register?
+				const bool writeY = getBit<2>(operandDescriptor);
+
+				if (writeX && writeY) {
+					decompiledShader += fmt::format("addr_reg.xy = ivec2({}.xy);\n", src1);
+				} else if (writeX) {
+					decompiledShader += fmt::format("addr_reg.x = int({}.x);\n", src1);
+				} else if (writeY) {
+					decompiledShader += fmt::format("addr_reg.y = int({}.y);\n", src1);
+				}
+				break;
+			}
+
+			default:
+				Helpers::warn("GLSL recompiler: Unknown common opcode: %02X. Falling back to CPU shaders", opcode);
+				compilationError = true;
+				break;
 		}
 	} else if (opcode >= 0x30 && opcode <= 0x3F) { // MAD and MADI
 		const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x1f];
@@ -299,23 +603,156 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 		src3 += getSwizzlePattern(swizzle3);
 
 		std::string dest = getDest(destIndex);
-
-		if (idx != 0) {
-			Helpers::panic("GLSL recompiler: Indexed instruction");
+		if (!config.accurateShaderMul) {
+			setDest(operandDescriptor, dest, fmt::format("{} * {} + {}", src1, src2, src3));
+		} else {
+			setDest(operandDescriptor, dest, fmt::format("safe_mul({}, {}) + {}", src1, src2, src3));
 		}
-
-		setDest(operandDescriptor, dest, src1 + " * " + src2 + " + " + src3);
 	} else {
 		switch (opcode) {
-			case ShaderOpcodes::END: finished = true; return;
-			default: Helpers::panic("GLSL recompiler: Unknown opcode: %X", opcode); break;
+			case ShaderOpcodes::JMPC: {
+				const u32 dest = getBits<10, 12>(instruction);
+				const u32 condOp = getBits<22, 2>(instruction);
+				const uint refY = getBit<24>(instruction);
+				const uint refX = getBit<25>(instruction);
+				const char* condition = getCondition(condOp, refX, refY);
+
+				decompiledShader += fmt::format("if ({}) {{ pc = {}u; break; }}\n", condition, dest);
+				break;
+			}
+
+			case ShaderOpcodes::JMPU: {
+				const u32 dest = getBits<10, 12>(instruction);
+				const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check
+				const u32 mask = 1u << bit;
+				const u32 test = (instruction & 1) ^ 1;  // If the LSB is 0 we jump if bit = 1, otherwise 0
+
+				decompiledShader += fmt::format("if ((uniform_bool & {}u) {} 0u) {{ pc = {}u; break; }}\n", mask, (test != 0) ? "!=" : "==", dest);
+				break;
+			}
+
+			case ShaderOpcodes::IFU:
+			case ShaderOpcodes::IFC: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* conditionalFunc = findFunction(AddressRange(pc + 1, dest));
+
+				if (opcode == ShaderOpcodes::IFC) {
+					const u32 condOp = getBits<22, 2>(instruction);
+					const uint refY = getBit<24>(instruction);
+					const uint refX = getBit<25>(instruction);
+					const char* condition = getCondition(condOp, refX, refY);
+
+					decompiledShader += fmt::format("if ({}) {{", condition);
+				} else {
+					const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check
+					const u32 mask = 1u << bit;
+
+					decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{", mask);
+				}
+
+				callFunction(*conditionalFunc);
+				decompiledShader += "}\n";
+
+				pc = dest;
+				if (num > 0) {
+					const Function* elseFunc = findFunction(AddressRange(dest, dest + num));
+					pc = dest + num;
+
+					decompiledShader += "else { ";
+					callFunction(*elseFunc);
+					decompiledShader += "}\n";
+
+					if (conditionalFunc->exitMode == ExitMode::AlwaysEnd && elseFunc->exitMode == ExitMode::AlwaysEnd) {
+						finished = true;
+						return;
+					}
+				}
+
+				return;
+			}
+
+			case ShaderOpcodes::CALL:
+			case ShaderOpcodes::CALLC:
+			case ShaderOpcodes::CALLU: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* calledFunc = findFunction(AddressRange(dest, dest + num));
+
+				// Handle conditions for CALLC/CALLU
+				if (opcode == ShaderOpcodes::CALLC) {
+					const u32 condOp = getBits<22, 2>(instruction);
+					const uint refY = getBit<24>(instruction);
+					const uint refX = getBit<25>(instruction);
+					const char* condition = getCondition(condOp, refX, refY);
+
+					decompiledShader += fmt::format("if ({}) {{", condition);
+				} else if (opcode == ShaderOpcodes::CALLU) {
+					const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check
+					const u32 mask = 1u << bit;
+
+					decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{", mask);
+				}
+
+				callFunction(*calledFunc);
+
+				// Close brackets for CALLC/CALLU
+				if (opcode != ShaderOpcodes::CALL) {
+					decompiledShader += "}";
+				}
+
+				if (opcode == ShaderOpcodes::CALL && calledFunc->exitMode == ExitMode::AlwaysEnd) {
+					finished = true;
+					return;
+				}
+				break;
+			}
+
+			case ShaderOpcodes::LOOP: {
+				const u32 dest = getBits<10, 12>(instruction);
+				const u32 uniformIndex = getBits<22, 2>(instruction);
+
+				// loop counter = uniform.y
+				decompiledShader += fmt::format("addr_reg.z = int((uniform_i[{}] >> 8u) & 0xFFu);\n", uniformIndex);
+				decompiledShader += fmt::format(
+					"for (uint loopCtr{} = 0u; loopCtr{} <= (uniform_i[{}] & 0xFFu); loopCtr{}++, addr_reg.z += int((uniform_i[{}] >> "
+					"16u) & 0xFFu)) {{\n",
+					pc, pc, uniformIndex, pc, uniformIndex
+				);
+
+				AddressRange range(pc + 1, dest + 1);
+				const Function* func = findFunction(range);
+				callFunction(*func);
+				decompiledShader += "}\n";
+
+				// Jump to the end of the loop. We don't want to compile the code inside the loop again.
+				// This will be incremented by 1 due to the pc++ at the end of this loop.
+				pc = dest;
+
+				if (func->exitMode == ExitMode::AlwaysEnd) {
+					finished = true;
+					return;
+				}
+				break;
+			}
+
+			case ShaderOpcodes::END:
+				decompiledShader += "return true;\n";
+				finished = true;
+				return;
+
+			case ShaderOpcodes::NOP: break;
+
+			default:
+				Helpers::warn("GLSL recompiler: Unknown opcode: %02X. Falling back to CPU shaders", opcode);
+				compilationError = true;
+				break;
 		}
 	}
 
 	pc++;
 }
 
-
 bool ShaderDecompiler::usesCommonEncoding(u32 instruction) const {
 	const u32 opcode = instruction >> 26;
 	switch (opcode) {
@@ -339,16 +776,57 @@ bool ShaderDecompiler::usesCommonEncoding(u32 instruction) const {
 		case ShaderOpcodes::SLT:
 		case ShaderOpcodes::SLTI:
 		case ShaderOpcodes::SGE:
-		case ShaderOpcodes::SGEI: return true;
+		case ShaderOpcodes::SGEI:
+		case ShaderOpcodes::LITP: return true;
 
 		default: return false;
 	}
 }
 
-void ShaderDecompiler::callFunction(const Function& function) { decompiledShader += function.getCallStatement() + ";\n"; }
+void ShaderDecompiler::callFunction(const Function& function) {
+	switch (function.exitMode) {
+		// This function always ends, so call it and return true to signal that we're gonna be ending the shader
+		case ExitMode::AlwaysEnd: decompiledShader += function.getCallStatement() + ";\nreturn true;\n"; break;
+		// This function will potentially end. Call it, see if it returns that it ended, and return that we're ending if it did
+		case ExitMode::Conditional: decompiledShader += fmt::format("if ({}) {{ return true; }}\n", function.getCallStatement()); break;
+		// This function will not end. Just call it like a normal function.
+		default: decompiledShader += function.getCallStatement() + ";\n"; break;
+	}
+}
 
 std::string ShaderGen::decompileShader(PICAShader& shader, EmulatorConfig& config, u32 entrypoint, API api, Language language) {
 	ShaderDecompiler decompiler(shader, config, entrypoint, api, language);
 
 	return decompiler.decompile();
 }
+
+const char* ShaderDecompiler::getCondition(u32 cond, u32 refX, u32 refY) {
+	static constexpr std::array<const char*, 16> conditions = {
+		// ref(Y, X) = (0, 0)
+		"!all(cmp_reg)",
+		"all(not(cmp_reg))",
+		"!cmp_reg.x",
+		"!cmp_reg.y",
+
+		// ref(Y, X) = (0, 1)
+		"cmp_reg.x || !cmp_reg.y",
+		"cmp_reg.x && !cmp_reg.y",
+		"cmp_reg.x",
+		"!cmp_reg.y",
+
+		// ref(Y, X) = (1, 0)
+		"!cmp_reg.x || cmp_reg.y",
+		"!cmp_reg.x && cmp_reg.y",
+		"!cmp_reg.x",
+		"cmp_reg.y",
+
+		// ref(Y, X) = (1, 1)
+		"any(cmp_reg)",
+		"all(cmp_reg)",
+		"cmp_reg.x",
+		"cmp_reg.y",
+	};
+	const u32 key = (cond & 0b11) | (refX << 2) | (refY << 3);
+
+	return conditions[key];
+}
diff --git a/src/core/PICA/shader_gen_glsl.cpp b/src/core/PICA/shader_gen_glsl.cpp
index 69f74930..44a75134 100644
--- a/src/core/PICA/shader_gen_glsl.cpp
+++ b/src/core/PICA/shader_gen_glsl.cpp
@@ -1,6 +1,14 @@
+#include <fmt/format.h>
+
+#include <utility>
+
 #include "PICA/pica_frag_config.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen.hpp"
+
+// We can include the driver headers here since they shouldn't have any actual API-specific code
+#include "renderer_gl/gl_driver.hpp"
+
 using namespace PICA;
 using namespace PICA::ShaderGen;
 
@@ -34,6 +42,8 @@ static constexpr const char* uniformDefinition = R"(
 
 std::string FragmentGenerator::getDefaultVertexShader() {
 	std::string ret = "";
+	// Reserve some space (128KB) in the output string to avoid too many allocations later
+	ret.reserve(128 * 1024);
 
 	switch (api) {
 		case API::GL: ret += "#version 410 core"; break;
@@ -94,7 +104,7 @@ std::string FragmentGenerator::getDefaultVertexShader() {
 	return ret;
 }
 
-std::string FragmentGenerator::generate(const FragmentConfig& config) {
+std::string FragmentGenerator::generate(const FragmentConfig& config, void* driverInfo) {
 	std::string ret = "";
 
 	switch (api) {
@@ -103,6 +113,27 @@ std::string FragmentGenerator::generate(const FragmentConfig& config) {
 		default: break;
 	}
 
+	// For GLES we need to enable & use the framebuffer fetch extension in order to emulate logic ops
+	bool emitLogicOps = api == API::GLES && config.outConfig.logicOpMode != PICA::LogicOpMode::Copy && driverInfo != nullptr;
+
+	if (emitLogicOps) {
+		auto driver = static_cast<OpenGL::Driver*>(driverInfo);
+
+		// If the driver does not support framebuffer fetch at all, don't emit logic op code
+		if (!driver->supportFbFetch()) {
+			emitLogicOps = false;
+		}
+		
+		// Figure out which fb fetch extension we have and enable it
+		else {
+			if (driver->supportsExtFbFetch) {
+				ret += "\n#extension GL_EXT_shader_framebuffer_fetch : enable\n#define fb_color fragColor\n";
+			} else if (driver->supportsArmFbFetch) {
+				ret += "\n#extension GL_ARM_shader_framebuffer_fetch : enable\n#define fb_color gl_LastFragColorARM[0]\n";
+			}
+		}
+	}
+
 	bool unimplementedFlag = false;
 	if (api == API::GLES) {
 		ret += R"(
@@ -192,10 +223,13 @@ std::string FragmentGenerator::generate(const FragmentConfig& config) {
 	}
 
 	compileFog(ret, config);
-
 	applyAlphaTest(ret, config);
 
-	ret += "fragColor = combinerOutput;\n}"; // End of main function
+	if (!emitLogicOps) {
+		ret += "fragColor = combinerOutput;\n}";  // End of main function
+	} else {
+		compileLogicOps(ret, config);
+	}
 
 	return ret;
 }
@@ -671,3 +705,135 @@ void FragmentGenerator::compileFog(std::string& shader, const PICA::FragmentConf
 	shader += "float fog_factor = clamp(value.r + value.g * delta, 0.0, 1.0);";
 	shader += "combinerOutput.rgb = mix(fog_color, combinerOutput.rgb, fog_factor);";
 }
+
+std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader) {
+	// First, calculate output register -> Fixed function fragment semantics based on the VAO config
+	// This array contains the mappings for the 32 fixed function semantics (8 variables, with 4 lanes each).
+	// Each entry is a pair, containing the output reg to use for this semantic (first) and which lane of that register (second)
+	std::array<std::pair<int, int>, 32> outputMappings{};
+	// Output registers adjusted according to VS_OUTPUT_MASK, which handles enabling and disabling output attributes
+	std::array<u8, 16> vsOutputRegisters;
+
+	{
+		uint count = 0;
+		u16 outputMask = vertConfig.outputMask;
+
+		// See which registers are actually enabled and ignore the disabled ones
+		for (int i = 0; i < 16; i++) {
+			if (outputMask & 1) {
+				vsOutputRegisters[count++] = i;
+			}
+
+			outputMask >>= 1;
+		}
+
+		// For the others, map the index to a vs output directly (TODO: What does hw actually do?)
+		for (; count < 16; count++) {
+			vsOutputRegisters[count] = count;
+		}
+
+		for (int i = 0; i < vertConfig.outputCount; i++) {
+			const u32 config = vertConfig.outmaps[i];
+			for (int j = 0; j < 4; j++) {
+				const u32 mapping = (config >> (j * 8)) & 0x1F;
+				outputMappings[mapping] = std::make_pair(vsOutputRegisters[i], j);
+			}
+		}
+	}
+
+	auto getSemanticName = [&](u32 semanticIndex) {
+		auto [reg, lane] = outputMappings[semanticIndex];
+		return fmt::format("out_regs[{}][{}]", reg, lane);
+	};
+
+	std::string semantics = fmt::format(
+		R"(
+	vec4 a_coords = vec4({}, {}, {}, {});
+	vec4 a_quaternion = vec4({}, {}, {}, {});
+	vec4 a_vertexColour = vec4({}, {}, {}, {});
+	vec2 a_texcoord0 = vec2({}, {});
+	float a_texcoord0_w = {};
+	vec2 a_texcoord1 = vec2({}, {});
+	vec2 a_texcoord2 = vec2({}, {});
+	vec3 a_view = vec3({}, {}, {});
+)",
+		getSemanticName(0), getSemanticName(1), getSemanticName(2), getSemanticName(3), getSemanticName(4), getSemanticName(5), getSemanticName(6),
+		getSemanticName(7), getSemanticName(8), getSemanticName(9), getSemanticName(10), getSemanticName(11), getSemanticName(12),
+		getSemanticName(13), getSemanticName(16), getSemanticName(14), getSemanticName(15), getSemanticName(22), getSemanticName(23),
+		getSemanticName(18), getSemanticName(19), getSemanticName(20)
+	);
+
+	if (usingUbershader) {
+		Helpers::panic("Unimplemented: GetVertexShaderAccelerated for ubershader");
+		return picaSource;
+	} else {
+		// TODO: Uniforms and don't hardcode fixed-function semantic indices...
+		std::string ret = picaSource;
+		if (api == API::GLES) {
+			ret += "\n#define USING_GLES\n";
+		}
+
+		ret += uniformDefinition;
+
+		ret += R"(
+out vec4 v_quaternion;
+out vec4 v_colour;
+out vec3 v_texcoord0;
+out vec2 v_texcoord1;
+out vec3 v_view;
+out vec2 v_texcoord2;
+
+#ifndef USING_GLES
+	out float gl_ClipDistance[2];
+#endif
+
+void main() {
+	pica_shader_main();
+)";
+	// Transfer fixed function fragment registers from vertex shader output to the fragment shader
+	ret += semantics;
+	
+	ret += R"(
+	gl_Position = a_coords;
+	vec4 colourAbs = abs(a_vertexColour);
+	v_colour = min(colourAbs, vec4(1.f));
+
+	v_texcoord0 = vec3(a_texcoord0.x, 1.0 - a_texcoord0.y, a_texcoord0_w);
+	v_texcoord1 = vec2(a_texcoord1.x, 1.0 - a_texcoord1.y);
+	v_texcoord2 = vec2(a_texcoord2.x, 1.0 - a_texcoord2.y);
+	v_view = a_view;
+	v_quaternion = a_quaternion;
+
+#ifndef USING_GLES
+	gl_ClipDistance[0] = -a_coords.z;
+	gl_ClipDistance[1] = dot(clipCoords, a_coords);
+#endif
+})";
+		return ret;
+	}
+}
+
+void FragmentGenerator::compileLogicOps(std::string& shader, const PICA::FragmentConfig& config) {
+	if (api != API::GLES) [[unlikely]] {
+		Helpers::warn("Shadergen: Unsupported API for compileLogicOps");
+		shader += "fragColor = combinerOutput;\n}"; // End of main function
+
+		return;
+	}
+	
+	shader += "fragColor = ";
+	switch (config.outConfig.logicOpMode) {
+		case PICA::LogicOpMode::Copy: shader += "combinerOutput"; break;
+		case PICA::LogicOpMode::Nop: shader += "fb_color"; break;
+		case PICA::LogicOpMode::Clear: shader += "vec4(0.0)"; break;
+		case PICA::LogicOpMode::Set: shader += "vec4(1.0)"; break;
+		case PICA::LogicOpMode::InvertedCopy: shader += "vec4(uvec4(combinerOutput * 255.0) ^ uvec4(0xFFu)) * (1.0 / 255.0)"; break;
+
+		default:
+			shader += "combinerOutput";
+			Helpers::warn("Shadergen: Unimplemented logic op mode");
+			break;
+	}
+
+	shader += ";\n}"; // End of main function
+}
diff --git a/src/core/PICA/shader_unit.cpp b/src/core/PICA/shader_unit.cpp
index 759849a8..6b291d31 100644
--- a/src/core/PICA/shader_unit.cpp
+++ b/src/core/PICA/shader_unit.cpp
@@ -34,4 +34,5 @@ void PICAShader::reset() {
 
 	codeHashDirty = true;
 	opdescHashDirty = true;
+	uniformsDirty = true;
 }
\ No newline at end of file
diff --git a/src/core/audio/hle_core.cpp b/src/core/audio/hle_core.cpp
index b4f9ab02..70a8e71d 100644
--- a/src/core/audio/hle_core.cpp
+++ b/src/core/audio/hle_core.cpp
@@ -76,6 +76,7 @@ namespace Audio {
 			source.reset();
 		}
 
+		mixer.reset();
 		// Note: Reset audio pipe AFTER resetting all pipes, otherwise the new data will be yeeted
 		resetAudioPipe();
 	}
@@ -250,6 +251,8 @@ namespace Audio {
 
 			source.isBufferIDDirty = false;
 		}
+
+		performMix(read, write);
 	}
 
 	void HLE_DSP::updateSourceConfig(Source& source, HLE::SourceConfiguration::Configuration& config, s16_le* adpcmCoefficients) {
@@ -465,6 +468,50 @@ namespace Audio {
 		}
 	}
 
+	void HLE_DSP::performMix(Audio::HLE::SharedMemory& readRegion, Audio::HLE::SharedMemory& writeRegion) {
+		updateMixerConfig(readRegion);
+		// TODO: Do the actual audio mixing
+
+		auto& dspStatus = writeRegion.dspStatus;
+		// Stub the DSP status. It's unknown what the "unknown" field is but Citra sets it to 0, so we do too to be safe
+		dspStatus.droppedFrames = 0;
+		dspStatus.unknown = 0;
+	}
+
+	void HLE_DSP::updateMixerConfig(Audio::HLE::SharedMemory& sharedMem) {
+		auto& config = sharedMem.dspConfiguration;
+		// No configs have been changed, so there's nothing to update
+		if (config.dirtyRaw == 0) {
+			return;
+		}
+
+		if (config.outputFormatDirty) {
+			mixer.channelFormat = config.outputFormat;
+		}
+		
+		if (config.masterVolumeDirty) {
+			mixer.volumes[0] = config.masterVolume;
+		}
+
+		if (config.auxVolume0Dirty) {
+			mixer.volumes[1] = config.auxVolumes[0];
+		}
+		
+		if (config.auxVolume1Dirty) {
+			mixer.volumes[2] = config.auxVolumes[1];
+		}
+
+		if (config.auxBusEnable0Dirty) {
+			mixer.enableAuxStages[0] = config.auxBusEnable[0] != 0;
+		}
+
+		if (config.auxBusEnable1Dirty) {
+			mixer.enableAuxStages[1] = config.auxBusEnable[1] != 0;
+		}
+
+		config.dirtyRaw = 0;
+	}
+
 	HLE_DSP::SampleBuffer HLE_DSP::decodePCM8(const u8* data, usize sampleCount, Source& source) {
 		SampleBuffer decodedSamples(sampleCount);
 
@@ -585,7 +632,7 @@ namespace Audio {
 		AAC::Message response;
 
 		switch (request.command) {
-			case AAC::Command::EncodeDecode:
+			case AAC::Command::EncodeDecode: {
 				// Dummy response to stop games from hanging
 				response.resultCode = AAC::ResultCode::Success;
 				response.decodeResponse.channelCount = 2;
@@ -596,10 +643,13 @@ namespace Audio {
 				response.command = request.command;
 				response.mode = request.mode;
 
-				// We've already got an AAC decoder but it's currently disabled until mixing & output is properly implemented
-				// TODO: Uncomment this when the time comes
-				// aacDecoder->decode(response, request, [this](u32 paddr) { return getPointerPhys<u8>(paddr); });
+				// TODO: Make this a toggle in config.toml. Currently we have it off by default until we finish the DSP mixer.
+				constexpr bool enableAAC = false;
+				if (enableAAC) {
+					aacDecoder->decode(response, request, [this](u32 paddr) { return getPointerPhys<u8>(paddr); });
+				}
 				break;
+			}
 
 			case AAC::Command::Init:
 			case AAC::Command::Shutdown:
diff --git a/src/core/kernel/memory_management.cpp b/src/core/kernel/memory_management.cpp
index aeac6269..26f50023 100644
--- a/src/core/kernel/memory_management.cpp
+++ b/src/core/kernel/memory_management.cpp
@@ -136,7 +136,7 @@ void Kernel::mapMemoryBlock() {
 				break;
 
 			case KernelHandles::FontSharedMemHandle:
-				mem.copySharedFont(ptr);
+				mem.copySharedFont(ptr, addr);
 				break;
 
 			case KernelHandles::CSNDSharedMemHandle:
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 09b49eee..57eac8ca 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -7,6 +7,7 @@
 
 #include "config_mem.hpp"
 #include "resource_limits.hpp"
+#include "services/fonts.hpp"
 #include "services/ptm.hpp"
 
 CMRC_DECLARE(ConsoleFonts);
@@ -51,7 +52,7 @@ void Memory::reset() {
 		if (e.handle == KernelHandles::FontSharedMemHandle) {
 			// Read font size from the cmrc filesystem the font is stored in
 			auto fonts = cmrc::ConsoleFonts::get_filesystem();
-			e.size = fonts.open("CitraSharedFontUSRelocated.bin").size();
+			e.size = fonts.open("SharedFontReplacement.bin").size();
 		}
 
 		e.mapped = false;
@@ -520,10 +521,13 @@ Regions Memory::getConsoleRegion() {
 	return region;
 }
 
-void Memory::copySharedFont(u8* pointer) {
+void Memory::copySharedFont(u8* pointer, u32 vaddr) {
 	auto fonts = cmrc::ConsoleFonts::get_filesystem();
-	auto font = fonts.open("CitraSharedFontUSRelocated.bin");
+	auto font = fonts.open("SharedFontReplacement.bin");
 	std::memcpy(pointer, font.begin(), font.size());
+
+	// Relocate shared font to the address it's being loaded to
+	HLE::Fonts::relocateSharedFont(pointer, vaddr);
 }
 
 std::optional<u64> Memory::getProgramID() {
diff --git a/src/core/renderer_gl/gl_state.cpp b/src/core/renderer_gl/gl_state.cpp
index 3d1c0681..785cac41 100644
--- a/src/core/renderer_gl/gl_state.cpp
+++ b/src/core/renderer_gl/gl_state.cpp
@@ -73,10 +73,7 @@ void GLStateManager::resetVAO() {
 }
 
 void GLStateManager::resetBuffers() {
-	boundVBO = 0;
 	boundUBO = 0;
-
-	glBindBuffer(GL_ARRAY_BUFFER, 0);
 	glBindBuffer(GL_UNIFORM_BUFFER, 0);
 }
 
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 5146370a..90b8f910 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -2,13 +2,16 @@
 
 #include <stb_image_write.h>
 
+#include <bit>
 #include <cmrc/cmrc.hpp>
 
-#include "config.hpp"
 #include "PICA/float_types.hpp"
-#include "PICA/pica_frag_uniforms.hpp"
 #include "PICA/gpu.hpp"
+#include "PICA/pica_frag_uniforms.hpp"
+#include "PICA/pica_simd.hpp"
 #include "PICA/regs.hpp"
+#include "PICA/shader_decompiler.hpp"
+#include "config.hpp"
 #include "math_util.hpp"
 
 CMRC_DECLARE(RendererGL);
@@ -24,7 +27,7 @@ void RendererGL::reset() {
 	colourBufferCache.reset();
 	textureCache.reset();
 
-	clearShaderCache();
+	shaderCache.clear();
 
 	// Init the colour/depth buffer settings to some random defaults on reset
 	colourBufferLoc = 0;
@@ -77,40 +80,56 @@ void RendererGL::initGraphicsContextInternal() {
 	gl.useProgram(displayProgram);
 	glUniform1i(OpenGL::uniformLocation(displayProgram, "u_texture"), 0);  // Init sampler object
 
+	// Create stream buffers for vertex, index and uniform buffers
+	static constexpr usize hwIndexBufferSize = 2_MB;
+	static constexpr usize hwVertexBufferSize = 16_MB;
+
+	hwIndexBuffer = StreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, hwIndexBufferSize);
+	hwVertexBuffer = StreamBuffer::Create(GL_ARRAY_BUFFER, hwVertexBufferSize);
+
 	// Allocate memory for the shadergen fragment uniform UBO
 	glGenBuffers(1, &shadergenFragmentUBO);
 	gl.bindUBO(shadergenFragmentUBO);
 	glBufferData(GL_UNIFORM_BUFFER, sizeof(PICA::FragmentUniforms), nullptr, GL_DYNAMIC_DRAW);
 
-	vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW);
-	gl.bindVBO(vbo);
-	vao.create();
-	gl.bindVAO(vao);
+	// Allocate memory for the accelerated vertex shader uniform UBO
+	glGenBuffers(1, &hwShaderUniformUBO);
+	gl.bindUBO(hwShaderUniformUBO);
+	glBufferData(GL_UNIFORM_BUFFER, PICAShader::totalUniformSize(), nullptr, GL_DYNAMIC_DRAW);
+
+	vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize * 2, GL_STREAM_DRAW);
+	vbo.bind();
+	// Initialize the VAO used when not using hw shaders
+	defaultVAO.create();
+	gl.bindVAO(defaultVAO);
 
 	// Position (x, y, z, w) attributes
-	vao.setAttributeFloat<float>(0, 4, sizeof(Vertex), offsetof(Vertex, s.positions));
-	vao.enableAttribute(0);
+	defaultVAO.setAttributeFloat<float>(0, 4, sizeof(Vertex), offsetof(Vertex, s.positions));
+	defaultVAO.enableAttribute(0);
 	// Quaternion attribute
-	vao.setAttributeFloat<float>(1, 4, sizeof(Vertex), offsetof(Vertex, s.quaternion));
-	vao.enableAttribute(1);
+	defaultVAO.setAttributeFloat<float>(1, 4, sizeof(Vertex), offsetof(Vertex, s.quaternion));
+	defaultVAO.enableAttribute(1);
 	// Colour attribute
-	vao.setAttributeFloat<float>(2, 4, sizeof(Vertex), offsetof(Vertex, s.colour));
-	vao.enableAttribute(2);
+	defaultVAO.setAttributeFloat<float>(2, 4, sizeof(Vertex), offsetof(Vertex, s.colour));
+	defaultVAO.enableAttribute(2);
 	// UV 0 attribute
-	vao.setAttributeFloat<float>(3, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord0));
-	vao.enableAttribute(3);
+	defaultVAO.setAttributeFloat<float>(3, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord0));
+	defaultVAO.enableAttribute(3);
 	// UV 1 attribute
-	vao.setAttributeFloat<float>(4, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord1));
-	vao.enableAttribute(4);
+	defaultVAO.setAttributeFloat<float>(4, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord1));
+	defaultVAO.enableAttribute(4);
 	// UV 0 W-component attribute
-	vao.setAttributeFloat<float>(5, 1, sizeof(Vertex), offsetof(Vertex, s.texcoord0_w));
-	vao.enableAttribute(5);
+	defaultVAO.setAttributeFloat<float>(5, 1, sizeof(Vertex), offsetof(Vertex, s.texcoord0_w));
+	defaultVAO.enableAttribute(5);
 	// View
-	vao.setAttributeFloat<float>(6, 3, sizeof(Vertex), offsetof(Vertex, s.view));
-	vao.enableAttribute(6);
+	defaultVAO.setAttributeFloat<float>(6, 3, sizeof(Vertex), offsetof(Vertex, s.view));
+	defaultVAO.enableAttribute(6);
 	// UV 2 attribute
-	vao.setAttributeFloat<float>(7, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord2));
-	vao.enableAttribute(7);
+	defaultVAO.setAttributeFloat<float>(7, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord2));
+	defaultVAO.enableAttribute(7);
+
+	// Initialize the VAO used for hw shaders
+	hwShaderVAO.create();
 
 	dummyVBO.create();
 	dummyVAO.create();
@@ -165,8 +184,18 @@ void RendererGL::initGraphicsContextInternal() {
 	OpenGL::clearColor();
 	OpenGL::setViewport(oldViewport[0], oldViewport[1], oldViewport[2], oldViewport[3]);
 
+	// Initialize fixed attributes
+	for (int i = 0; i < fixedAttrValues.size(); i++) {
+		fixedAttrValues[i] = {0.f, 0.f, 0.f, 0.f};
+		glVertexAttrib4f(i, 0.0, 0.0, 0.0, 0.0);
+	}
+
 	reset();
 
+	// Populate our driver info structure
+	driverInfo.supportsExtFbFetch = (GLAD_GL_EXT_shader_framebuffer_fetch != 0);
+	driverInfo.supportsArmFbFetch = (GLAD_GL_ARM_shader_framebuffer_fetch != 0);
+
 	// Initialize the default vertex shader used with shadergen
 	std::string defaultShadergenVSSource = fragShaderGen.getDefaultVertexShader();
 	defaultShadergenVs.create({defaultShadergenVSSource.c_str(), defaultShadergenVSSource.size()}, OpenGL::Vertex);
@@ -414,29 +443,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 		OpenGL::Triangle,
 	};
 
-	bool usingUbershader = enableUbershader;
-	if (usingUbershader) {
-		const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0;
-		const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1;
-
-		// Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using moret han N lights via shadergen
-		// This way we generate fewer shaders overall than with full shadergen, but don't tank performance 
-		if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) {
-			usingUbershader = false;
-		}
-	}
-		
-	if (usingUbershader) {
-		gl.useProgram(triangleProgram);
-	} else {
-		OpenGL::Program& program = getSpecializedShader();
-		gl.useProgram(program);
-	}
-
 	const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
 	gl.disableScissor();
-	gl.bindVBO(vbo);
-	gl.bindVAO(vao);
+
+	// If we're using accelerated shaders, the hw VAO, VBO and EBO objects will have already been bound in prepareForDraw
+	if (!usingAcceleratedShader) {
+		vbo.bind();
+		gl.bindVAO(defaultVAO);
+	}
 
 	gl.enableClipPlane(0);  // Clipping plane 0 is always enabled
 	if (regs[PICA::InternalRegs::ClipEnable] & 1) {
@@ -454,38 +468,9 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 	const int depthFunc = getBits<4, 3>(depthControl);
 	const int colourMask = getBits<8, 4>(depthControl);
 	gl.setColourMask(colourMask & 1, colourMask & 2, colourMask & 4, colourMask & 8);
-
 	static constexpr std::array<GLenum, 8> depthModes = {GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL};
 
-	// Update ubershader uniforms
-	if (usingUbershader) {
-		const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
-		const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
-		const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1;
-
-		if (oldDepthScale != depthScale) {
-			oldDepthScale = depthScale;
-			glUniform1f(ubershaderData.depthScaleLoc, depthScale);
-		}
-
-		if (oldDepthOffset != depthOffset) {
-			oldDepthOffset = depthOffset;
-			glUniform1f(ubershaderData.depthOffsetLoc, depthOffset);
-		}
-
-		if (oldDepthmapEnable != depthMapEnable) {
-			oldDepthmapEnable = depthMapEnable;
-			glUniform1i(ubershaderData.depthmapEnableLoc, depthMapEnable);
-		}
-
-		// Upload PICA Registers as a single uniform. The shader needs access to the rasterizer registers (for depth, starting from index 0x48)
-		// The texturing and the fragment lighting registers. Therefore we upload them all in one go to avoid multiple slow uniform updates
-		glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]);
-		setupUbershaderTexEnv();
-	}
-
 	bindTexturesToSlots();
-
 	if (gpu.fogLUTDirty) {
 		updateFogLUT();
 	}
@@ -528,8 +513,32 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 
 	setupStencilTest(stencilEnable);
 
-	vbo.bufferVertsSub(vertices);
-	OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
+	if (!usingAcceleratedShader) {
+		vbo.bufferVertsSub(vertices);
+		OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
+	} else {
+		if (performIndexedRender) {
+			// When doing indexed rendering, use glDrawRangeElementsBaseVertex to issue the indexed draw
+			hwIndexBuffer->Bind();
+
+			if (glDrawRangeElementsBaseVertex != nullptr) [[likely]] {
+				glDrawRangeElementsBaseVertex(
+					primitiveTopology, minimumIndex, maximumIndex, GLsizei(vertices.size()), usingShortIndices ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE,
+					hwIndexBufferOffset, -GLint(minimumIndex)
+				);
+			} else {
+				// If glDrawRangeElementsBaseVertex is not available then prepareForDraw will have subtracted the base vertex from the index buffer
+				// for us, so just use glDrawRangeElements
+				glDrawRangeElements(
+					primitiveTopology, 0, GLint(maximumIndex - minimumIndex), GLsizei(vertices.size()),
+					usingShortIndices ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE, hwIndexBufferOffset
+				);
+			}
+		} else {
+			// When doing non-indexed rendering, just use glDrawArrays
+			OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
+		}
+	}
 }
 
 void RendererGL::display() {
@@ -836,34 +845,53 @@ std::optional<ColourBuffer> RendererGL::getColourBuffer(u32 addr, PICA::ColorFmt
 }
 
 OpenGL::Program& RendererGL::getSpecializedShader() {
-	constexpr uint uboBlockBinding = 2;
+	constexpr uint vsUBOBlockBinding = 1;
+	constexpr uint fsUBOBlockBinding = 2;
 
 	PICA::FragmentConfig fsConfig(regs);
+	// If we're not on GLES, ignore the logic op configuration and don't generate redundant shaders for it, since we use hw logic ops
+#ifndef USING_GLES
+	fsConfig.outConfig.logicOpMode = PICA::LogicOpMode(0);
+#endif
 
-	CachedProgram& programEntry = shaderCache[fsConfig];
+	OpenGL::Shader& fragShader = shaderCache.fragmentShaderCache[fsConfig];
+	if (!fragShader.exists()) {
+		std::string fs = fragShaderGen.generate(fsConfig);
+		fragShader.create({fs.c_str(), fs.size()}, OpenGL::Fragment);
+	}
+
+	// Get the handle of the current vertex shader
+	OpenGL::Shader& vertexShader = usingAcceleratedShader ? *generatedVertexShader : defaultShadergenVs;
+	// And form the key for looking up a shader program
+	const u64 programKey = (u64(vertexShader.handle()) << 32) | u64(fragShader.handle());
+
+	CachedProgram& programEntry = shaderCache.programCache[programKey];
 	OpenGL::Program& program = programEntry.program;
 
 	if (!program.exists()) {
-		std::string fs = fragShaderGen.generate(fsConfig);
-
-		OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment);
-		program.create({defaultShadergenVs, fragShader});
+		program.create({vertexShader, fragShader});
 		gl.useProgram(program);
 
-		fragShader.free();
-
 		// Init sampler objects. Texture 0 goes in texture unit 0, texture 1 in TU 1, texture 2 in TU 2, and the light maps go in TU 3
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex0"), 0);
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1);
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2);
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3);
 
-		// Set up the binding for our UBO. Sadly we can't specify it in the shader like normal people,
+		// Set up the binding for our UBOs. Sadly we can't specify it in the shader like normal people,
 		// As it's an OpenGL 4.2 feature that MacOS doesn't support...
-		uint uboIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms");
-		glUniformBlockBinding(program.handle(), uboIndex, uboBlockBinding);
+		uint fsUBOIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms");
+		glUniformBlockBinding(program.handle(), fsUBOIndex, fsUBOBlockBinding);
+
+		if (usingAcceleratedShader) {
+			uint vertexUBOIndex = glGetUniformBlockIndex(program.handle(), "PICAShaderUniforms");
+			glUniformBlockBinding(program.handle(), vertexUBOIndex, vsUBOBlockBinding);
+		}
+	}
+	glBindBufferBase(GL_UNIFORM_BUFFER, fsUBOBlockBinding, shadergenFragmentUBO);
+	if (usingAcceleratedShader) {
+		glBindBufferBase(GL_UNIFORM_BUFFER, vsUBOBlockBinding, hwShaderUniformUBO);
 	}
-	glBindBufferBase(GL_UNIFORM_BUFFER, uboBlockBinding, shadergenFragmentUBO);
 
 	// Upload uniform data to our shader's UBO
 	PICA::FragmentUniforms uniforms;
@@ -953,6 +981,101 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
 	return program;
 }
 
+bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) {
+	// First we figure out if we will be using an ubershader
+	bool usingUbershader = emulatorConfig->useUbershaders;
+	if (usingUbershader) {
+		const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0;
+		const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1;
+
+		// Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using moret han N lights via shadergen
+		// This way we generate fewer shaders overall than with full shadergen, but don't tank performance
+		if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) {
+			usingUbershader = false;
+		}
+	}
+
+	// Then we figure out if we will use hw accelerated shaders, and try to fetch our shader
+	// TODO: Ubershader support for accelerated shaders
+	usingAcceleratedShader = emulatorConfig->accelerateShaders && !usingUbershader && accel != nullptr && accel->canBeAccelerated;
+
+	if (usingAcceleratedShader) {
+		PICA::VertConfig vertexConfig(shaderUnit.vs, regs, usingUbershader);
+
+		std::optional<OpenGL::Shader>& shader = shaderCache.vertexShaderCache[vertexConfig];
+		// If the optional is false, we have never tried to recompile the shader before. Try to recompile it and see if it works.
+		if (!shader.has_value()) {
+			// Initialize shader to a "null" shader (handle == 0)
+			shader = OpenGL::Shader();
+
+			std::string picaShaderSource = PICA::ShaderGen::decompileShader(
+				shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint,
+				Helpers::isAndroid() ? PICA::ShaderGen::API::GLES : PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL
+			);
+
+			// Empty source means compilation error, if the source is not empty then we convert the recompiled PICA code into a valid shader and upload
+			// it to the GPU
+			if (!picaShaderSource.empty()) {
+				std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, vertexConfig, usingUbershader);
+				shader->create({vertexShaderSource}, OpenGL::Vertex);
+			}
+		}
+
+		// Shader generation did not work out, so set usingAcceleratedShader to false
+		if (!shader->exists()) {
+			usingAcceleratedShader = false;
+		} else {
+			generatedVertexShader = &(*shader);
+			gl.bindUBO(hwShaderUniformUBO);
+
+			if (shaderUnit.vs.uniformsDirty) {
+				shaderUnit.vs.uniformsDirty = false;
+				glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer());
+			}
+
+			performIndexedRender = accel->indexed;
+			minimumIndex = GLsizei(accel->minimumIndex);
+			maximumIndex = GLsizei(accel->maximumIndex);
+
+			// Upload vertex data and index buffer data to our GPU
+			accelerateVertexUpload(shaderUnit, accel);
+		}
+	}
+
+	if (!usingUbershader) {
+		OpenGL::Program& program = getSpecializedShader();
+		gl.useProgram(program);
+	} else { // Bind ubershader & load ubershader uniforms
+		gl.useProgram(triangleProgram);
+
+		const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
+		const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
+		const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1;
+
+		if (oldDepthScale != depthScale) {
+			oldDepthScale = depthScale;
+			glUniform1f(ubershaderData.depthScaleLoc, depthScale);
+		}
+
+		if (oldDepthOffset != depthOffset) {
+			oldDepthOffset = depthOffset;
+			glUniform1f(ubershaderData.depthOffsetLoc, depthOffset);
+		}
+
+		if (oldDepthmapEnable != depthMapEnable) {
+			oldDepthmapEnable = depthMapEnable;
+			glUniform1i(ubershaderData.depthmapEnableLoc, depthMapEnable);
+		}
+
+		// Upload PICA Registers as a single uniform. The shader needs access to the rasterizer registers (for depth, starting from index 0x48)
+		// The texturing and the fragment lighting registers. Therefore we upload them all in one go to avoid multiple slow uniform updates
+		glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]);
+		setupUbershaderTexEnv();
+	}
+
+	return usingAcceleratedShader;
+}
+
 void RendererGL::screenshot(const std::string& name) {
 	constexpr uint width = 400;
 	constexpr uint height = 2 * 240;
@@ -966,7 +1089,7 @@ void RendererGL::screenshot(const std::string& name) {
 
 	// Flip the image vertically
 	for (int y = 0; y < height; y++) {
-		memcpy(&flippedPixels[y * width * 4], &pixels[(height - y - 1) * width * 4], width * 4);
+		std::memcpy(&flippedPixels[y * width * 4], &pixels[(height - y - 1) * width * 4], width * 4);
 		// Swap R and B channels
 		for (int x = 0; x < width; x++) {
 			std::swap(flippedPixels[y * width * 4 + x * 4 + 0], flippedPixels[y * width * 4 + x * 4 + 2]);
@@ -978,21 +1101,12 @@ void RendererGL::screenshot(const std::string& name) {
 	stbi_write_png(name.c_str(), width, height, 4, flippedPixels.data(), 0);
 }
 
-void RendererGL::clearShaderCache() {
-	for (auto& shader : shaderCache) {
-		CachedProgram& cachedProgram = shader.second;
-		cachedProgram.program.free();
-	}
-
-	shaderCache.clear();
-}
-
 void RendererGL::deinitGraphicsContext() {
 	// Invalidate all surface caches since they'll no longer be valid
 	textureCache.reset();
 	depthBufferCache.reset();
 	colourBufferCache.reset();
-	clearShaderCache();
+	shaderCache.clear();
 
 	// All other GL objects should be invalidated automatically and be recreated by the next call to initGraphicsContext
 	// TODO: Make it so that depth and colour buffers get written back to 3DS memory
@@ -1041,3 +1155,99 @@ void RendererGL::initUbershader(OpenGL::Program& program) {
 	glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2);
 	glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3);
 }
+
+void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) {
+	u32 buffer = 0;  // Vertex buffer index for non-fixed attributes
+	u32 attrCount = 0;
+
+	const u32 totalAttribCount = accel->totalAttribCount;
+
+	static constexpr GLenum attributeFormats[4] = {
+		GL_BYTE,           // 0: Signed byte
+		GL_UNSIGNED_BYTE,  // 1: Unsigned byte
+		GL_SHORT,          // 2: Short
+		GL_FLOAT,          // 3: Float
+	};
+
+	const u32 vertexCount = accel->maximumIndex - accel->minimumIndex + 1;
+
+	// Update index buffer if necessary
+	if (accel->indexed) {
+		usingShortIndices = accel->useShortIndices;
+		const usize indexBufferSize = regs[PICA::InternalRegs::VertexCountReg] * (usingShortIndices ? sizeof(u16) : sizeof(u8));
+
+		hwIndexBuffer->Bind();
+		auto indexBufferRes = hwIndexBuffer->Map(4, indexBufferSize);
+		hwIndexBufferOffset = reinterpret_cast<void*>(usize(indexBufferRes.buffer_offset));
+
+		std::memcpy(indexBufferRes.pointer, accel->indexBuffer, indexBufferSize);
+		// If we don't have glDrawRangeElementsBaseVertex, we must subtract the base index value from our index buffer manually
+		if (glDrawRangeElementsBaseVertex == nullptr) [[unlikely]] {
+			const u32 indexCount = regs[PICA::InternalRegs::VertexCountReg];
+			usingShortIndices ? PICA::IndexBuffer::subtractBaseIndex<true>((u8*)indexBufferRes.pointer, indexCount, accel->minimumIndex)
+							  : PICA::IndexBuffer::subtractBaseIndex<false>((u8*)indexBufferRes.pointer, indexCount, accel->minimumIndex);
+		}
+
+		hwIndexBuffer->Unmap(indexBufferSize);
+	}
+
+	hwVertexBuffer->Bind();
+	auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize);
+	u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer);
+	const u32 vertexBufferOffset = vertexBufferRes.buffer_offset;
+
+	gl.bindVAO(hwShaderVAO);
+
+	// Enable or disable vertex attributes as needed
+	const u32 currentAttributeMask = accel->enabledAttributeMask;
+	// Use bitwise xor to calculate which attributes changed
+	u32 attributeMaskDiff = currentAttributeMask ^ previousAttributeMask;
+	
+	while (attributeMaskDiff != 0) {
+		// Get index of next different attribute and turn it off
+		const u32 index = 31 - std::countl_zero<u32>(attributeMaskDiff);
+		const u32 mask = 1u << index;
+		attributeMaskDiff ^= mask;
+
+		if ((currentAttributeMask & mask) != 0) {
+			// Attribute was disabled and is now enabled
+			hwShaderVAO.enableAttribute(index);
+		} else {
+			// Attribute was enabled and is now disabled
+			hwShaderVAO.disableAttribute(index);
+		}
+	}
+
+	previousAttributeMask = currentAttributeMask;
+
+	// Upload the data for each (enabled) attribute loader into our vertex buffer
+	for (int i = 0; i < accel->totalLoaderCount; i++) {
+		auto& loader = accel->loaders[i];
+
+		std::memcpy(vertexData, loader.data, loader.size);
+		vertexData += loader.size;
+	}
+
+	hwVertexBuffer->Unmap(accel->vertexDataSize);
+
+	// Iterate over the 16 PICA input registers and configure how they should be fetched.
+	for (int i = 0; i < 16; i++) {
+		const auto& attrib = accel->attributeInfo[i];
+		const u32 attributeMask = 1u << i;
+
+		if (accel->fixedAttributes & attributeMask) {
+			auto& attrValue = fixedAttrValues[i];
+			// This is a fixed attribute, so set its fixed value, but only if it actually needs to be updated
+			if (attrValue[0] != attrib.fixedValue[0] || attrValue[1] != attrib.fixedValue[1] || attrValue[2] != attrib.fixedValue[2] ||
+				attrValue[3] != attrib.fixedValue[3]) {
+				std::memcpy(attrValue.data(), attrib.fixedValue.data(), sizeof(attrib.fixedValue));
+				glVertexAttrib4f(i, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]);
+			}
+		} else if (accel->enabledAttributeMask & attributeMask) {
+			glVertexAttribPointer(
+				i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride,
+				reinterpret_cast<GLvoid*>(vertexBufferOffset + attrib.offset)
+			);
+		}
+	}
+}
\ No newline at end of file
diff --git a/src/core/services/fonts.cpp b/src/core/services/fonts.cpp
new file mode 100644
index 00000000..ec4652ee
--- /dev/null
+++ b/src/core/services/fonts.cpp
@@ -0,0 +1,109 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+// Adapted from https://github.com/PabloMK7/citra/blob/master/src/core/hle/service/apt/bcfnt/bcfnt.cpp
+
+#include "services/fonts.hpp"
+
+#include <cstring>
+
+namespace HLE::Fonts {
+	void relocateSharedFont(u8* sharedFont, u32 newAddress) {
+		constexpr u32 sharedFontStartOffset = 0x80;
+		const u8* cfntData = &sharedFont[sharedFontStartOffset];
+
+		CFNT cfnt;
+		std::memcpy(&cfnt, cfntData, sizeof(cfnt));
+
+		u32 assumedCmapOffset = 0;
+		u32 assumedCwdhOffset = 0;
+		u32 assumedTglpOffset = 0;
+		u32 firstCmapOffset = 0;
+		u32 firstCwdhOffset = 0;
+		u32 firstTglpOffset = 0;
+
+		// First discover the location of sections so that the rebase offset can be auto-detected
+		u32 currentOffset = sharedFontStartOffset + cfnt.headerSize;
+		for (uint block = 0; block < cfnt.numBlocks; ++block) {
+			const u8* data = &sharedFont[currentOffset];
+
+			SectionHeader sectionHeader;
+			std::memcpy(&sectionHeader, data, sizeof(sectionHeader));
+
+			if (firstCmapOffset == 0 && std::memcmp(sectionHeader.magic, "CMAP", 4) == 0) {
+				firstCmapOffset = currentOffset;
+			} else if (firstCwdhOffset == 0 && std::memcmp(sectionHeader.magic, "CWDH", 4) == 0) {
+				firstCwdhOffset = currentOffset;
+			} else if (firstTglpOffset == 0 && std::memcmp(sectionHeader.magic, "TGLP", 4) == 0) {
+				firstTglpOffset = currentOffset;
+			} else if (std::memcmp(sectionHeader.magic, "FINF", 4) == 0) {
+				Fonts::FINF finf;
+				std::memcpy(&finf, data, sizeof(finf));
+
+				assumedCmapOffset = finf.cmapOffset - sizeof(SectionHeader);
+				assumedCwdhOffset = finf.cwdhOffset - sizeof(SectionHeader);
+				assumedTglpOffset = finf.tglpOffset - sizeof(SectionHeader);
+			}
+
+			currentOffset += sectionHeader.sectionSize;
+		}
+
+		u32 previousBase = assumedCmapOffset - firstCmapOffset;
+		if ((previousBase != assumedCwdhOffset - firstCwdhOffset) || (previousBase != assumedTglpOffset - firstTglpOffset)) {
+			Helpers::warn("You shouldn't be seeing this. Shared Font file offsets might be borked?");
+		}
+
+		u32 offset = newAddress - previousBase;
+
+		// Reset pointer back to start of sections and do the actual rebase
+		currentOffset = sharedFontStartOffset + cfnt.headerSize;
+		for (uint block = 0; block < cfnt.numBlocks; ++block) {
+			u8* data = &sharedFont[currentOffset];
+
+			SectionHeader sectionHeader;
+			std::memcpy(&sectionHeader, data, sizeof(sectionHeader));
+
+			if (std::memcmp(sectionHeader.magic, "FINF", 4) == 0) {
+				Fonts::FINF finf;
+				std::memcpy(&finf, data, sizeof(finf));
+
+				// Relocate the offsets in the FINF section
+				finf.cmapOffset += offset;
+				finf.cwdhOffset += offset;
+				finf.tglpOffset += offset;
+
+				std::memcpy(data, &finf, sizeof(finf));
+			} else if (std::memcmp(sectionHeader.magic, "CMAP", 4) == 0) {
+				Fonts::CMAP cmap;
+				std::memcpy(&cmap, data, sizeof(cmap));
+
+				// Relocate the offsets in the CMAP section
+				if (cmap.nextCmapOffset != 0) {
+					cmap.nextCmapOffset += offset;
+				}
+
+				std::memcpy(data, &cmap, sizeof(cmap));
+			} else if (std::memcmp(sectionHeader.magic, "CWDH", 4) == 0) {
+				Fonts::CWDH cwdh;
+				std::memcpy(&cwdh, data, sizeof(cwdh));
+
+				// Relocate the offsets in the CWDH section
+				if (cwdh.nextCwdhOffset != 0) {
+					cwdh.nextCwdhOffset += offset;
+				}
+
+				std::memcpy(data, &cwdh, sizeof(cwdh));
+			} else if (std::memcmp(sectionHeader.magic, "TGLP", 4) == 0) {
+				Fonts::TGLP tglp;
+				std::memcpy(&tglp, data, sizeof(tglp));
+
+				// Relocate the offsets in the TGLP section
+				tglp.sheetDataOffset += offset;
+				std::memcpy(data, &tglp, sizeof(tglp));
+			}
+
+			currentOffset += sectionHeader.sectionSize;
+		}
+	}
+}  // namespace HLE::Fonts
diff --git a/src/core/services/fonts/CitraSharedFontUSRelocated.bin b/src/core/services/fonts/SharedFontReplacement.bin
similarity index 100%
rename from src/core/services/fonts/CitraSharedFontUSRelocated.bin
rename to src/core/services/fonts/SharedFontReplacement.bin
diff --git a/src/jni_driver.cpp b/src/jni_driver.cpp
index e4ce2b39..fbfae8ff 100644
--- a/src/jni_driver.cpp
+++ b/src/jni_driver.cpp
@@ -8,6 +8,7 @@
 #include "renderer_gl/renderer_gl.hpp"
 #include "services/hid.hpp"
 #include "android_utils.hpp"
+#include "sdl_sensors.hpp"
 
 std::unique_ptr<Emulator> emulator = nullptr;
 HIDService* hidService = nullptr;
@@ -43,6 +44,7 @@ extern "C" {
 AlberFunction(void, functionName) (JNIEnv* env, jobject obj, type value) { emulator->getConfig().settingName = value; }
 
 MAKE_SETTING(setShaderJitEnabled, jboolean, shaderJitEnabled)
+MAKE_SETTING(setAccurateShaderMulEnable, jboolean, accurateShaderMul)
 
 #undef MAKE_SETTING
 
@@ -87,6 +89,7 @@ AlberFunction(void, Finalize)(JNIEnv* env, jobject obj) {
 	emulator = nullptr;
 	hidService = nullptr;
 	renderer = nullptr;
+	romLoaded = false;
 }
 
 AlberFunction(jboolean, HasRomLoaded)(JNIEnv* env, jobject obj) { return romLoaded; }
@@ -110,6 +113,19 @@ AlberFunction(void, TouchScreenUp)(JNIEnv* env, jobject obj) { hidService->relea
 AlberFunction(void, KeyUp)(JNIEnv* env, jobject obj, jint keyCode) { hidService->releaseKey((u32)keyCode); }
 AlberFunction(void, KeyDown)(JNIEnv* env, jobject obj, jint keyCode) { hidService->pressKey((u32)keyCode); }
 
+AlberFunction(void, SetGyro)(JNIEnv* env, jobject obj, jfloat roll, jfloat pitch, jfloat yaw) {
+    auto rotation = Sensors::SDL::convertRotation({ float(roll), float(pitch), float(yaw) });
+    hidService->setPitch(s16(rotation.x));
+    hidService->setRoll(s16(rotation.y));
+    hidService->setYaw(s16(rotation.z));
+}
+
+AlberFunction(void, SetAccel)(JNIEnv* env, jobject obj, jfloat rawX, jfloat rawY, jfloat rawZ) {
+    float data[3] = { float(rawX), float(rawY), float(rawZ) };
+    auto accel = Sensors::SDL::convertAcceleration(data);
+    hidService->setAccel(accel.x, accel.y, accel.z);
+}
+
 AlberFunction(void, SetCirclepadAxis)(JNIEnv* env, jobject obj, jint x, jint y) {
 	hidService->setCirclepadX((s16)x);
 	hidService->setCirclepadY((s16)y);
@@ -139,4 +155,4 @@ int AndroidUtils::openDocument(const char* path, const char* perms) {
     env->DeleteLocalRef(jmode);
 
     return (int)result;
-}
\ No newline at end of file
+}
diff --git a/src/libretro_core.cpp b/src/libretro_core.cpp
index 3f92cddd..ce5fddaf 100644
--- a/src/libretro_core.cpp
+++ b/src/libretro_core.cpp
@@ -163,13 +163,14 @@ static int fetchVariableRange(std::string key, int min, int max) {
 
 static void configInit() {
 	static const retro_variable values[] = {
-		{"panda3ds_use_shader_jit", EmulatorConfig::shaderJitDefault ? "Enable shader JIT; enabled|disabled"
-																	  : "Enable shader JIT; disabled|enabled"},
+		{"panda3ds_use_shader_jit", EmulatorConfig::shaderJitDefault ? "Enable shader JIT; enabled|disabled" : "Enable shader JIT; disabled|enabled"},
+		{"panda3ds_accelerate_shaders",
+		 EmulatorConfig::accelerateShadersDefault ? "Run 3DS shaders on the GPU; enabled|disabled" : "Run 3DS shaders on the GPU; disabled|enabled"},
 		{"panda3ds_accurate_shader_mul", "Enable accurate shader multiplication; disabled|enabled"},
 		{"panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault ? "Use ubershaders (No stutter, maybe slower); enabled|disabled"
 																	  : "Use ubershaders (No stutter, maybe slower); disabled|enabled"},
 		{"panda3ds_use_vsync", "Enable VSync; enabled|disabled"},
-		{"panda3ds_dsp_emulation", "DSP emulation; Null|HLE|LLE"},
+		{"panda3ds_dsp_emulation", "DSP emulation; HLE|LLE|Null"},
 		{"panda3ds_use_audio", "Enable audio; disabled|enabled"},
 		{"panda3ds_use_virtual_sd", "Enable virtual SD card; enabled|disabled"},
 		{"panda3ds_write_protect_virtual_sd", "Write protect virtual SD card; disabled|enabled"},
@@ -197,6 +198,8 @@ static void configUpdate() {
 	config.sdWriteProtected = fetchVariableBool("panda3ds_write_protect_virtual_sd", false);
 	config.accurateShaderMul = fetchVariableBool("panda3ds_accurate_shader_mul", false);
 	config.useUbershaders = fetchVariableBool("panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault);
+	config.accelerateShaders = fetchVariableBool("panda3ds_accelerate_shaders", EmulatorConfig::accelerateShadersDefault);
+
 	config.forceShadergenForLights = fetchVariableBool("panda3ds_ubershader_lighting_override", true);
 	config.lightShadergenThreshold = fetchVariableRange("panda3ds_ubershader_lighting_override_threshold", 1, 8);
 	config.discordRpcEnabled = false;
diff --git a/src/lua.cpp b/src/lua.cpp
index 6a16ab5b..5b78cec2 100644
--- a/src/lua.cpp
+++ b/src/lua.cpp
@@ -130,6 +130,32 @@ MAKE_MEMORY_FUNCTIONS(32)
 MAKE_MEMORY_FUNCTIONS(64)
 #undef MAKE_MEMORY_FUNCTIONS
 
+static int readFloatThunk(lua_State* L) {
+	const u32 vaddr = (u32)lua_tonumber(L, 1);
+	lua_pushnumber(L, (lua_Number)Helpers::bit_cast<float, u32>(LuaManager::g_emulator->getMemory().read32(vaddr)));
+	return 1;
+}
+
+static int writeFloatThunk(lua_State* L) {
+	const u32 vaddr = (u32)lua_tonumber(L, 1);
+	const float value = (float)lua_tonumber(L, 2);
+	LuaManager::g_emulator->getMemory().write32(vaddr, Helpers::bit_cast<u32, float>(value));
+	return 0;
+}
+
+static int readDoubleThunk(lua_State* L) {
+	const u32 vaddr = (u32)lua_tonumber(L, 1);
+	lua_pushnumber(L, (lua_Number)Helpers::bit_cast<double, u64>(LuaManager::g_emulator->getMemory().read64(vaddr)));
+	return 1;
+}
+
+static int writeDoubleThunk(lua_State* L) {
+	const u32 vaddr = (u32)lua_tonumber(L, 1);
+	const double value = (double)lua_tonumber(L, 2);
+	LuaManager::g_emulator->getMemory().write64(vaddr, Helpers::bit_cast<u64, double>(value));
+	return 0;
+}
+
 static int getAppIDThunk(lua_State* L) {
 	std::optional<u64> id = LuaManager::g_emulator->getMemory().getProgramID();
 	
@@ -248,10 +274,14 @@ static constexpr luaL_Reg functions[] = {
 	{ "__read16", read16Thunk },
 	{ "__read32", read32Thunk },
 	{ "__read64", read64Thunk },
+	{ "__readFloat", readFloatThunk },
+	{ "__readDouble", readDoubleThunk },
 	{ "__write8", write8Thunk} ,
 	{ "__write16", write16Thunk },
 	{ "__write32", write32Thunk },
 	{ "__write64", write64Thunk },
+	{ "__writeFloat", writeFloatThunk },
+	{ "__writeDouble", writeDoubleThunk },
 	{ "__getAppID", getAppIDThunk },
 	{ "__pause", pauseThunk }, 
 	{ "__resume", resumeThunk },
@@ -273,10 +303,15 @@ void LuaManager::initializeThunks() {
 		read16 = function(addr) return GLOBALS.__read16(addr) end,
 		read32 = function(addr) return GLOBALS.__read32(addr) end,
 		read64 = function(addr) return GLOBALS.__read64(addr) end,
+		readFloat = function(addr) return GLOBALS.__readFloat(addr) end,
+		readDouble = function(addr) return GLOBALS.__readDouble(addr) end,
+
 		write8 = function(addr, value) GLOBALS.__write8(addr, value) end,
 		write16 = function(addr, value) GLOBALS.__write16(addr, value) end,
 		write32 = function(addr, value) GLOBALS.__write32(addr, value) end,
 		write64 = function(addr, value) GLOBALS.__write64(addr, value) end,
+		writeFloat = function(addr, value) GLOBALS.__writeFloat(addr, value) end,
+		writeDouble = function(addr, value) GLOBALS.__writeDouble(addr, value) end,
 
 		getAppID = function()
 			local ffi = require("ffi")
diff --git a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/AlberDriver.java b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/AlberDriver.java
index f7a3394b..bb3945b5 100644
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/AlberDriver.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/AlberDriver.java
@@ -24,13 +24,16 @@ public class AlberDriver {
 	public static native void KeyUp(int code);
 	public static native void SetCirclepadAxis(int x, int y);
 	public static native void TouchScreenUp();
-	public static native void TouchScreenDown(int x, int y);
+	public static native void TouchScreenDown(int x, int y);;
+	public static native void SetGyro(float roll, float pitch, float yaw);
+	public static native void SetAccel(float x, float y, float z);
 	public static native void Pause();
 	public static native void Resume();
 	public static native void LoadLuaScript(String script);
 	public static native byte[] GetSmdh();
 
 	public static native void setShaderJitEnabled(boolean enable);
+	public static native void setAccurateShaderMulEnable(boolean enable);
 
 	public static int openDocument(String path, String mode) {
 		try {
diff --git a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/GameActivity.java b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/GameActivity.java
index 83d18d99..503684ac 100644
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/GameActivity.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/GameActivity.java
@@ -3,11 +3,22 @@ package com.panda3ds.pandroid.app;
 import android.app.ActivityManager;
 import android.app.PictureInPictureParams;
 import android.content.Intent;
+import android.content.res.Configuration;
+import android.hardware.Sensor;
+import android.hardware.SensorEvent;
+import android.hardware.SensorEventListener;
+import android.hardware.SensorManager;
+import android.opengl.Matrix;
 import android.os.Build;
 import android.os.Bundle;
+import android.renderscript.Matrix3f;
+import android.renderscript.Matrix4f;
+import android.util.Log;
 import android.util.Rational;
+import android.view.Display;
 import android.view.KeyEvent;
 import android.view.MotionEvent;
+import android.view.Surface;
 import android.view.View;
 import android.view.ViewGroup;
 import android.view.WindowManager;
@@ -25,6 +36,7 @@ import com.panda3ds.pandroid.app.game.EmulatorCallback;
 import com.panda3ds.pandroid.data.config.GlobalConfig;
 import com.panda3ds.pandroid.input.InputHandler;
 import com.panda3ds.pandroid.input.InputMap;
+import com.panda3ds.pandroid.math.Vector3;
 import com.panda3ds.pandroid.utils.Constants;
 import com.panda3ds.pandroid.view.PandaGlSurfaceView;
 import com.panda3ds.pandroid.view.PandaLayoutController;
@@ -32,7 +44,7 @@ import com.panda3ds.pandroid.view.ds.DsLayoutManager;
 import com.panda3ds.pandroid.view.renderer.ConsoleRenderer;
 import com.panda3ds.pandroid.view.utils.PerformanceView;
 
-public class GameActivity extends BaseActivity implements EmulatorCallback {
+public class GameActivity extends BaseActivity implements EmulatorCallback, SensorEventListener {
 	private final DrawerFragment drawerFragment = new DrawerFragment();
 	private final AlberInputListener inputListener = new AlberInputListener(this);
 	private ConsoleRenderer renderer;
@@ -74,6 +86,19 @@ public class GameActivity extends BaseActivity implements EmulatorCallback {
 			((FrameLayout) findViewById(R.id.panda_gl_frame)).addView(view, new FrameLayout.LayoutParams(ViewGroup.LayoutParams.WRAP_CONTENT, ViewGroup.LayoutParams.WRAP_CONTENT));
 		}
 		swapScreens(GlobalConfig.get(GlobalConfig.KEY_CURRENT_DS_LAYOUT));
+		registerSensors();
+	}
+
+	private void registerSensors() {
+		SensorManager sensorManager = (SensorManager) getSystemService(SENSOR_SERVICE);
+		Sensor accel = sensorManager.getDefaultSensor(Sensor.TYPE_ACCELEROMETER);
+		if (accel != null) {
+			sensorManager.registerListener(this, accel, 1);
+		}
+		Sensor gryro = sensorManager.getDefaultSensor(Sensor.TYPE_GYROSCOPE);
+		if (gryro != null) {
+			sensorManager.registerListener(this, gryro, 1);
+		}
 	}
 
 	private void changeOverlayVisibility(boolean visible) {
@@ -85,7 +110,7 @@ public class GameActivity extends BaseActivity implements EmulatorCallback {
 	@Override
 	protected void onResume() {
 		super.onResume();
-                getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
+		getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
 		getWindow().getDecorView().setSystemUiVisibility(View.SYSTEM_UI_FLAG_FULLSCREEN | View.SYSTEM_UI_FLAG_HIDE_NAVIGATION);
 		getWindow().addFlags(WindowManager.LayoutParams.FLAG_FULLSCREEN);
 		InputHandler.reset();
@@ -94,6 +119,7 @@ public class GameActivity extends BaseActivity implements EmulatorCallback {
 		if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O_MR1) {
 			getTheme().applyStyle(R.style.GameActivityNavigationBar, true);
 		}
+		registerSensors();
 	}
 
 	private void enablePIP() {
@@ -113,6 +139,7 @@ public class GameActivity extends BaseActivity implements EmulatorCallback {
 	protected void onPause() {
 		super.onPause();
 
+		((SensorManager)getSystemService(SENSOR_SERVICE)).unregisterListener(this);
 		InputHandler.reset();
 		if (GlobalConfig.get(GlobalConfig.KEY_PICTURE_IN_PICTURE)) {
 			if (Build.VERSION.SDK_INT > Build.VERSION_CODES.O) {
@@ -174,10 +201,48 @@ public class GameActivity extends BaseActivity implements EmulatorCallback {
 
 	@Override
 	protected void onDestroy() {
+		((SensorManager)getSystemService(SENSOR_SERVICE)).unregisterListener(this);
 		if (AlberDriver.HasRomLoaded()) {
 			AlberDriver.Finalize();
 		}
 
 		super.onDestroy();
 	}
+
+	private float getDeviceRotationAngle() {
+		if (getWindow().getDecorView() == null || getWindow().getDecorView().getDisplay() == null)
+			return 0.0f;
+
+		int rotation = getWindow().getDecorView().getDisplay().getRotation();
+		switch (rotation) {
+			case Surface.ROTATION_90: return 90.0f;
+			case Surface.ROTATION_180: return 180.0f;
+			case Surface.ROTATION_270: return -90.0f;
+			default: return 0.0f;
+		}
+	}
+
+	@Override
+	public void onSensorChanged(SensorEvent event) {
+		if (AlberDriver.HasRomLoaded()) {
+			Sensor sensor = event.sensor;
+			switch (sensor.getType()) {
+				case Sensor.TYPE_ACCELEROMETER: {
+					float[] values = event.values;
+					Vector3 vec3 = new Vector3(values[0], values[1], values[2]);
+					vec3.rotateByEuler(new Vector3(0, 0, (float) (getDeviceRotationAngle() * (Math.PI / 180.0f))));
+					AlberDriver.SetAccel(vec3.x, vec3.y, vec3.z);
+				} break;
+				case Sensor.TYPE_GYROSCOPE: {
+					float[] values = event.values;
+					Vector3 vec3 = new Vector3(values[0], values[1], values[2]);
+					vec3.rotateByEuler(new Vector3(0, 0, (float) (getDeviceRotationAngle() * (Math.PI / 180.0f))));
+					AlberDriver.SetGyro(vec3.x, vec3.y, vec3.z);
+				} break;
+			}
+		}
+	}
+
+	@Override
+	public void onAccuracyChanged(Sensor sensor, int accuracy) {}
 }
diff --git a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/base/BasePreferenceFragment.java b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/base/BasePreferenceFragment.java
index 9426c098..ae8d49ad 100644
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/base/BasePreferenceFragment.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/base/BasePreferenceFragment.java
@@ -26,6 +26,10 @@ public abstract class BasePreferenceFragment extends PreferenceFragmentCompat {
 		((SwitchPreferenceCompat)findPreference(id)).setChecked(value);
 	}
 
+	protected void setSummaryValue(String id,String text) {
+		findPreference(id).setSummary(text);
+	}
+
 	protected void setActivityTitle(@StringRes int titleId) {
 		ActionBar header = ((AppCompatActivity) requireActivity()).getSupportActionBar();
 		if (header != null) {
diff --git a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/AdvancedPreferences.java b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/AdvancedPreferences.java
index 176bab14..8d04403e 100644
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/AdvancedPreferences.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/AdvancedPreferences.java
@@ -22,6 +22,7 @@ public class AdvancedPreferences extends BasePreferenceFragment {
 
         setItemClick("performanceMonitor", pref -> GlobalConfig.set(GlobalConfig.KEY_SHOW_PERFORMANCE_OVERLAY, ((SwitchPreferenceCompat) pref).isChecked()));
         setItemClick("shaderJit", pref -> GlobalConfig.set(GlobalConfig.KEY_SHADER_JIT, ((SwitchPreferenceCompat) pref).isChecked()));
+        setItemClick("accurateShaderMul", pref -> GlobalConfig.set(GlobalConfig.KEY_ACCURATE_SHADER_MULTIPLY, ((SwitchPreferenceCompat) pref).isChecked()));
         setItemClick("loggerService", pref -> {
             boolean checked = ((SwitchPreferenceCompat) pref).isChecked();
             Context ctx = PandroidApplication.getAppContext();
@@ -46,5 +47,6 @@ public class AdvancedPreferences extends BasePreferenceFragment {
         ((SwitchPreferenceCompat) findPreference("performanceMonitor")).setChecked(GlobalConfig.get(GlobalConfig.KEY_SHOW_PERFORMANCE_OVERLAY));
         ((SwitchPreferenceCompat) findPreference("loggerService")).setChecked(GlobalConfig.get(GlobalConfig.KEY_LOGGER_SERVICE));
         ((SwitchPreferenceCompat) findPreference("shaderJit")).setChecked(GlobalConfig.get(GlobalConfig.KEY_SHADER_JIT));
+        ((SwitchPreferenceCompat) findPreference("accurateShaderMul")).setChecked(GlobalConfig.get(GlobalConfig.KEY_ACCURATE_SHADER_MULTIPLY));
     }
 }
diff --git a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/GeneralPreferences.java b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/GeneralPreferences.java
index 0b003db9..86182c3b 100644
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/GeneralPreferences.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/GeneralPreferences.java
@@ -1,7 +1,13 @@
 package com.panda3ds.pandroid.app.preferences;
 
+import android.net.Uri;
 import android.os.Bundle;
+import android.util.Log;
+import android.widget.Toast;
 
+import androidx.activity.result.ActivityResultCallback;
+import androidx.activity.result.ActivityResultLauncher;
+import androidx.activity.result.contract.ActivityResultContracts;
 import androidx.annotation.Nullable;
 import androidx.preference.SwitchPreferenceCompat;
 
@@ -10,8 +16,11 @@ import com.panda3ds.pandroid.app.PreferenceActivity;
 import com.panda3ds.pandroid.app.base.BasePreferenceFragment;
 import com.panda3ds.pandroid.app.preferences.screen_editor.ScreenLayoutsPreference;
 import com.panda3ds.pandroid.data.config.GlobalConfig;
+import com.panda3ds.pandroid.utils.FileUtils;
 
-public class GeneralPreferences extends BasePreferenceFragment {
+public class GeneralPreferences extends BasePreferenceFragment implements ActivityResultCallback<Uri> {
+    private final ActivityResultContracts.OpenDocument openFolderContract = new ActivityResultContracts.OpenDocument();
+    private ActivityResultLauncher<String[]> pickFileRequest;
     @Override
     public void onCreatePreferences(@Nullable Bundle savedInstanceState, @Nullable String rootKey) {
         setPreferencesFromResource(R.xml.general_preference, rootKey);
@@ -21,6 +30,11 @@ public class GeneralPreferences extends BasePreferenceFragment {
         setItemClick("behavior.pictureInPicture", (pref)-> GlobalConfig.set(GlobalConfig.KEY_PICTURE_IN_PICTURE, ((SwitchPreferenceCompat)pref).isChecked()));
         setActivityTitle(R.string.general);
         refresh();
+
+        setItemClick("games.aes_key", pref -> pickFileRequest.launch(new String[]{ "text/plain" }));
+        setItemClick("games.seed_db", pref -> pickFileRequest.launch(new String[]{ "application/octet-stream" }));
+
+        pickFileRequest = registerForActivityResult(openFolderContract, this);
     }
 
     @Override
@@ -31,5 +45,45 @@ public class GeneralPreferences extends BasePreferenceFragment {
 
     private void refresh() {
         setSwitchValue("behavior.pictureInPicture", GlobalConfig.get(GlobalConfig.KEY_PICTURE_IN_PICTURE));
+        setSummaryValue("games.aes_key", String.format(getString(FileUtils.exists(FileUtils.getPrivatePath()+"/sysdata/aes_keys.txt") ? R.string.file_available : R.string.file_not_available), "aes_keys.txt"));
+        setSummaryValue("games.seed_db", String.format(getString(FileUtils.exists(FileUtils.getPrivatePath()+"/sysdata/seeddb.bin") ? R.string.file_available : R.string.file_not_available), "seeddb.bin"));
     }
+
+	@Override
+	public void onDestroy() {
+		super.onDestroy();
+		if (pickFileRequest != null) {
+			pickFileRequest.unregister();
+			pickFileRequest = null;
+		}
+	}
+
+	@Override
+	public void onActivityResult(Uri result) {
+		if (result != null) {
+			String path = result.toString();
+			Log.w("File", path + " -> " + FileUtils.getName(path));
+			switch (String.valueOf(FileUtils.getName(path))) {
+				case "aes_keys.txt":
+				case "seeddb.bin": {
+					String name = FileUtils.getName(path);
+					if (FileUtils.getLength(path) < 1024 * 256) {
+						String sysdataFolder = FileUtils.getPrivatePath() + "/sysdata";
+						if (!FileUtils.exists(sysdataFolder)) {
+							FileUtils.createDir(FileUtils.getPrivatePath(), "sysdata");
+						}
+						if (FileUtils.exists(sysdataFolder + "/" + name)) {
+							FileUtils.delete(sysdataFolder + "/" + name);
+						}
+						FileUtils.copyFile(path, FileUtils.getPrivatePath() + "/sysdata/", name);
+						Toast.makeText(getActivity(), String.format(getString(R.string.file_imported), name), Toast.LENGTH_LONG).show();
+					} else {
+						Toast.makeText(getActivity(), R.string.invalid_file, Toast.LENGTH_LONG).show();
+					}
+				} break;
+				default: Toast.makeText(getActivity(), R.string.invalid_file, Toast.LENGTH_LONG).show(); break;
+			}
+			refresh();
+		}
+	}
 }
diff --git a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/screen_editor/ScreenEditorPreference.java b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/screen_editor/ScreenEditorPreference.java
index 4bc6e299..14c4e576 100644
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/screen_editor/ScreenEditorPreference.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/screen_editor/ScreenEditorPreference.java
@@ -23,7 +23,7 @@ public class ScreenEditorPreference extends Fragment {
     @Override
     public View onCreateView(@NonNull LayoutInflater inflater, @Nullable ViewGroup container, @Nullable Bundle savedInstanceState) {
         layout = new LinearLayout(container.getContext());
-        layout.setSystemUiVisibility(View.SYSTEM_UI_FLAG_HIDE_NAVIGATION|View.SYSTEM_UI_FLAG_FULLSCREEN|View.SYSTEM_UI_FLAG_IMMERSIVE);
+        layout.setSystemUiVisibility(View.SYSTEM_UI_FLAG_FULLSCREEN|View.SYSTEM_UI_FLAG_IMMERSIVE);
         return layout;
     }
 
diff --git a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/provider/AppDataDocumentProvider.java b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/provider/AppDataDocumentProvider.java
index ca6fad90..397eef05 100644
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/provider/AppDataDocumentProvider.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/provider/AppDataDocumentProvider.java
@@ -95,7 +95,7 @@ public class AppDataDocumentProvider extends DocumentsProvider {
     private void includeFile(MatrixCursor cursor, File file) {
         int flags = 0;
         if (file.isDirectory()) {
-            flags = Document.FLAG_DIR_SUPPORTS_CREATE;
+            flags = Document.FLAG_DIR_SUPPORTS_CREATE | Document.FLAG_SUPPORTS_DELETE;
         } else {
             flags = Document.FLAG_SUPPORTS_WRITE | Document.FLAG_SUPPORTS_REMOVE | Document.FLAG_SUPPORTS_DELETE;
         }
diff --git a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/data/config/GlobalConfig.java b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/data/config/GlobalConfig.java
index 448d561a..c8750f88 100644
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/data/config/GlobalConfig.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/data/config/GlobalConfig.java
@@ -22,6 +22,7 @@ public class GlobalConfig {
     public static DataModel data;
 
     public static final Key<Boolean> KEY_SHADER_JIT = new Key<>("emu.shader_jit", true);
+    public static final Key<Boolean> KEY_ACCURATE_SHADER_MULTIPLY = new Key<>("emu.accurate_shader_mul", false);
     public static final Key<Boolean> KEY_PICTURE_IN_PICTURE = new Key<>("app.behavior.pictureInPicture", false);
     public static final Key<Boolean> KEY_SHOW_PERFORMANCE_OVERLAY = new Key<>("dev.performanceOverlay", false);
     public static final Key<Boolean> KEY_LOGGER_SERVICE = new Key<>("dev.loggerService", false);
diff --git a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/math/Quaternion.java b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/math/Quaternion.java
new file mode 100644
index 00000000..7c485c6c
--- /dev/null
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/math/Quaternion.java
@@ -0,0 +1,31 @@
+package com.panda3ds.pandroid.math;
+
+public class Quaternion {
+	public float x, y, z, w;
+	public Quaternion(float x, float y, float z, float w) {
+		this.x = x;
+		this.y = y;
+		this.z = z;
+		this.w = w;
+	}
+
+	public Quaternion fromEuler(Vector3 euler) {
+		float x = euler.x;
+		float y = euler.y;
+		float z = euler.z;
+
+		double c1 = Math.cos(x / 2.0);
+		double c2 = Math.cos(y / 2.0);
+		double c3 = Math.cos(z / 2.0);
+
+		double s1 = Math.sin(x / 2.0);
+		double s2 = Math.sin(y / 2.0);
+		double s3 = Math.sin(z / 2.0);
+
+		this.x = (float) (s1 * c2 * c3 + c1 * s2 * s3);
+		this.y = (float) (c1 * s2 * c3 - s1 * c2 * s3);
+		this.z = (float) (c1 * c2 * s3 + s1 * s2 * c3);
+		this.w = (float) (c1 * c2 * c3 - s1 * s2 * s3);
+		return this;
+	}
+}
diff --git a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/math/Vector3.java b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/math/Vector3.java
new file mode 100644
index 00000000..055972ec
--- /dev/null
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/math/Vector3.java
@@ -0,0 +1,32 @@
+package com.panda3ds.pandroid.math;
+
+public class Vector3 {
+	private final Quaternion quaternion = new Quaternion(0, 0, 0, 0);
+	public float x, y, z;
+
+	public Vector3(float x, float y, float z) {
+		this.x = x;
+		this.y = y;
+		this.z = z;
+	}
+
+	public Vector3 rotateByEuler(Vector3 euler) {
+		this.quaternion.fromEuler(euler);
+
+		float x = this.x, y = this.y, z = this.z;
+		float qx = this.quaternion.x;
+		float qy = this.quaternion.y;
+		float qz = this.quaternion.z;
+		float qw = this.quaternion.w;
+
+		float ix = qw * x + qy * z - qz * y;
+		float iy = qw * y + qz * x - qx * z;
+		float iz = qw * z + qx * y - qy * x;
+		float iw = -qx * x - qy * qz * z;
+
+		this.x = ix * qw + iw * -qx + iy * -qz - iz * -qy;
+		this.y = iy * qw + iw * -qy + iz * -qx - ix * -qz;
+		this.z = iz * qw + iw * -qz + ix * -qy - iy * -qx;
+		return this;
+	}
+}
diff --git a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/utils/FileUtils.java b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/utils/FileUtils.java
index 85245454..26b029d9 100644
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/utils/FileUtils.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/utils/FileUtils.java
@@ -230,6 +230,10 @@ public class FileUtils {
         return parseFile(path).lastModified();
     }
 
+    public static long getLength(String path) {
+        return parseFile(path).length();
+    }
+
     public static String[] listFiles(String path) {
         DocumentFile folder = parseFile(path);
         DocumentFile[] files = folder.listFiles();
diff --git a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/view/PandaGlRenderer.java b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/view/PandaGlRenderer.java
index c57421ab..3fb435b4 100644
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/view/PandaGlRenderer.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/view/PandaGlRenderer.java
@@ -93,6 +93,7 @@ public class PandaGlRenderer implements GLSurfaceView.Renderer, ConsoleRenderer
 
 		AlberDriver.Initialize();
 		AlberDriver.setShaderJitEnabled(GlobalConfig.get(GlobalConfig.KEY_SHADER_JIT));
+		AlberDriver.setAccurateShaderMulEnable(GlobalConfig.get(GlobalConfig.KEY_ACCURATE_SHADER_MULTIPLY));
 
 		// If loading the ROM failed, display an error message and early exit
 		if (!AlberDriver.LoadRom(romPath)) {
diff --git a/src/pandroid/app/src/main/res/values-pt-rBR/strings.xml b/src/pandroid/app/src/main/res/values-pt-rBR/strings.xml
index f2e144c3..521f199e 100644
--- a/src/pandroid/app/src/main/res/values-pt-rBR/strings.xml
+++ b/src/pandroid/app/src/main/res/values-pt-rBR/strings.xml
@@ -90,4 +90,12 @@
     <string name="behavior">Comportamento</string>
     <string name="invalid_game">Jogo invalido</string>
     <string name="tools">Ferramentas</string>
+    <string name="pref_accurate_shader_title">Multiplicação precisa de shader</string>
+    <string name="pref_accurate_shader_summary">Usar calculos mais precisos para shaders</string>
+    <string name="pref_game_crypto_keys">Importar chaves</string>
+    <string name="file_available">%s disponível</string>
+    <string name="file_not_available">%s não disponível</string>
+    <string name="pref_game_seed_db_keys">Importar SeedDB</string>
+    <string name="invalid_file">Arquivo inválido</string>
+    <string name="file_imported">%s Importado</string>
 </resources>
diff --git a/src/pandroid/app/src/main/res/values/strings.xml b/src/pandroid/app/src/main/res/values/strings.xml
index 25569528..63a6c246 100644
--- a/src/pandroid/app/src/main/res/values/strings.xml
+++ b/src/pandroid/app/src/main/res/values/strings.xml
@@ -96,4 +96,12 @@
     <string name="region_taiwan">Taiwan</string>
     <string name="behavior">Behavior</string>
     <string name="invalid_game">Invalid game</string>
+    <string name="pref_accurate_shader_title">Accurate shader multiplication</string>
+    <string name="pref_accurate_shader_summary">Can improve rendering at a small performance loss</string>
+    <string name="pref_game_crypto_keys">Import keys</string>
+    <string name="file_imported">%s imported</string>
+    <string name="file_available">%s available</string>
+    <string name="file_not_available">%s not available</string>
+    <string name="pref_game_seed_db_keys">Import SeedDB</string>
+    <string name="invalid_file">Invalid file</string>
 </resources>
diff --git a/src/pandroid/app/src/main/res/xml/advanced_preferences.xml b/src/pandroid/app/src/main/res/xml/advanced_preferences.xml
index 6602fdfd..9ef81dbf 100644
--- a/src/pandroid/app/src/main/res/xml/advanced_preferences.xml
+++ b/src/pandroid/app/src/main/res/xml/advanced_preferences.xml
@@ -28,5 +28,11 @@
             app:summary="@string/pref_shader_jit_summary"
             app:iconSpaceReserved="false"/>
 
+        <SwitchPreferenceCompat
+            app:key="accurateShaderMul"
+            app:title="@string/pref_accurate_shader_title"
+            app:summary="@string/pref_accurate_shader_summary"
+            app:iconSpaceReserved="false"/>
+
     </PreferenceCategory>
 </PreferenceScreen>
\ No newline at end of file
diff --git a/src/pandroid/app/src/main/res/xml/general_preference.xml b/src/pandroid/app/src/main/res/xml/general_preference.xml
index 3e2d93c8..4352ee54 100644
--- a/src/pandroid/app/src/main/res/xml/general_preference.xml
+++ b/src/pandroid/app/src/main/res/xml/general_preference.xml
@@ -23,6 +23,16 @@
             app:title="@string/pref_game_folders"
             app:summary="@string/pref_game_folders_summary"
             app:iconSpaceReserved="false"/>
+        <Preference
+            android:key="games.aes_key"
+            app:title="@string/pref_game_crypto_keys"
+            app:summary="@string/pref_game_crypto_keys"
+            app:iconSpaceReserved="false"/>
+        <Preference
+            android:key="games.seed_db"
+            app:title="@string/pref_game_seed_db_keys"
+            app:summary="@string/pref_game_crypto_keys"
+            app:iconSpaceReserved="false"/>
     </PreferenceCategory>
     <PreferenceCategory
         app:title="@string/behavior"
diff --git a/third_party/duckstation/gl/stream_buffer.cpp b/third_party/duckstation/gl/stream_buffer.cpp
new file mode 100644
index 00000000..b7a40603
--- /dev/null
+++ b/third_party/duckstation/gl/stream_buffer.cpp
@@ -0,0 +1,288 @@
+// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#include "gl/stream_buffer.h"
+
+#include <array>
+#include <cstdio>
+
+#include "align.hpp"
+
+OpenGLStreamBuffer::OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : m_target(target), m_buffer_id(buffer_id), m_size(size) {}
+OpenGLStreamBuffer::~OpenGLStreamBuffer() { glDeleteBuffers(1, &m_buffer_id); }
+
+void OpenGLStreamBuffer::Bind() { glBindBuffer(m_target, m_buffer_id); }
+void OpenGLStreamBuffer::Unbind() { glBindBuffer(m_target, 0); }
+
+void OpenGLStreamBuffer::SetDebugName(std::string_view name) {
+#ifdef GPU_DEBUG_INFO
+	if (glObjectLabel) {
+		glObjectLabel(GL_BUFFER, GetGLBufferId(), static_cast<GLsizei>(name.length()), static_cast<const GLchar*>(name.data()));
+	}
+#endif
+}
+
+namespace {
+	// Uses glBufferSubData() to update. Preferred for drivers which don't support {ARB,EXT}_buffer_storage.
+	class BufferSubDataStreamBuffer final : public OpenGLStreamBuffer {
+	  public:
+		~BufferSubDataStreamBuffer() override { Common::alignedFree(m_cpu_buffer); }
+
+		MappingResult Map(u32 alignment, u32 min_size) override { return MappingResult{static_cast<void*>(m_cpu_buffer), 0, 0, m_size / alignment}; }
+
+		u32 Unmap(u32 used_size) override {
+			if (used_size == 0) return 0;
+
+			glBindBuffer(m_target, m_buffer_id);
+			glBufferSubData(m_target, 0, used_size, m_cpu_buffer);
+			return 0;
+		}
+
+		u32 GetChunkSize() const override { return m_size; }
+
+		static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size) {
+			glGetError();
+
+			GLuint buffer_id;
+			glGenBuffers(1, &buffer_id);
+			glBindBuffer(target, buffer_id);
+			glBufferData(target, size, nullptr, GL_STREAM_DRAW);
+
+			GLenum err = glGetError();
+			if (err != GL_NO_ERROR) {
+				glBindBuffer(target, 0);
+				glDeleteBuffers(1, &buffer_id);
+				return {};
+			}
+
+			return std::unique_ptr<OpenGLStreamBuffer>(new BufferSubDataStreamBuffer(target, buffer_id, size));
+		}
+
+	  private:
+		BufferSubDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : OpenGLStreamBuffer(target, buffer_id, size) {
+			m_cpu_buffer = static_cast<u8*>(Common::alignedMalloc(size, 32));
+			if (!m_cpu_buffer) Panic("Failed to allocate CPU storage for GL buffer");
+		}
+
+		u8* m_cpu_buffer;
+	};
+
+	// Uses BufferData() to orphan the buffer after every update. Used on Mali where BufferSubData forces a sync.
+	class BufferDataStreamBuffer final : public OpenGLStreamBuffer {
+	  public:
+		~BufferDataStreamBuffer() override { Common::alignedFree(m_cpu_buffer); }
+
+		MappingResult Map(u32 alignment, u32 min_size) override { return MappingResult{static_cast<void*>(m_cpu_buffer), 0, 0, m_size / alignment}; }
+
+		u32 Unmap(u32 used_size) override {
+			if (used_size == 0) return 0;
+
+			glBindBuffer(m_target, m_buffer_id);
+			glBufferData(m_target, used_size, m_cpu_buffer, GL_STREAM_DRAW);
+			return 0;
+		}
+
+		u32 GetChunkSize() const override { return m_size; }
+
+		static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size) {
+			glGetError();
+
+			GLuint buffer_id;
+			glGenBuffers(1, &buffer_id);
+			glBindBuffer(target, buffer_id);
+			glBufferData(target, size, nullptr, GL_STREAM_DRAW);
+
+			GLenum err = glGetError();
+			if (err != GL_NO_ERROR) {
+				glBindBuffer(target, 0);
+				glDeleteBuffers(1, &buffer_id);
+				return {};
+			}
+
+			return std::unique_ptr<OpenGLStreamBuffer>(new BufferDataStreamBuffer(target, buffer_id, size));
+		}
+
+	  private:
+		BufferDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : OpenGLStreamBuffer(target, buffer_id, size) {
+			m_cpu_buffer = static_cast<u8*>(Common::alignedMalloc(size, 32));
+			if (!m_cpu_buffer) Panic("Failed to allocate CPU storage for GL buffer");
+		}
+
+		u8* m_cpu_buffer;
+	};
+
+	// Base class for implementations which require syncing.
+	class SyncingStreamBuffer : public OpenGLStreamBuffer {
+	  public:
+		enum : u32 { NUM_SYNC_POINTS = 16 };
+
+		virtual ~SyncingStreamBuffer() override {
+			for (u32 i = m_available_block_index; i <= m_used_block_index; i++) {
+				glDeleteSync(m_sync_objects[i]);
+			}
+		}
+
+	  protected:
+		SyncingStreamBuffer(GLenum target, GLuint buffer_id, u32 size)
+			: OpenGLStreamBuffer(target, buffer_id, size), m_bytes_per_block((size + (NUM_SYNC_POINTS)-1) / NUM_SYNC_POINTS) {}
+
+		ALWAYS_INLINE u32 GetSyncIndexForOffset(u32 offset) { return offset / m_bytes_per_block; }
+
+		ALWAYS_INLINE void AddSyncsForOffset(u32 offset) {
+			const u32 end = GetSyncIndexForOffset(offset);
+			for (; m_used_block_index < end; m_used_block_index++) {
+				if (m_sync_objects[m_used_block_index]) {
+					Helpers::warn("GL stream buffer: Fence slot we're trying to insert is already in use");
+				}
+
+				m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+			}
+		}
+
+		ALWAYS_INLINE void WaitForSync(GLsync& sync) {
+			glClientWaitSync(sync, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
+			glDeleteSync(sync);
+			sync = nullptr;
+		}
+
+		ALWAYS_INLINE void EnsureSyncsWaitedForOffset(u32 offset) {
+			const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);
+			for (; m_available_block_index < end; m_available_block_index++) {
+				if (!m_sync_objects[m_available_block_index]) [[unlikely]] {
+					Helpers::warn("GL stream buffer: Fence slot we're trying to wait on is not in use");
+				}
+
+				WaitForSync(m_sync_objects[m_available_block_index]);
+			}
+		}
+
+		void AllocateSpace(u32 size) {
+			// add sync objects for writes since the last allocation
+			AddSyncsForOffset(m_position);
+
+			// wait for sync objects for the space we want to use
+			EnsureSyncsWaitedForOffset(m_position + size);
+
+			// wrap-around?
+			if ((m_position + size) > m_size) {
+				// current position ... buffer end
+				AddSyncsForOffset(m_size);
+
+				// rewind, and try again
+				m_position = 0;
+
+				// wait for the sync at the start of the buffer
+				WaitForSync(m_sync_objects[0]);
+				m_available_block_index = 1;
+
+				// and however much more we need to satisfy the allocation
+				EnsureSyncsWaitedForOffset(size);
+				m_used_block_index = 0;
+			}
+		}
+
+		u32 GetChunkSize() const override { return m_size / NUM_SYNC_POINTS; }
+
+		u32 m_position = 0;
+		u32 m_used_block_index = 0;
+		u32 m_available_block_index = NUM_SYNC_POINTS;
+		u32 m_bytes_per_block;
+		std::array<GLsync, NUM_SYNC_POINTS> m_sync_objects{};
+	};
+
+	class BufferStorageStreamBuffer : public SyncingStreamBuffer {
+	  public:
+		~BufferStorageStreamBuffer() override {
+			glBindBuffer(m_target, m_buffer_id);
+			glUnmapBuffer(m_target);
+			glBindBuffer(m_target, 0);
+		}
+
+		MappingResult Map(u32 alignment, u32 min_size) override {
+			if (m_position > 0) m_position = Common::alignUp(m_position, alignment);
+
+			AllocateSpace(min_size);
+			if ((m_position + min_size) > (m_available_block_index * m_bytes_per_block)) [[unlikely]] {
+				Helpers::panic("GL stream buffer: Invalid size passed to Unmap");
+			}
+
+			const u32 free_space_in_block = ((m_available_block_index * m_bytes_per_block) - m_position);
+			return MappingResult{static_cast<void*>(m_mapped_ptr + m_position), m_position, m_position / alignment, free_space_in_block / alignment};
+		}
+
+		u32 Unmap(u32 used_size) override {
+			if ((m_position + used_size) > m_size) [[unlikely]] {
+				Helpers::panic("GL stream buffer: Invalid size passed to Unmap");
+			}
+
+			if (!m_coherent) {
+				if (GLAD_GL_VERSION_4_5 || GLAD_GL_ARB_direct_state_access) {
+					glFlushMappedNamedBufferRange(m_buffer_id, m_position, used_size);
+				} else {
+					Bind();
+					glFlushMappedBufferRange(m_target, m_position, used_size);
+				}
+			}
+
+			const u32 prev_position = m_position;
+			m_position += used_size;
+			return prev_position;
+		}
+
+		static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size, bool coherent = true) {
+			glGetError();
+
+			GLuint buffer_id;
+			glGenBuffers(1, &buffer_id);
+			glBindBuffer(target, buffer_id);
+
+			const u32 flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
+			const u32 map_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT);
+			if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage)
+				glBufferStorage(target, size, nullptr, flags);
+			else if (GLAD_GL_EXT_buffer_storage)
+				glBufferStorageEXT(target, size, nullptr, flags);
+
+			GLenum err = glGetError();
+			if (err != GL_NO_ERROR) {
+				glBindBuffer(target, 0);
+				glDeleteBuffers(1, &buffer_id);
+				return {};
+			}
+
+			u8* mapped_ptr = static_cast<u8*>(glMapBufferRange(target, 0, size, map_flags));
+			AssertMsg(mapped_ptr, "Persistent buffer was mapped");
+
+			return std::unique_ptr<OpenGLStreamBuffer>(new BufferStorageStreamBuffer(target, buffer_id, size, mapped_ptr, coherent));
+		}
+
+	  private:
+		BufferStorageStreamBuffer(GLenum target, GLuint buffer_id, u32 size, u8* mapped_ptr, bool coherent)
+			: SyncingStreamBuffer(target, buffer_id, size), m_mapped_ptr(mapped_ptr), m_coherent(coherent) {}
+
+		u8* m_mapped_ptr;
+		bool m_coherent;
+	};
+
+}  // namespace
+
+std::unique_ptr<OpenGLStreamBuffer> OpenGLStreamBuffer::Create(GLenum target, u32 size) {
+	std::unique_ptr<OpenGLStreamBuffer> buf;
+	if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage) {
+		buf = BufferStorageStreamBuffer::Create(target, size);
+		if (buf) return buf;
+	}
+
+	// BufferSubData is slower on all drivers except NVIDIA...
+#if 0
+	const char* vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
+	if (std::strcmp(vendor, "ARM") == 0 || std::strcmp(vendor, "Qualcomm") == 0) {
+		// Mali and Adreno drivers can't do sub-buffer tracking...
+		return BufferDataStreamBuffer::Create(target, size);
+	}
+
+	return BufferSubDataStreamBuffer::Create(target, size);
+#else
+	return BufferDataStreamBuffer::Create(target, size);
+#endif
+}
\ No newline at end of file
diff --git a/third_party/duckstation/gl/stream_buffer.h b/third_party/duckstation/gl/stream_buffer.h
new file mode 100644
index 00000000..6b3562e7
--- /dev/null
+++ b/third_party/duckstation/gl/stream_buffer.h
@@ -0,0 +1,53 @@
+// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#pragma once
+
+#include <glad/gl.h>
+// Comment to avoid clang-format reordering the glad header
+
+#include <memory>
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+#include "duckstation_compat.h"
+#include "helpers.hpp"
+
+class OpenGLStreamBuffer {
+  public:
+	virtual ~OpenGLStreamBuffer();
+
+	ALWAYS_INLINE GLuint GetGLBufferId() const { return m_buffer_id; }
+	ALWAYS_INLINE GLenum GetGLTarget() const { return m_target; }
+	ALWAYS_INLINE u32 GetSize() const { return m_size; }
+
+	void Bind();
+	void Unbind();
+
+	void SetDebugName(std::string_view name);
+
+	struct MappingResult {
+		void* pointer;
+		u32 buffer_offset;
+		u32 index_aligned;  // offset / alignment, suitable for base vertex
+		u32 space_aligned;  // remaining space / alignment
+	};
+
+	virtual MappingResult Map(u32 alignment, u32 min_size) = 0;
+
+	/// Returns the position in the buffer *before* the start of used_size.
+	virtual u32 Unmap(u32 used_size) = 0;
+
+	/// Returns the minimum granularity of blocks which sync objects will be created around.
+	virtual u32 GetChunkSize() const = 0;
+
+	static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size);
+
+  protected:
+	OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size);
+
+	GLenum m_target;
+	GLuint m_buffer_id;
+	u32 m_size;
+};
\ No newline at end of file
diff --git a/third_party/fmt b/third_party/fmt
new file mode 160000
index 00000000..f8581bce
--- /dev/null
+++ b/third_party/fmt
@@ -0,0 +1 @@
+Subproject commit f8581bcecf317e8753887b68187c9ef1ba0524f4