Merge branch 'master' into metal2

2025-07-07 15:52:59 +12:00 · 2024-10-31 13:45:58 +01:00 · 2024-10-31 13:45:58 +01:00 · 02f8250aff
commit 02f8250aff
parent 272c24d8e4 7ae8412919
69 changed files with 2906 additions and 319 deletions
--- a/.github/workflows/Android_Build.yml
+++ b/.github/workflows/Android_Build.yml
@ -8,7 +8,7 @@ on:

 jobs:
  x64:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

    strategy:
      matrix:
@ -73,7 +73,7 @@ jobs:
          ./src/pandroid/app/build/outputs/apk/${{ env.BUILD_TYPE }}/app-${{ env.BUILD_TYPE }}.apk

  arm64:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

    strategy:
      matrix:
--- a/.github/workflows/HTTP_Build.yml
+++ b/.github/workflows/HTTP_Build.yml
@ -16,7 +16,7 @@ jobs:
    # well on Windows or Mac.  You can convert this to a matrix build if you need
    # cross-platform coverage.
    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

    steps:
    - uses: actions/checkout@v4
--- a/.github/workflows/Hydra_Build.yml
+++ b/.github/workflows/Hydra_Build.yml
@ -98,7 +98,7 @@ jobs:
          ${{github.workspace}}/docs/libretro/panda3ds_libretro.info

  Linux:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

    steps:
    - uses: actions/checkout@v4
@ -107,7 +107,7 @@ jobs:

    - name: Install misc packages
      run: |
-       sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev libfuse2 libwayland-dev
+       sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev libfuse2 libwayland-dev
      
    - name: Install newer Clang
      run: |
@ -151,7 +151,7 @@ jobs:
          ${{github.workspace}}/docs/libretro/panda3ds_libretro.info

  Android-x64:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

    steps:
    - uses: actions/checkout@v4
@ -160,7 +160,7 @@ jobs:

    - name: Install misc packages
      run: |
-       sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev libfuse2 libwayland-dev
+       sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev libfuse2 libwayland-dev
       
    - name: Setup Vulkan SDK
      uses: humbletim/setup-vulkan-sdk@v1.2.0
--- a/.github/workflows/Linux_AppImage_Build.yml
+++ b/.github/workflows/Linux_AppImage_Build.yml
@ -16,7 +16,7 @@ jobs:
    # well on Windows or Mac.  You can convert this to a matrix build if you need
    # cross-platform coverage.
    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

    steps:
    - uses: actions/checkout@v4
@ -24,7 +24,7 @@ jobs:
      run: git submodule update --init --recursive

    - name: Install misc packages
-      run: sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev libfuse2
+      run: sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev libfuse2

    - name: Install newer Clang
      run: |
--- a/.github/workflows/Linux_Build.yml
+++ b/.github/workflows/Linux_Build.yml
@ -16,7 +16,7 @@ jobs:
    # well on Windows or Mac.  You can convert this to a matrix build if you need
    # cross-platform coverage.
    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

    steps:
    - uses: actions/checkout@v4
@ -24,7 +24,7 @@ jobs:
      run: git submodule update --init --recursive

    - name: Install misc packages
-      run: sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev
+      run: sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev

    - name: Install newer Clang
      run: |
--- a/.github/workflows/Qt_Build.yml
+++ b/.github/workflows/Qt_Build.yml
@ -96,7 +96,7 @@ jobs:
        path: 'Alber.zip'

  Linux:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

    steps:
    - uses: actions/checkout@v4
@ -105,8 +105,7 @@ jobs:

    - name: Install misc packages
      run: |
-       sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev libfuse2 libwayland-dev libgl1-mesa-dev
-       sudo add-apt-repository -y ppa:savoury1/qt-6-2
+       sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev libfuse2 libwayland-dev libgl1-mesa-dev
       sudo apt update
       sudo apt install qt6-base-dev qt6-base-private-dev

--- a/.gitmodules
+++ b/.gitmodules
@ -76,6 +76,9 @@
 [submodule "third_party/metal-cpp"]
 	path = third_party/metal-cpp
 	url = https://github.com/Panda3DS-emu/metal-cpp
+[submodule "third_party/fmt"]
+	path = third_party/fmt
+	url = https://github.com/fmtlib/fmt
 [submodule "third_party/fdk-aac"]
 	path = third_party/fdk-aac
 	url = https://github.com/Panda3DS-emu/fdk-aac/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -26,7 +26,7 @@ if(APPLE)
 endif()

 if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format-nonliteral -Wno-format-security")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format-nonliteral -Wno-format-security -Wno-invalid-offsetof")
 endif()

 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
@ -55,6 +55,7 @@ option(ENABLE_GIT_VERSIONING "Enables querying git for the emulator version" ON)
 option(BUILD_HYDRA_CORE "Build a Hydra core" OFF)
 option(BUILD_LIBRETRO_CORE "Build a Libretro core" OFF)
 option(ENABLE_RENDERDOC_API "Build with support for Renderdoc's capture API for graphics debugging" ON)
+option(DISABLE_SSE4 "Build with SSE4 instructions disabled, may reduce performance" OFF)

 set(OPENGL_PROFILE ${DEFAULT_OPENGL_PROFILE} CACHE STRING "OpenGL profile to use if OpenGL is enabled. Valid values are 'OpenGL' and 'OpenGLES'.")
 set_property(CACHE OPENGL_PROFILE PROPERTY STRINGS OpenGL OpenGLES)
@ -147,11 +148,13 @@ if (NOT ANDROID)
    target_link_libraries(AlberCore PUBLIC SDL2-static)
 endif()

+add_subdirectory(third_party/fmt)
 add_subdirectory(third_party/toml11)
 include_directories(${SDL2_INCLUDE_DIR})
 include_directories(third_party/toml11)
 include_directories(third_party/glm)
 include_directories(third_party/renderdoc)
+include_directories(third_party/duckstation)

 add_subdirectory(third_party/cmrc)

@ -210,6 +213,13 @@ else()
    set(HOST_ARM64 FALSE)
 endif()

+# Enable SSE4.1 on x64 hosts if it's not explicitly disabled (DISABLE_SSE4). Parentheses make the left-to-right AND/OR grouping explicit
+# Annoyingly, we can't easily do this if we're using MSVC cause there's no SSE4.1 flag, only AVX. Clang is allowed through even when MSVC is set
+if((NOT MSVC OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") AND NOT DISABLE_SSE4 AND HOST_X64)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1")
+endif()
+
 if(ENABLE_RENDERDOC_API)
    find_package(RenderDoc 1.6.0 MODULE REQUIRED)
    add_compile_definitions(PANDA3DS_ENABLE_RENDERDOC)
@ -258,13 +268,13 @@ set(SERVICE_SOURCE_FILES src/core/services/service_manager.cpp src/core/services
                         src/core/services/act.cpp src/core/services/nfc.cpp src/core/services/dlp_srvr.cpp
                         src/core/services/ir_user.cpp src/core/services/http.cpp src/core/services/soc.cpp
                         src/core/services/ssl.cpp src/core/services/news_u.cpp src/core/services/amiibo_device.cpp
-                         src/core/services/csnd.cpp src/core/services/nwm_uds.cpp
+                         src/core/services/csnd.cpp src/core/services/nwm_uds.cpp src/core/services/fonts.cpp
 )
 set(PICA_SOURCE_FILES src/core/PICA/gpu.cpp src/core/PICA/regs.cpp src/core/PICA/shader_unit.cpp
                      src/core/PICA/shader_interpreter.cpp src/core/PICA/dynapica/shader_rec.cpp
                      src/core/PICA/dynapica/shader_rec_emitter_x64.cpp src/core/PICA/pica_hash.cpp
                      src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp src/core/PICA/shader_gen_glsl.cpp
-                      src/core/PICA/shader_decompiler.cpp
+                      src/core/PICA/shader_decompiler.cpp src/core/PICA/draw_acceleration.cpp
 )

 set(LOADER_SOURCE_FILES src/core/loader/elf.cpp src/core/loader/ncsd.cpp src/core/loader/ncch.cpp src/core/loader/3dsx.cpp src/core/loader/lz77.cpp)
@ -316,14 +326,15 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
                 include/audio/miniaudio_device.hpp include/ring_buffer.hpp include/bitfield.hpp include/audio/dsp_shared_mem.hpp
                 include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp
                 include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
-                 include/sdl_sensors.hpp include/renderdoc.hpp include/audio/aac_decoder.hpp
+                 include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp include/renderdoc.hpp
+                 include/align.hpp include/audio/aac_decoder.hpp include/PICA/pica_simd.hpp include/services/fonts.hpp
 )

 cmrc_add_resource_library(
    resources_console_fonts
    NAMESPACE ConsoleFonts
    WHENCE "src/core/services/fonts/"
-    "src/core/services/fonts/CitraSharedFontUSRelocated.bin"
+    "src/core/services/fonts/SharedFontReplacement.bin"
 )

 set(THIRD_PARTY_SOURCE_FILES third_party/imgui/imgui.cpp
@ -349,7 +360,6 @@ if(ENABLE_LUAJIT AND NOT ANDROID)
 endif()

 if(ENABLE_QT_GUI)
-    include_directories(third_party/duckstation)
    set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/window_info.cpp third_party/duckstation/gl/context.cpp)

    if(APPLE)
@ -382,7 +392,7 @@ if(ENABLE_OPENGL)
    set(RENDERER_GL_INCLUDE_FILES third_party/opengl/opengl.hpp
        include/renderer_gl/renderer_gl.hpp include/renderer_gl/textures.hpp
        include/renderer_gl/surfaces.hpp include/renderer_gl/surface_cache.hpp
-        include/renderer_gl/gl_state.hpp
+        include/renderer_gl/gl_state.hpp include/renderer_gl/gl_driver.hpp
    )

    set(RENDERER_GL_SOURCE_FILES src/core/renderer_gl/renderer_gl.cpp
@ -392,6 +402,8 @@ if(ENABLE_OPENGL)
        src/host_shaders/opengl_fragment_shader.frag
    )

+    set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/gl/stream_buffer.cpp)
+
    set(HEADER_FILES ${HEADER_FILES} ${RENDERER_GL_INCLUDE_FILES})
    source_group("Source Files\\Core\\OpenGL Renderer" FILES ${RENDERER_GL_SOURCE_FILES})

@ -555,7 +567,7 @@ set(ALL_SOURCES ${SOURCE_FILES} ${FS_SOURCE_FILES} ${CRYPTO_SOURCE_FILES} ${KERN
 target_sources(AlberCore PRIVATE ${ALL_SOURCES})

 target_link_libraries(AlberCore PRIVATE dynarmic cryptopp glad resources_console_fonts teakra fdk-aac)
-target_link_libraries(AlberCore PUBLIC glad capstone)
+target_link_libraries(AlberCore PUBLIC glad capstone fmt::fmt)

 if(ENABLE_DISCORD_RPC AND NOT ANDROID)
    target_compile_definitions(AlberCore PUBLIC "PANDA3DS_ENABLE_DISCORD_RPC=1")
--- a/docs/img/KirbyAndroid.png
+++ b/docs/img/KirbyAndroid.png
--- a/include/PICA/draw_acceleration.hpp
+++ b/include/PICA/draw_acceleration.hpp
@ -0,0 +1,45 @@
+#pragma once
+
+#include <array>
+
+#include "helpers.hpp"
+
+namespace PICA {
+	struct DrawAcceleration {  // Plain-data description of one draw call. No member has an initializer, so a default-constructed instance is uninitialized
+		static constexpr u32 maxAttribCount = 16;  // Capacity of attributeInfo
+		static constexpr u32 maxLoaderCount = 12;  // Capacity of loaders
+
+		struct AttributeInfo {  // Layout information for a single vertex attribute
+			u32 offset;  // presumably in bytes — TODO confirm
+			u32 stride;  // presumably in bytes — TODO confirm
+
+			u8 type;  // NOTE(review): presumably the PICA attribute format; confirm the encoding
+			u8 componentCount;
+
+			std::array<float, 4> fixedValue;  // For fixed attributes
+		};
+
+		struct Loader {
+			// Data to upload for this loader
+			u8* data;
+			usize size;  // presumably the size of data in bytes — TODO confirm
+		};
+
+		u8* indexBuffer;
+
+		// Minimum and maximum index in the index buffer for a draw call
+		u16 minimumIndex, maximumIndex;
+		u32 totalAttribCount;  // presumably the number of valid entries in attributeInfo — TODO confirm
+		u32 totalLoaderCount;  // presumably the number of valid entries in loaders — TODO confirm
+		u32 enabledAttributeMask;  // NOTE(review): looks like a per-attribute bitmask — confirm bit meaning
+		u32 fixedAttributes;       // NOTE(review): looks like a per-attribute bitmask — confirm bit meaning
+		u32 vertexDataSize;
+
+		std::array<AttributeInfo, maxAttribCount> attributeInfo;
+		std::array<Loader, maxLoaderCount> loaders;
+
+		bool canBeAccelerated;
+		bool indexed;
+		bool useShortIndices;  // presumably true for u16 indices and false for u8 indices — TODO confirm
+	};
+}  // namespace PICA
--- a/include/PICA/dynapica/pica_recs.hpp
+++ b/include/PICA/dynapica/pica_recs.hpp
@ -2,7 +2,7 @@
 #include "helpers.hpp"
 #include "vertex_loader_rec.hpp"

-// Common file for our PICA JITs (From vertex config -> CPU assembly and from PICA shader -> CPU assembly)
+// Common file for our PICA JITs (From PICA shader -> CPU assembly)

 namespace Dynapica {
 #ifdef PANDA3DS_DYNAPICA_SUPPORTED
--- a/include/PICA/gpu.hpp
+++ b/include/PICA/gpu.hpp
@ -1,6 +1,7 @@
 #pragma once
 #include <array>

+#include "PICA/draw_acceleration.hpp"
 #include "PICA/dynapica/shader_rec.hpp"
 #include "PICA/float_types.hpp"
 #include "PICA/pica_vertex.hpp"
@ -13,6 +14,12 @@
 #include "memory.hpp"
 #include "renderer.hpp"

+enum class ShaderExecMode {  // How shaders get executed; used as a template parameter of GPU::drawArrays
+	Interpreter,  // Interpret shaders on the CPU
+	JIT,          // Recompile shaders to CPU machine code
+	Hardware,     // Recompile shaders to host shaders and run them on the GPU
+};
+
 class GPU {
 	static constexpr u32 regNum = 0x300;
 	static constexpr u32 extRegNum = 0x1000;
@ -45,7 +52,7 @@ class GPU {
 	uint immediateModeVertIndex;
 	uint immediateModeAttrIndex;  // Index of the immediate mode attribute we're uploading

-	template <bool indexed, bool useShaderJIT>
+	template <bool indexed, ShaderExecMode mode>
 	void drawArrays();

 	// Silly method of avoiding linking problems. TODO: Change to something less silly
@ -81,6 +88,7 @@ class GPU {
 	std::unique_ptr<Renderer> renderer;
 	PICA::Vertex getImmediateModeVertex();

+	void getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed);
  public:
 	// 256 entries per LUT with each LUT as its own row forming a 2D image 256 * LUT_COUNT
 	// Encoded in PICA native format
--- a/include/PICA/pica_frag_config.hpp
+++ b/include/PICA/pica_frag_config.hpp
@ -17,6 +17,7 @@ namespace PICA {
 			// enable == off means a CompareFunction of Always
 			BitField<0, 3, CompareFunction> alphaTestFunction;
 			BitField<3, 1, u32> depthMapEnable;
+			BitField<4, 4, LogicOpMode> logicOpMode;
 		};
 	};

@ -214,6 +215,10 @@ namespace PICA {
 				(alphaTestConfig & 1) ? static_cast<PICA::CompareFunction>(alphaTestFunction) : PICA::CompareFunction::Always;
 			outConfig.depthMapEnable = regs[InternalRegs::DepthmapEnable] & 1;

+			// Shows if blending is enabled. If it is not enabled, then logic ops are enabled instead
+			const bool blendingEnabled = (regs[InternalRegs::ColourOperation] & (1 << 8)) != 0;
+			outConfig.logicOpMode = blendingEnabled ? LogicOpMode::Copy : LogicOpMode(Helpers::getBits<0, 4>(regs[InternalRegs::LogicOp]));
+
 			texConfig.texUnitConfig = regs[InternalRegs::TexUnitCfg];
 			texConfig.texEnvUpdateBuffer = regs[InternalRegs::TexEnvUpdateBuffer];

--- a/include/PICA/pica_simd.hpp
+++ b/include/PICA/pica_simd.hpp
@ -0,0 +1,274 @@
+#pragma once
+#include <algorithm>
+#include <limits>
+#include <utility>
+
+#include "helpers.hpp"
+
+#if defined(_M_AMD64) || defined(__x86_64__)
+#define PICA_SIMD_X64
+#include <immintrin.h>
+#elif defined(_M_ARM64) || defined(__aarch64__)
+#define PICA_SIMD_ARM64
+#include <arm_neon.h>
+#endif
+
+// Optimized functions for analyzing PICA index buffers (Finding minimum and maximum index values inside them)
+namespace PICA::IndexBuffer {
+	// Non-SIMD, portable algorithm. Returns {min, max} over vertexCount indices, or {numeric_limits<u16>::max(), 0} when vertexCount is 0
+	template <bool useShortIndices>
+	std::pair<u16, u16> analyzePortable(u8* indexBuffer, u32 vertexCount) {
+		u16 minimumIndex = std::numeric_limits<u16>::max();
+		u16 maximumIndex = 0;
+
+		// Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them
+		if constexpr (useShortIndices) {
+			u16* indexBuffer16 = reinterpret_cast<u16*>(indexBuffer);  // NOTE(review): assumes indexBuffer is suitably aligned for u16 reads — confirm
+
+			for (u32 i = 0; i < vertexCount; i++) {
+				u16 index = indexBuffer16[i];
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+		} else {
+			for (u32 i = 0; i < vertexCount; i++) {
+				u16 index = u16(indexBuffer[i]);  // Widen u8 indices so both paths share the u16 min/max variables
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+		}
+
+		return {minimumIndex, maximumIndex};
+	}
+
+#ifdef PICA_SIMD_ARM64
+	template <bool useShortIndices>
+	std::pair<u16, u16> analyzeNEON(u8* indexBuffer, u32 vertexCount) {
+		// NEON min/max scan. We process 16 bytes per iteration, which is 8 vertices if we're using u16 indices or 16 vertices if we're using u8 indices
+		constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16;
+
+		if (vertexCount < vertsPerLoop) {  // Not enough indices to fill a single vector, use the scalar path
+			return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
+		}
+
+		u16 minimumIndex, maximumIndex;  // Assigned by whichever branch below is compiled in
+
+		if constexpr (useShortIndices) {
+			// 16-bit indices
+			uint16x8_t minima = vdupq_n_u16(0xffff);
+			uint16x8_t maxima = vdupq_n_u16(0);
+
+			while (vertexCount >= vertsPerLoop) {
+				const uint16x8_t data = vld1q_u16(reinterpret_cast<u16*>(indexBuffer));
+				minima = vminq_u16(data, minima);
+				maxima = vmaxq_u16(data, maxima);
+
+				indexBuffer += 16;
+				vertexCount -= vertsPerLoop;
+			}
+
+			// Do horizontal min/max operations to get the actual minimum and maximum from all the vertices we processed with SIMD
+			// We want to gather the actual minimum and maximum in the bottom lane of the minima/maxima vectors (3 pairwise folds for 8 lanes)
+			// uint16x4_t foldedMinima1 = vmin_u16(vget_high_u16(minima), vget_low_u16(minima));
+			// uint16x4_t foldedMaxima1 = vmax_u16(vget_high_u16(maxima), vget_low_u16(maxima));
+
+			uint16x8_t foldedMinima1 = vpminq_u16(minima, minima);
+			uint16x8_t foldedMinima2 = vpminq_u16(foldedMinima1, foldedMinima1);
+			uint16x8_t foldedMinima3 = vpminq_u16(foldedMinima2, foldedMinima2);
+
+			uint16x8_t foldedMaxima1 = vpmaxq_u16(maxima, maxima);
+			uint16x8_t foldedMaxima2 = vpmaxq_u16(foldedMaxima1, foldedMaxima1);
+			uint16x8_t foldedMaxima3 = vpmaxq_u16(foldedMaxima2, foldedMaxima2);
+
+			minimumIndex = vgetq_lane_u16(foldedMinima3, 0);
+			maximumIndex = vgetq_lane_u16(foldedMaxima3, 0);
+		} else {
+			// 8-bit indices
+			uint8x16_t minima = vdupq_n_u8(0xff);
+			uint8x16_t maxima = vdupq_n_u8(0);
+
+			while (vertexCount >= vertsPerLoop) {
+				uint8x16_t data = vld1q_u8(indexBuffer);
+				minima = vminq_u8(data, minima);
+				maxima = vmaxq_u8(data, maxima);
+
+				indexBuffer += 16;
+				vertexCount -= vertsPerLoop;
+			}
+
+			// Do a similar horizontal min/max as in the u16 case, except now we're working with uint8x16 instead of uint16x8, so we need 4 folds instead of 3
+			uint8x16_t foldedMinima1 = vpminq_u8(minima, minima);
+			uint8x16_t foldedMinima2 = vpminq_u8(foldedMinima1, foldedMinima1);
+			uint8x16_t foldedMinima3 = vpminq_u8(foldedMinima2, foldedMinima2);
+			uint8x16_t foldedMinima4 = vpminq_u8(foldedMinima3, foldedMinima3);
+
+			uint8x16_t foldedMaxima1 = vpmaxq_u8(maxima, maxima);
+			uint8x16_t foldedMaxima2 = vpmaxq_u8(foldedMaxima1, foldedMaxima1);
+			uint8x16_t foldedMaxima3 = vpmaxq_u8(foldedMaxima2, foldedMaxima2);
+			uint8x16_t foldedMaxima4 = vpmaxq_u8(foldedMaxima3, foldedMaxima3);
+
+			minimumIndex = u16(vgetq_lane_u8(foldedMinima4, 0));
+			maximumIndex = u16(vgetq_lane_u8(foldedMaxima4, 0));
+		}
+
+		// If any indices could not be processed cause the buffer size is not a multiple of 16 bytes, process them the naive way
+		// This folds the leftover indices into the minimum and maximum computed by the SIMD loop above
+		while (vertexCount > 0) {
+			if constexpr (useShortIndices) {
+				u16 index = *reinterpret_cast<u16*>(indexBuffer);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+				indexBuffer += 2;
+			} else {
+				u16 index = u16(*indexBuffer++);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+
+			vertexCount -= 1;
+		}
+
+		return {minimumIndex, maximumIndex};
+	}
+#endif
+
+#if defined(PICA_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
+	template <bool useShortIndices>
+	std::pair<u16, u16> analyzeSSE4_1(u8* indexBuffer, u32 vertexCount) {
+		// SSE4.1 min/max scan. We process 16 bytes per iteration, which is 8 vertices if we're using u16
+		// indices or 16 vertices if we're using u8 indices
+		constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16;
+
+		if (vertexCount < vertsPerLoop) {  // Not enough indices to fill a single vector, use the scalar path
+			return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
+		}
+
+		u16 minimumIndex, maximumIndex;  // Assigned by whichever branch below is compiled in
+
+		if constexpr (useShortIndices) {
+			// Calculate the horizontal minimum/maximum value across an SSE vector of 16-bit unsigned integers.
+			// Based on https://stackoverflow.com/a/22259607. The u16 cast keeps only the minimum and drops the position minpos stores above it
+			auto horizontalMin16 = [](__m128i vector) -> u16 { return u16(_mm_cvtsi128_si32(_mm_minpos_epu16(vector))); };
+
+			auto horizontalMax16 = [](__m128i vector) -> u16 {
+				// We have an instruction to compute horizontal minimum but not maximum, so we use it.
+				// To use it, we have to subtract each value from 0xFFFF (which we do with an xor), then execute a horizontal minimum
+				__m128i flipped = _mm_xor_si128(vector, _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu));
+				u16 min = u16(_mm_cvtsi128_si32(_mm_minpos_epu16(flipped)));
+				return u16(min ^ 0xffff);  // Undo the flip to turn the minimum of the flipped values back into the maximum
+			};
+
+			// 16-bit indices
+			// Initialize the minima vector to all FFs (So 0xFFFF for each 16-bit lane)
+			// And the maxima vector to all 0s (0 for each 16-bit lane)
+			__m128i minima = _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu);
+			__m128i maxima = _mm_set_epi32(0, 0, 0, 0);
+
+			while (vertexCount >= vertsPerLoop) {
+				const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(indexBuffer));
+				minima = _mm_min_epu16(data, minima);
+				maxima = _mm_max_epu16(data, maxima);
+
+				indexBuffer += 16;
+				vertexCount -= vertsPerLoop;
+			}
+
+			minimumIndex = u16(horizontalMin16(minima));
+			maximumIndex = u16(horizontalMax16(maxima));
+		} else {
+			// Calculate the horizontal minimum/maximum value across an SSE vector of 8-bit unsigned integers.
+			// Based on https://stackoverflow.com/a/22259607. Each step folds the upper half of the remaining lanes onto the lower half
+			auto horizontalMin8 = [](__m128i vector) -> u8 {
+				vector = _mm_min_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(3, 2, 3, 2)));
+				vector = _mm_min_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(1, 1, 1, 1)));
+				vector = _mm_min_epu8(vector, _mm_shufflelo_epi16(vector, _MM_SHUFFLE(1, 1, 1, 1)));
+				vector = _mm_min_epu8(vector, _mm_srli_epi16(vector, 8));
+				return u8(_mm_cvtsi128_si32(vector));
+			};
+
+			auto horizontalMax8 = [](__m128i vector) -> u8 {
+				vector = _mm_max_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(3, 2, 3, 2)));
+				vector = _mm_max_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(1, 1, 1, 1)));
+				vector = _mm_max_epu8(vector, _mm_shufflelo_epi16(vector, _MM_SHUFFLE(1, 1, 1, 1)));
+				vector = _mm_max_epu8(vector, _mm_srli_epi16(vector, 8));
+				return u8(_mm_cvtsi128_si32(vector));
+			};
+
+			// 8-bit indices
+			// Initialize the minima vector to all FFs (So 0xFF for each 8-bit lane)
+			// And the maxima vector to all 0s (0 for each 8-bit lane)
+			__m128i minima = _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu);
+			__m128i maxima = _mm_set_epi32(0, 0, 0, 0);
+
+			while (vertexCount >= vertsPerLoop) {
+				const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(indexBuffer));
+				minima = _mm_min_epu8(data, minima);
+				maxima = _mm_max_epu8(data, maxima);
+
+				indexBuffer += 16;
+				vertexCount -= vertsPerLoop;
+			}
+
+			minimumIndex = u16(horizontalMin8(minima));
+			maximumIndex = u16(horizontalMax8(maxima));
+		}
+
+		// If any indices could not be processed cause the buffer size
+		// is not a multiple of 16 bytes, process them the naive way
+		// This folds the leftover indices into the minimum and
+		// maximum computed by the SIMD loop above
+		while (vertexCount > 0) {
+			if constexpr (useShortIndices) {
+				u16 index = *reinterpret_cast<u16*>(indexBuffer);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+				indexBuffer += 2;
+			} else {
+				u16 index = u16(*indexBuffer++);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+
+			vertexCount -= 1;
+		}
+
+		return {minimumIndex, maximumIndex};
+	}
+#endif
+
+	// Analyzes a PICA index buffer to get the minimum and maximum indices in the
+	// buffer, and returns them in a pair in the form [min, max]. Takes a template
+	// parameter to decide whether the indices in the buffer are u8 or u16. Picks the NEON, SSE4.1 or portable implementation at compile time
+	template <bool useShortIndices>
+	std::pair<u16, u16> analyze(u8* indexBuffer, u32 vertexCount) {
+#if defined(PICA_SIMD_ARM64)
+		return analyzeNEON<useShortIndices>(indexBuffer, vertexCount);
+#elif defined(PICA_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
+		// Annoyingly, MSVC refuses to define __SSE4_1__ even when we're building with AVX, hence the extra __AVX__ check
+		return analyzeSSE4_1<useShortIndices>(indexBuffer, vertexCount);
+#else
+		return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
+#endif
+	}
+
+	// In some really unfortunate scenarios (eg Android Studio emulator), we don't have access to glDrawRangeElementsBaseVertex
+	// So we need to subtract the base vertex index from every index in the index buffer ourselves
+	// This is not really common, so we do it without SIMD for the moment, just to be able to run on Android Studio
+	template <bool useShortIndices>
+	void subtractBaseIndex(u8* indexBuffer, u32 indexCount, u16 baseIndex) {
+		// Subtract baseIndex from each of the indexCount indices, modifying the index buffer in place
+		if constexpr (useShortIndices) {
+			u16* indexBuffer16 = reinterpret_cast<u16*>(indexBuffer);
+
+			for (u32 i = 0; i < indexCount; i++) {
+				indexBuffer16[i] -= baseIndex;
+			}
+		} else {
+			u8 baseIndex8 = u8(baseIndex);  // 8-bit indices: only the low 8 bits of baseIndex are used
+
+			for (u32 i = 0; i < indexCount; i++) {
+				indexBuffer[i] -= baseIndex8;
+			}
+		}
+	}
+}  // namespace PICA::IndexBuffer
--- a/include/PICA/pica_vert_config.hpp
+++ b/include/PICA/pica_vert_config.hpp
@ -0,0 +1,57 @@
+#pragma once
+#include <array>
+#include <cassert>
+#include <cstring>
+#include <type_traits>
+#include <unordered_map>
+
+#include "PICA/pica_hash.hpp"
+#include "PICA/regs.hpp"
+#include "PICA/shader.hpp"
+#include "bitfield.hpp"
+#include "helpers.hpp"
+
+namespace PICA {
+	// Configuration struct used as a std::unordered_map key. It is hashed and compared as raw bytes, so it must not contain implicit padding
+	struct VertConfig {
+		PICAHash::HashType shaderHash;
+		PICAHash::HashType opdescHash;
+		u32 entrypoint;
+
+		// PICA registers for configuring shader output->fragment semantic mapping
+		std::array<u32, 7> outmaps{};
+		u16 outputMask;
+		u8 outputCount;
+		bool usingUbershader;
+
+		// Pad to 56 bytes so that the compiler won't insert unnecessary padding, which in turn will affect our unordered_map lookup
+		// As the padding will get hashed and memcmp'd...
+		u32 pad{};
+
+		bool operator==(const VertConfig& config) const {
+			// Equality operator required by std::unordered_map. Compares the raw bytes of both structs
+			return std::memcmp(this, &config, sizeof(VertConfig)) == 0;
+		}
+
+		VertConfig(PICAShader& shader, const std::array<u32, 0x300>& regs, bool usingUbershader) : usingUbershader(usingUbershader) {
+			shaderHash = shader.getCodeHash();
+			opdescHash = shader.getOpdescHash();
+			entrypoint = shader.entrypoint;
+
+			outputCount = regs[PICA::InternalRegs::ShaderOutputCount] & 7;  // At most 7, which matches the size of outmaps
+			outputMask = regs[PICA::InternalRegs::VertexShaderOutputMask];  // Implicitly truncated from u32 to u16
+			for (int i = 0; i < outputCount; i++) {
+				// Mask out unused bits. Entries at outputCount and above keep their zero initialization
+				outmaps[i] = regs[PICA::InternalRegs::ShaderOutmap0 + i] & 0x1F1F1F1F;
+			}
+		}
+	};
+}  // namespace PICA
+
+static_assert(sizeof(PICA::VertConfig) == 56);
+
+// Specialize std::hash for our vertex config class. The hash is computed over the raw bytes of the whole struct
+template <>
+struct std::hash<PICA::VertConfig> {
+	std::size_t operator()(const PICA::VertConfig& config) const noexcept { return PICAHash::computeHash((const char*)&config, sizeof(config)); }
+};
--- a/include/PICA/regs.hpp
+++ b/include/PICA/regs.hpp
@ -396,6 +396,25 @@ namespace PICA {
 		GreaterOrEqual = 7,
 	};

+	enum class LogicOpMode : u32 {  // 16 logic ops (0-15); pica_frag_config.hpp casts the low 4 bits of the LogicOp register to this type
+		Clear = 0,
+		And = 1,
+		ReverseAnd = 2,
+		Copy = 3,  // Also the mode pica_frag_config.hpp selects whenever blending is enabled
+		Set = 4,
+		InvertedCopy = 5,
+		Nop = 6,
+		Invert = 7,
+		Nand = 8,
+		Or = 9,
+		Nor = 10,
+		Xor = 11,
+		Equiv = 12,
+		InvertedAnd = 13,
+		ReverseOr = 14,
+		InvertedOr = 15,
+	};
+
 	enum class FogMode : u32 {
 		Disabled = 0,
 		Fog = 5,
--- a/include/PICA/shader.hpp
+++ b/include/PICA/shader.hpp
@ -107,6 +107,11 @@ class PICAShader {
 	alignas(16) std::array<vec4f, 16> inputs;           // Attributes passed to the shader
 	alignas(16) std::array<vec4f, 16> outputs;
 	alignas(16) vec4f dummy = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()});  // Dummy register used by the JIT
+	
+	// We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
+	// We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
+	// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
+	using Hash = PICAHash::HashType;

  protected:
 	std::array<u32, 128> operandDescriptors;
@ -125,14 +130,13 @@ class PICAShader {
 	std::array<CallInfo, 4> callInfo;
 	ShaderType type;

-	// We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
-	// We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
-	// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
-	using Hash = PICAHash::HashType;
-
 	Hash lastCodeHash = 0;    // Last hash computed for the shader code (Used for the JIT caching mechanism)
 	Hash lastOpdescHash = 0;  // Last hash computed for the operand descriptors (Also used for the JIT)

+  public:
+	bool uniformsDirty = false;
+
+  protected:
 	bool codeHashDirty = false;
 	bool opdescHashDirty = false;

@ -284,6 +288,7 @@ class PICAShader {
 				uniform[2] = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16));
 				uniform[3] = f24::fromRaw(floatUniformBuffer[0] >> 8);
 			}
+			uniformsDirty = true;
 		}
 	}

@ -295,6 +300,12 @@ class PICAShader {
 		u[1] = getBits<8, 8>(word);
 		u[2] = getBits<16, 8>(word);
 		u[3] = getBits<24, 8>(word);
+		uniformsDirty = true;
+	}
+
+	void uploadBoolUniform(u32 value) {  // Overwrites the whole boolUniform word with value
+		boolUniform = value;
+		uniformsDirty = true;  // Flag the change like the other uniform upload paths do
+	}

 	void run();
@ -302,6 +313,10 @@ class PICAShader {

 	Hash getCodeHash();
 	Hash getOpdescHash();
+
+	// Returns how big the PICA uniforms are combined, in bytes (sum of sizeof). Used for hw accelerated shaders where we upload the uniforms to our GPU.
+	static constexpr usize totalUniformSize() { return sizeof(floatUniforms) + sizeof(intUniforms) + sizeof(boolUniform); }
+	void* getUniformPointer() { return static_cast<void*>(&floatUniforms); }  // NOTE(review): reading totalUniformSize() bytes from here assumes floatUniforms, intUniforms and boolUniform are contiguous in that order — confirm
 };

 static_assert(
--- a/include/PICA/shader_decompiler.hpp
+++ b/include/PICA/shader_decompiler.hpp
@ -1,8 +1,11 @@
 #pragma once
+#include <fmt/format.h>
+
+#include <map>
 #include <set>
 #include <string>
 #include <tuple>
-#include <map>
+#include <utility>
 #include <vector>

 #include "PICA/shader.hpp"
@ -41,9 +44,12 @@ namespace PICA::ShaderGen {
 			explicit Function(u32 start, u32 end) : start(start), end(end) {}
 			bool operator<(const Function& other) const { return AddressRange(start, end) < AddressRange(other.start, other.end); }

-			std::string getIdentifier() const { return "func_" + std::to_string(start) + "_to_" + std::to_string(end); }
-			std::string getForwardDecl() const { return "void " + getIdentifier() + "();\n"; }
-			std::string getCallStatement() const { return getIdentifier() + "()"; }
+			std::string getIdentifier() const { return fmt::format("fn_{}_{}", start, end); }
+			// To handle weird control flow, we have to return from each function a bool that indicates whether or not the shader reached an end
+			// instruction and should thus terminate. This is necessary for games like Rayman and Gravity Falls, which have "END" instructions called
+			// from within functions deep in the callstack
+			std::string getForwardDecl() const { return fmt::format("bool fn_{}_{}();\n", start, end); }
+			std::string getCallStatement() const { return fmt::format("fn_{}_{}()", start, end); }
 		};

 		std::set<Function> functions{};
@ -93,9 +99,11 @@ namespace PICA::ShaderGen {

 		API api;
 		Language language;
+		bool compilationError = false;

 		void compileInstruction(u32& pc, bool& finished);
-		void compileRange(const AddressRange& range);
+		// Compiles the given range and returns a pair of the end PC and whether we're "finished" with the program (i.e. an END instruction was hit)
+		std::pair<u32, bool> compileRange(const AddressRange& range);
 		void callFunction(const Function& function);
 		const Function* findFunction(const AddressRange& range);

@ -105,6 +113,7 @@ namespace PICA::ShaderGen {
 		std::string getDest(u32 dest) const;
 		std::string getSwizzlePattern(u32 swizzle) const;
 		std::string getDestSwizzle(u32 destinationMask) const;
+		const char* getCondition(u32 cond, u32 refX, u32 refY);

 		void setDest(u32 operandDescriptor, const std::string& dest, const std::string& value);
 		// Returns if the instruction uses the typical register encodings most instructions use
--- a/include/PICA/shader_gen.hpp
+++ b/include/PICA/shader_gen.hpp
@ -3,6 +3,7 @@

 #include "PICA/gpu.hpp"
 #include "PICA/pica_frag_config.hpp"
+#include "PICA/pica_vert_config.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen_types.hpp"
 #include "helpers.hpp"
@ -25,11 +26,14 @@ namespace PICA::ShaderGen {
 		bool isSamplerEnabled(u32 environmentID, u32 lutID);

 		void compileFog(std::string& shader, const PICA::FragmentConfig& config);
+		void compileLogicOps(std::string& shader, const PICA::FragmentConfig& config);

 	  public:
 		FragmentGenerator(API api, Language language) : api(api), language(language) {}
-		std::string generate(const PICA::FragmentConfig& config);
+		std::string generate(const PICA::FragmentConfig& config, void* driverInfo = nullptr);
 		std::string getDefaultVertexShader();
+		// For when PICA shader acceleration is enabled. Turns the PICA shader source into a proper vertex shader
+		std::string getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader);

 		void setTarget(API api, Language language) {
 			this->api = api;
--- a/include/PICA/shader_unit.hpp
+++ b/include/PICA/shader_unit.hpp
@ -2,10 +2,9 @@
 #include "PICA/shader.hpp"

 class ShaderUnit {
-
-public:
-	PICAShader vs; // Vertex shader
-	PICAShader gs; // Geometry shader
+  public:
+	PICAShader vs;  // Vertex shader
+	PICAShader gs;  // Geometry shader

 	ShaderUnit() : vs(ShaderType::Vertex), gs(ShaderType::Geometry) {}
 	void reset();
--- a/include/align.hpp
+++ b/include/align.hpp
@ -0,0 +1,99 @@
+// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#pragma once
+
+#include <cstdlib>
+
+#include "helpers.hpp"
+
+#ifdef _WIN32
+#include <malloc.h>
+#endif
+
+namespace Common {
+	// Returns whether "value" is a multiple of "alignment". The alignment can be any non-zero value, not just a power of 2
+	template <typename T>
+	constexpr bool isAligned(T value, unsigned int alignment) {
+		return (value % static_cast<T>(alignment)) == 0;
+	}
+
+	// Rounds "value" up to the nearest multiple of "alignment" (any non-zero alignment)
+	template <typename T>
+	constexpr T alignUp(T value, unsigned int alignment) {
+		return (value + static_cast<T>(alignment - 1)) / static_cast<T>(alignment) * static_cast<T>(alignment);
+	}
+
+	// Rounds "value" down to the nearest multiple of "alignment" (any non-zero alignment)
+	template <typename T>
+	constexpr T alignDown(T value, unsigned int alignment) {
+		return value / static_cast<T>(alignment) * static_cast<T>(alignment);
+	}
+
+	// Same as isAligned, but uses a bitmask instead of a modulo. Only correct when "alignment" is a power of 2
+	template <typename T>
+	constexpr bool isAlignedPow2(T value, unsigned int alignment) {
+		return (value & static_cast<T>(alignment - 1)) == 0;
+	}
+
+	// Same as alignUp, but uses a bitmask instead of a division. Only correct when "alignment" is a power of 2
+	template <typename T>
+	constexpr T alignUpPow2(T value, unsigned int alignment) {
+		return (value + static_cast<T>(alignment - 1)) & static_cast<T>(~static_cast<T>(alignment - 1));
+	}
+
+	// Same as alignDown, but uses a bitmask instead of a division. Only correct when "alignment" is a power of 2
+	template <typename T>
+	constexpr T alignDownPow2(T value, unsigned int alignment) {
+		return value & static_cast<T>(~static_cast<T>(alignment - 1));
+	}
+
+	// Returns whether at most one bit of "value" is set. Note that this also reports 0 as a power of 2
+	template <typename T>
+	constexpr bool isPow2(T value) {
+		return (value & (value - 1)) == 0;
+	}
+
+	// Returns the largest power of 2 that is less than or equal to "value", or 0 if "value" is 0
+	template <typename T>
+	constexpr T previousPow2(T value) {
+		if (value == static_cast<T>(0)) return 0;
+
+		// Smear the highest set bit into every lower bit position. The shifts wider than 4 are only needed (and only valid) for wider types.
+		// sizeof is measured in bytes, so the thresholds are 2, 4 and 8 bytes for 16, 32 and 64-bit types respectively
+		value |= (value >> 1);
+		value |= (value >> 2);
+		value |= (value >> 4);
+		if constexpr (sizeof(T) >= 2) value |= (value >> 8);
+		if constexpr (sizeof(T) >= 4) value |= (value >> 16);
+		if constexpr (sizeof(T) >= 8) value |= (value >> 32);
+		// Every bit below the top one is now set, so subtracting the lower half leaves just the top bit
+		return value - (value >> 1);
+	}
+
+	// Returns the smallest power of 2 that is greater than or equal to "value", or 0 if "value" is 0
+	template <typename T>
+	constexpr T nextPow2(T value) {
+		// https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+		if (value == static_cast<T>(0)) return 0;
+
+		// Decrement first so that exact powers of 2 map to themselves, smear the top bit downwards, then add 1 to carry into the next power
+		value--;
+		value |= (value >> 1);
+		value |= (value >> 2);
+		value |= (value >> 4);
+		// sizeof is measured in bytes: 2, 4 and 8 bytes for 16, 32 and 64-bit types respectively
+		if constexpr (sizeof(T) >= 2) value |= (value >> 8);
+		if constexpr (sizeof(T) >= 4) value |= (value >> 16);
+		if constexpr (sizeof(T) >= 8) value |= (value >> 32);
+		value++;
+		return value;
+	}
+
+	// Allocates "size" bytes aligned to "alignment". Returns nullptr on failure. The result must be released with alignedFree
+	ALWAYS_INLINE static void* alignedMalloc(size_t size, size_t alignment) {
+#ifdef _WIN32
+		return _aligned_malloc(size, alignment);
+#else
+		// Unaligned sizes are slow on macOS.
+#ifdef __APPLE__
+		if (isPow2(alignment)) size = (size + alignment - 1) & ~(alignment - 1);
+#endif
+		void* ret = nullptr;
+		return (posix_memalign(&ret, alignment, size) == 0) ? ret : nullptr;
+#endif
+	}
+
+	// Releases memory obtained from alignedMalloc
+	ALWAYS_INLINE static void alignedFree(void* ptr) {
+		// This check must match the one in alignedMalloc: memory from _aligned_malloc may only be released with _aligned_free,
+		// including on Windows toolchains that define _WIN32 but not _MSC_VER
+#ifdef _WIN32
+		_aligned_free(ptr);
+#else
+		free(ptr);
+#endif
+	}
+}  // namespace Common
--- a/include/audio/dsp_shared_mem.hpp
+++ b/include/audio/dsp_shared_mem.hpp
@ -324,8 +324,8 @@ namespace Audio::HLE {
 			BitField<15, 1, u32> outputBufferCountDirty;
 			BitField<16, 1, u32> masterVolumeDirty;

-			BitField<24, 1, u32> auxReturnVolume0Dirty;
-			BitField<25, 1, u32> auxReturnVolume1Dirty;
+			BitField<24, 1, u32> auxVolume0Dirty;
+			BitField<25, 1, u32> auxVolume1Dirty;
 			BitField<26, 1, u32> outputFormatDirty;
 			BitField<27, 1, u32> clippingModeDirty;
 			BitField<28, 1, u32> headphonesConnectedDirty;
@ -337,7 +337,7 @@ namespace Audio::HLE {
 		/// The DSP has three intermediate audio mixers. This controls the volume level (0.0-1.0) for
 		/// each at the final mixer.
 		float_le masterVolume;
-		std::array<float_le, 2> auxReturnVolume;
+		std::array<float_le, 2> auxVolumes;

 		u16_le outputBufferCount;
 		u16 pad1[2];
@ -422,7 +422,7 @@ namespace Audio::HLE {

 	struct DspStatus {
 		u16_le unknown;
-		u16_le dropped_frames;
+		u16_le droppedFrames;
 		u16 pad0[0xE];
 	};
 	ASSERT_DSP_STRUCT(DspStatus, 32);
--- a/include/audio/hle_core.hpp
+++ b/include/audio/hle_core.hpp
@ -95,8 +95,7 @@ namespace Audio {
 		DSPSource() { reset(); }
 	};

-	class HLE_DSP : public DSPCore {
-		// The audio frame types are public in case we want to use them for unit tests
+	class DSPMixer {
 	  public:
 		template <typename T, usize channelCount = 1>
 		using Sample = std::array<T, channelCount>;
@ -113,6 +112,43 @@ namespace Audio {
 		template <typename T>
 		using QuadFrame = Frame<T, 4>;

+	  private:
+		using ChannelFormat = HLE::DspConfiguration::OutputFormat;
+		// The audio from each DSP voice is converted to quadraphonic and then fed into 3 intermediate mixing stages
+		// Two of these intermediate mixers (second and third) are used for effects, including custom effects done on the CPU
+		static constexpr usize mixerStageCount = 3;
+
+	  public:
+		ChannelFormat channelFormat = ChannelFormat::Stereo;
+		std::array<float, mixerStageCount> volumes;
+		std::array<bool, 2> enableAuxStages;
+
+		void reset() {
+			channelFormat = ChannelFormat::Stereo;
+
+			volumes.fill(0.0);
+			enableAuxStages.fill(false);
+		}
+	};
+
+	class HLE_DSP : public DSPCore {
+		// The audio frame types are public in case we want to use them for unit tests
+	  public:
+		template <typename T, usize channelCount = 1>
+		using Sample = DSPMixer::Sample<T, channelCount>;
+
+		template <typename T, usize channelCount>
+		using Frame = DSPMixer::Frame<T, channelCount>;
+
+		template <typename T>
+		using MonoFrame = DSPMixer::MonoFrame<T>;
+
+		template <typename T>
+		using StereoFrame = DSPMixer::StereoFrame<T>;
+
+		template <typename T>
+		using QuadFrame = DSPMixer::QuadFrame<T>;
+
 		using Source = Audio::DSPSource;
 		using SampleBuffer = Source::SampleBuffer;

@ -131,6 +167,7 @@ namespace Audio {
 		std::array<Source, Audio::HLE::sourceCount> sources;  // DSP voices
 		Audio::HLE::DspMemory dspRam;

+		Audio::DSPMixer mixer;
 		std::unique_ptr<Audio::AAC::Decoder> aacDecoder;

 		void resetAudioPipe();
@ -175,10 +212,13 @@ namespace Audio {

 		void handleAACRequest(const AAC::Message& request);
 		void updateSourceConfig(Source& source, HLE::SourceConfiguration::Configuration& config, s16_le* adpcmCoefficients);
+		void updateMixerConfig(HLE::SharedMemory& sharedMem);
 		void generateFrame(StereoFrame<s16>& frame);
 		void generateFrame(DSPSource& source);
 		void outputFrame();
-
+		// Perform the final mix, mixing the quadraphonic samples from all voices into the output audio frame
+		void performMix(Audio::HLE::SharedMemory& readRegion, Audio::HLE::SharedMemory& writeRegion);
+		
 		// Decode an entire buffer worth of audio
 		void decodeBuffer(DSPSource& source);

--- a/include/config.hpp
+++ b/include/config.hpp
@ -20,18 +20,20 @@ struct EmulatorConfig {
 #else
 	static constexpr bool ubershaderDefault = true;
 #endif
-
+	static constexpr bool accelerateShadersDefault = true;
+	
 	bool shaderJitEnabled = shaderJitDefault;
-	bool discordRpcEnabled = false;
 	bool useUbershaders = ubershaderDefault;
+	bool accelerateShaders = accelerateShadersDefault;
 	bool accurateShaderMul = false;
+	bool discordRpcEnabled = false;

 	// Toggles whether to force shadergen when there's more than N lights active and we're using the ubershader, for better performance
 	bool forceShadergenForLights = true;
 	int lightShadergenThreshold = 1;

 	RendererType rendererType = RendererType::OpenGL;
-	Audio::DSPCore::Type dspType = Audio::DSPCore::Type::Null;
+	Audio::DSPCore::Type dspType = Audio::DSPCore::Type::HLE;

 	bool sdCardInserted = true;
 	bool sdWriteProtected = false;
--- a/include/memory.hpp
+++ b/include/memory.hpp
@ -298,5 +298,5 @@ private:

 	bool allocateMainThreadStack(u32 size);
 	Regions getConsoleRegion();
-	void copySharedFont(u8* ptr);
+	void copySharedFont(u8* ptr, u32 vaddr);
 };
--- a/include/renderdoc.hpp
+++ b/include/renderdoc.hpp
@ -35,4 +35,35 @@ namespace Renderdoc {
 	static void setOutputDir(const std::string& path, const std::string& prefix) {}
 	static constexpr bool isSupported() { return false; }
 }  // namespace Renderdoc
-#endif
+#endif
+
+namespace Renderdoc {
+	// RAII scope class that encloses a Renderdoc capture, as long as it's triggered by triggerCapture
+	// Calls startCapture on construction and endCapture on destruction
+	struct Scope {
+		Scope() { Renderdoc::startCapture(); }
+		~Scope() { Renderdoc::endCapture(); }
+
+		// Scopes are neither copyable nor movable, so each start is paired with exactly one end
+		Scope(const Scope&) = delete;
+		Scope& operator=(const Scope&) = delete;
+
+		Scope(Scope&&) = delete;
+		Scope& operator=(Scope&&) = delete;
+	};
+
+	// RAII scope class that encloses a Renderdoc capture. Unlike regular Scope it doesn't wait for a trigger, it will always issue the capture
+	// trigger on its own and take a capture
+	struct InstantScope {
+		InstantScope() {
+			Renderdoc::triggerCapture();
+			Renderdoc::startCapture();
+		}
+
+		~InstantScope() { Renderdoc::endCapture(); }
+
+		// Neither copyable nor movable, for the same reason as Scope
+		InstantScope(const InstantScope&) = delete;
+		InstantScope& operator=(const InstantScope&) = delete;
+
+		InstantScope(InstantScope&&) = delete;
+		InstantScope& operator=(InstantScope&&) = delete;
+	};
+}  // namespace Renderdoc
--- a/include/renderer.hpp
+++ b/include/renderer.hpp
@ -1,9 +1,10 @@
 #pragma once
 #include <array>
+#include <optional>
 #include <span>
 #include <string>
-#include <optional>

+#include "PICA/draw_acceleration.hpp"
 #include "PICA/pica_vertex.hpp"
 #include "PICA/regs.hpp"
 #include "helpers.hpp"
@ -22,9 +23,11 @@ enum class RendererType : s8 {
 };

 struct EmulatorConfig;
-class GPU;
 struct SDL_Window;

+class GPU;
+class ShaderUnit;
+
 class Renderer {
  protected:
 	GPU& gpu;
@ -78,7 +81,11 @@ class Renderer {
 	virtual std::string getUbershader() { return ""; }
 	virtual void setUbershader(const std::string& shader) {}

-	virtual void setUbershaderSetting(bool value) {}
+	// This function is called on every draw call before parsing vertex data.
+	// It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between
+	// ubershaders and shadergen, and so on.
+	// Returns whether this draw is eligible for using hardware-accelerated shaders or if shaders should run on the CPU
+	virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) { return false; }

 	// Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window
 #ifdef PANDA3DS_FRONTEND_QT
--- a/include/renderer_gl/gl_driver.hpp
+++ b/include/renderer_gl/gl_driver.hpp
@ -0,0 +1,12 @@
+#pragma once
+
+// Information about our OpenGL/OpenGL ES driver that we should keep track of
+// Stuff like whether specific extensions are supported, and potentially things like OpenGL context information
+namespace OpenGL {
+	// Capabilities we've recorded for the active OpenGL/OpenGL ES driver
+	struct Driver {
+		// Framebuffer fetch support flags, one per extension variant as the names indicate. Both default to unsupported
+		bool supportsExtFbFetch{false};
+		bool supportsArmFbFetch{false};
+
+		// Returns true when at least one of the framebuffer fetch flags above is set
+		bool supportFbFetch() const {
+			if (supportsExtFbFetch) {
+				return true;
+			}
+
+			return supportsArmFbFetch;
+		}
+	};
+}  // namespace OpenGL
--- a/include/renderer_gl/gl_state.hpp
+++ b/include/renderer_gl/gl_state.hpp
@ -38,7 +38,6 @@ struct GLStateManager {
 	
 	GLuint stencilMask;
 	GLuint boundVAO;
-	GLuint boundVBO;
 	GLuint currentProgram;
 	GLuint boundUBO;

@ -173,13 +172,6 @@ struct GLStateManager {
 		}
 	}

-	void bindVBO(GLuint handle) {
-		if (boundVBO != handle) {
-			boundVBO = handle;
-			glBindBuffer(GL_ARRAY_BUFFER, handle);
-		}
-	}
-
 	void useProgram(GLuint handle) {
 		if (currentProgram != handle) {
 			currentProgram = handle;
@ -195,7 +187,6 @@ struct GLStateManager {
 	}

 	void bindVAO(const OpenGL::VertexArray& vao) { bindVAO(vao.handle()); }
-	void bindVBO(const OpenGL::VertexBuffer& vbo) { bindVBO(vbo.handle()); }
 	void useProgram(const OpenGL::Program& program) { useProgram(program.handle()); }

 	void setColourMask(bool r, bool g, bool b, bool a) {
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@ -3,15 +3,21 @@
 #include <array>
 #include <cstring>
 #include <functional>
+#include <memory>
+#include <optional>
 #include <span>
 #include <unordered_map>
+#include <utility>

 #include "PICA/float_types.hpp"
 #include "PICA/pica_frag_config.hpp"
 #include "PICA/pica_hash.hpp"
+#include "PICA/pica_vert_config.hpp"
 #include "PICA/pica_vertex.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen.hpp"
+#include "gl/stream_buffer.h"
+#include "gl_driver.hpp"
 #include "gl_state.hpp"
 #include "helpers.hpp"
 #include "logger.hpp"
@ -28,9 +34,11 @@ class RendererGL final : public Renderer {
 	OpenGL::Program triangleProgram;
 	OpenGL::Program displayProgram;

-	OpenGL::VertexArray vao;
+	// VAO for when not using accelerated vertex shaders. Contains attribute declarations matching to the PICA fixed function fragment attributes
+	OpenGL::VertexArray defaultVAO;
+	// VAO for when using accelerated vertex shaders. The PICA vertex shader inputs are passed as attributes without CPU processing.
+	OpenGL::VertexArray hwShaderVAO;
 	OpenGL::VertexBuffer vbo;
-	bool enableUbershader = true;

 	// Data 
 	struct {
@ -53,6 +61,21 @@ class RendererGL final : public Renderer {
 	float oldDepthScale = -1.0;
 	float oldDepthOffset = 0.0;
 	bool oldDepthmapEnable = false;
+	// Set by prepareForDraw, tells us whether the current draw is using hw-accelerated shader
+	bool usingAcceleratedShader = false;
+	bool performIndexedRender = false;
+	bool usingShortIndices = false;
+
+	// Set by prepareForDraw, metadata for indexed renders
+	GLuint minimumIndex = 0;
+	GLuint maximumIndex = 0;
+	void* hwIndexBufferOffset = nullptr;
+
+	// When doing hw shaders, we cache which attributes are enabled in our VAO to avoid having to enable/disable all attributes on each draw
+	u32 previousAttributeMask = 0;
+
+	// Cached pointer to the current vertex shader when using HW accelerated shaders
+	OpenGL::Shader* generatedVertexShader = nullptr;

 	SurfaceCache<DepthBuffer, 16, true> depthBufferCache;
 	SurfaceCache<ColourBuffer, 16, true> colourBufferCache;
@ -70,18 +93,58 @@ class RendererGL final : public Renderer {
 	// We can compile this once and then link it with all other generated fragment shaders
 	OpenGL::Shader defaultShadergenVs;
 	GLuint shadergenFragmentUBO;
+	// UBO for uploading the PICA uniforms when using hw shaders
+	GLuint hwShaderUniformUBO;
+
+	using StreamBuffer = OpenGLStreamBuffer;
+	std::unique_ptr<StreamBuffer> hwVertexBuffer;
+	std::unique_ptr<StreamBuffer> hwIndexBuffer;
+
+	// Cache of fixed attribute values so that we don't do any duplicate updates
+	std::array<std::array<float, 4>, 16> fixedAttrValues;

 	// Cached recompiled fragment shader
 	struct CachedProgram {
 		OpenGL::Program program;
 	};
-	std::unordered_map<PICA::FragmentConfig, CachedProgram> shaderCache;
+
+	// Bundles the caches for generated shaders: the compiled vertex and fragment shader objects, and the programs linked from them
+	struct ShaderCache {
+		std::unordered_map<PICA::VertConfig, std::optional<OpenGL::Shader>> vertexShaderCache;
+		std::unordered_map<PICA::FragmentConfig, OpenGL::Shader> fragmentShaderCache;
+
+		// Linked programs, keyed by a u64 built out of the two shader handles:
+		// the vertex shader GLuint goes in the top 32 bits and the fragment shader GLuint in the bottom 32 bits
+		std::unordered_map<u64, CachedProgram> programCache;
+
+		// Frees every cached program and shader object, then empties all three maps
+		void clear() {
+			for (auto& [key, cached] : programCache) {
+				cached.program.free();
+			}
+
+			for (auto& [config, shader] : vertexShaderCache) {
+				// Vertex shader entries are optional, so only free the ones that actually hold a shader
+				if (shader) {
+					shader->free();
+				}
+			}
+
+			for (auto& [config, shader] : fragmentShaderCache) {
+				shader.free();
+			}
+
+			programCache.clear();
+			vertexShaderCache.clear();
+			fragmentShaderCache.clear();
+		}
+	};
+	ShaderCache shaderCache;

 	OpenGL::Framebuffer getColourFBO();
 	OpenGL::Texture getTexture(Texture& tex);
 	OpenGL::Program& getSpecializedShader();

 	PICA::ShaderGen::FragmentGenerator fragShaderGen;
+	OpenGL::Driver driverInfo;

 	MAKE_LOG_FUNCTION(log, rendererLogger)
 	void setupBlending();
@ -93,6 +156,8 @@ class RendererGL final : public Renderer {
 	void updateFogLUT();
 	void initGraphicsContextInternal();

+	void accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel);
+
  public:
 	RendererGL(GPU& gpu, const std::array<u32, regNum>& internalRegs, const std::array<u32, extRegNum>& externalRegs)
 		: Renderer(gpu, internalRegs, externalRegs), fragShaderGen(PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL) {}
@ -110,15 +175,13 @@ class RendererGL final : public Renderer {
 	virtual bool supportsShaderReload() override { return true; }
 	virtual std::string getUbershader() override;
 	virtual void setUbershader(const std::string& shader) override;
-
-	virtual void setUbershaderSetting(bool value) override { enableUbershader = value; }
+	virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) override;
 	
 	std::optional<ColourBuffer> getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true);

 	// Note: The caller is responsible for deleting the currently bound FBO before calling this
 	void setFBO(uint handle) { screenFramebuffer.m_handle = handle; }
 	void resetStateManager() { gl.reset(); }
-	void clearShaderCache();
 	void initUbershader(OpenGL::Program& program);

 #ifdef PANDA3DS_FRONTEND_QT
--- a/include/sdl_sensors.hpp
+++ b/include/sdl_sensors.hpp
@ -2,31 +2,37 @@

 #include <cmath>
 #include <glm/glm.hpp>
-#include <numbers>

 #include "helpers.hpp"
 #include "services/hid.hpp"

+// Convert SDL sensor readings to 3DS format
+// We use the same code for Android as well, since the values we get from Android are in the same format as SDL (m/s^2 for acceleration, rad/s for
+// rotation)
 namespace Sensors::SDL {
-	// Convert the rotation data we get from SDL sensor events to rotation data we can feed right to HID
-	// Returns [pitch, roll, yaw]
-	static glm::vec3 convertRotation(glm::vec3 rotation) {
-		// Convert the rotation from rad/s to deg/s and scale by the gyroscope coefficient in HID
-		constexpr float scale = 180.f / std::numbers::pi * HIDService::gyroscopeCoeff;
-		// The axes are also inverted, so invert scale before the multiplication.
-		return rotation * -scale;
-	}
+    // Convert the rotation data we get from SDL sensor events to rotation data we can feed right to HID
+    // Returns [pitch, roll, yaw]
+    static glm::vec3 convertRotation(glm::vec3 rotation) {
+        // The <numbers> header isn't usable on Android yet, hence the hand-written pi constant
+        static constexpr double pi = 3.141592653589793;
+        // Factor for going from rad/s to deg/s, multiplied by the gyroscope coefficient in HID.
+        // The axes are inverted too, so the sign flip is folded into the factor itself
+        constexpr float invertedScale = -(180.f / pi * HIDService::gyroscopeCoeff);
+        return rotation * invertedScale;
+    }

-	static glm::vec3 convertAcceleration(float* data) {
-		// Set our cap to ~9 m/s^2. The 3DS sensors cap at -930 and +930, so values above this value will get clamped to 930
-		// At rest (3DS laid flat on table), hardware reads around ~0 for x and z axis, and around ~480 for y axis due to gravity.
-		// This code tries to mimic this approximately, with offsets based on measurements from my DualShock 4.
-		static constexpr float accelMax = 9.f;
+    static glm::vec3 convertAcceleration(float* data) {
+        // Set our cap to ~9 m/s^2. The 3DS sensors cap at -930 and +930, so values above this value will get clamped to 930
+        // At rest (3DS laid flat on table), hardware reads around ~0 for x and z axis, and around ~480 for y axis due to gravity.
+        // This code tries to mimic this approximately, with offsets based on measurements from my DualShock 4.
+        static constexpr float accelMax = 9.f;
+        // We define standard gravity(g) ourself instead of using the SDL one in order for the code to work on Android too.
+        static constexpr float standardGravity = 9.80665f;

-		s16 x = std::clamp<s16>(s16(data[0] / accelMax * 930.f), -930, +930);
-		s16 y = std::clamp<s16>(s16(data[1] / (SDL_STANDARD_GRAVITY * accelMax) * 930.f - 350.f), -930, +930);
-		s16 z = std::clamp<s16>(s16((data[2] - 2.1f) / accelMax * 930.f), -930, +930);
+        s16 x = std::clamp<s16>(s16(data[0] / accelMax * 930.f), -930, +930);
+        s16 y = std::clamp<s16>(s16(data[1] / (standardGravity * accelMax) * 930.f - 350.f), -930, +930);
+        s16 z = std::clamp<s16>(s16((data[2] - 2.1f) / accelMax * 930.f), -930, +930);

-		return glm::vec3(x, y, z);
-	}
+        return glm::vec3(x, y, z);
+    }
 }  // namespace Sensors::SDL
--- a/include/services/fonts.hpp
+++ b/include/services/fonts.hpp
@ -0,0 +1,84 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+// Adapted from https://github.com/PabloMK7/citra/blob/master/src/core/hle/service/apt/bcfnt/bcfnt.h
+
+#pragma once
+
+#include <memory>
+
+#include "helpers.hpp"
+#include "swap.hpp"
+
+// Plain structs describing the headers and sections of a BCFNT font, plus the routine that relocates the shared font.
+// Multi-byte fields use the explicit little-endian (_le) types. Field order and sizes define the layout, so they must not be rearranged
+namespace HLE::Fonts {
+	// Top-level file header: magic, endianness marker, header size, version, total file size and block count
+	struct CFNT {
+		u8 magic[4];
+		u16_le endianness;
+		u16_le headerSize;
+		u32_le version;
+		u32_le fileSize;
+		u32_le numBlocks;
+	};
+
+	// Common prefix of a section: every section struct below starts with this same magic + sectionSize pair
+	struct SectionHeader {
+		u8 magic[4];
+		u32_le sectionSize;
+	};
+
+	// FINF section. Besides general font metrics it stores the tglpOffset/cwdhOffset/cmapOffset fields
+	// NOTE(review): these offsets are presumably among the internal addresses rewritten by relocateSharedFont - confirm in the implementation
+	struct FINF {
+		u8 magic[4];
+		u32_le sectionSize;
+		u8 fontType;
+		u8 lineFeed;
+		u16_le alterCharIndex;
+		u8 default_width[3];
+		u8 encoding;
+		u32_le tglpOffset;
+		u32_le cwdhOffset;
+		u32_le cmapOffset;
+		u8 height;
+		u8 width;
+		u8 ascent;
+		u8 reserved;
+	};
+
+	// TGLP section: cell and sheet dimensions, sheet count/format, and the offset of the sheet data
+	struct TGLP {
+		u8 magic[4];
+		u32_le sectionSize;
+		u8 cellWidth;
+		u8 cellHeight;
+		u8 baselinePosition;
+		u8 maxCharacterWidth;
+		u32_le sheetSize;
+		u16_le numSheets;
+		u16_le sheetImageFormat;
+		u16_le numColumns;
+		u16_le numRows;
+		u16_le sheetWidth;
+		u16_le sheetHeight;
+		u32_le sheetDataOffset;
+	};
+
+	// CMAP section: covers the code range [codeBegin, codeEnd] with a given mapping method.
+	// nextCmapOffset refers to a further CMAP section, going by the field name
+	struct CMAP {
+		u8 magic[4];
+		u32_le sectionSize;
+		u16_le codeBegin;
+		u16_le codeEnd;
+		u16_le mappingMethod;
+		u16_le reserved;
+		u32_le nextCmapOffset;
+	};
+
+	// CWDH section: covers the index range [startIndex, endIndex].
+	// nextCwdhOffset refers to a further CWDH section, going by the field name
+	struct CWDH {
+		u8 magic[4];
+		u32_le sectionSize;
+		u16_le startIndex;
+		u16_le endIndex;
+		u32_le nextCwdhOffset;
+	};
+
+	// Relocates the internal addresses of the BCFNT Shared Font to the new base. The current base will
+	// be auto-detected based on the file headers.
+	// sharedFont points to the font data, which is modified in place (it is passed as a non-const pointer); newAddress is the new base address
+	void relocateSharedFont(u8* sharedFont, u32 newAddress);
+}  // namespace HLE::Fonts
--- a/readme.md
+++ b/readme.md
@ -1,5 +1,5 @@
 # Panda3DS
-[![Windows Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Windows_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Windows_Build.yml) [![MacOS Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml) [![Linux Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml) [![AUR Package](https://img.shields.io/aur/version/panda3ds-git)](https://aur.archlinux.org/packages/panda3ds-git)
+[![Windows Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Windows_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Windows_Build.yml) [![MacOS Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml) [![Android Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Android_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Android_Build.yml) [![Linux Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml) [![AUR Package](https://img.shields.io/aur/version/panda3ds-git)](https://aur.archlinux.org/packages/panda3ds-git)

 Panda3DS is an HLE, red-panda-themed Nintendo 3DS emulator written in C++ which started out as a fun project out of curiosity, but evolved into something that can sort of play games!

@ -10,7 +10,7 @@ Join our Discord server by pressing on the banner below, or find us on other pla

 [![Discord Banner 2](https://discord.com/api/guilds/1118695732958994532/widget.png?style=banner2)](https://discord.gg/ZYbugsEmsw)

-![screenshot1](docs/img/KirbyRobobot.png) ![screenshot2](docs/img/OoT_Title.png) ![screenshot3](docs/img/pokegang.png)
+![screenshot1](docs/img/KirbyRobobot.png) ![screenshot2](docs/img/OoT_Title.png) ![screenshot3](docs/img/pokegang.png) ![screenshot4](docs/img/KirbyAndroid.png)

 # Download
 You can download stable builds from the Releases tab, or you can download the latest build from the tables below. Additionally, Panda3DS comes in 2 flavours on PC: A minimal SDL frontend, which does not have a GUI, and an experimental Qt 6 frontend with a proper user interface.
@ -22,16 +22,16 @@ SDL builds (No GUI):
 |MacOS build|[![MacOS Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml)|[MacOS App Bundle](https://nightly.link/wheremyfoodat/Panda3DS/workflows/MacOS_Build/master/MacOS%20Alber%20App%20Bundle.zip)|
 |Linux build|[![Linux Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml)|[Linux AppImage](https://nightly.link/wheremyfoodat/Panda3DS/workflows/Linux_AppImage_Build/master/Linux%20executable.zip)|

-Qt builds:
+Qt and Android builds:
 |Platform|Status|Download|
 |--------|------------|--------|
 |Windows build|[![Qt Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml/badge.svg)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml)|[Windows Executable](https://nightly.link/wheremyfoodat/Panda3DS/workflows/Qt_Build/master/Windows%20executable.zip)|
 |MacOS build|[![Qt Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml/badge.svg)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml)|[MacOS App Bundle](https://nightly.link/wheremyfoodat/Panda3DS/workflows/Qt_Build/master/MacOS%20Alber%20App%20Bundle.zip)|
 |Linux build|[![Qt Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml/badge.svg)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml)|[Linux AppImage](https://nightly.link/wheremyfoodat/Panda3DS/workflows/Qt_Build/master/Linux%20executable.zip)|
-
+|Android build (arm64)|[![Android Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Android_Build.yml/badge.svg)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Android_Build.yml)|[Android APK](https://nightly.link/wheremyfoodat/Panda3DS/workflows/Android_Build/master/Android%20APKs%20(arm64).zip)|

 # Compatibility
-Panda3DS is still in the early stages of development. Many games boot, many don't. Lots of games have at least some hilariously broken graphics, audio is not supported, and some QoL features (including a GUI) are missing. However, even more things are implemented, such as most of the 3DS core required to play games, and various neat features, such as Lua scripting, discord bot support, support for some system apps, cheats, controller support, WIP amiibo support and many more! The emulator is constantly evolving, so make sure to take a peek every now and then!
+Panda3DS is still in the early stages of development. Many games boot, many don't. Lots of games have at least some hilariously broken graphics, audio is WIP, and some QoL features are missing. However, even more things are implemented, such as most of the 3DS core required to play games, and various neat features, such as Lua scripting, discord bot support, support for some system apps, cheats, controller support, WIP amiibo support and many more! The emulator is constantly evolving, so make sure to take a peek every now and then!

 For documenting game compatibility, make sure to visit the [games list repository](https://github.com/Panda3DS-emu/Panda3DS-Games-List). For miscellaneous issues or more technical issues, feel free to use this repo's issues tab.
 # Why?
@ -116,7 +116,7 @@ Panda3DS also supports controller input using the SDL2 GameController API.
 - [MelonDS](https://github.com/melonDS-emu/melonDS): "DS emulator, sorta" - Arisotura
 - [Kaizen](https://github.com/SimoneN64/Kaizen): Experimental work-in-progress low-level N64 emulator
 - [ChonkyStation](https://github.com/liuk7071/ChonkyStation): Work-in-progress PlayStation emulator
- [shadPS4](https://github.com/georgemoralis/shadPS4): Work-in-progress PS4 emulator by the founder of PCSX, PCSX2 and more
+- [shadPS4](https://github.com/shadps4-emu/shadPS4): Work-in-progress PS4 emulator by the founder of PCSX, PCSX2 and more
 - [Hydra](https://github.com/hydra-emu/hydra): Cross-platform GameBoy, NES, N64 and Chip-8 emulator

 # Support
--- a/src/config.cpp
+++ b/src/config.cpp
@ -67,6 +67,7 @@ void EmulatorConfig::load() {
 			vsyncEnabled = toml::find_or<toml::boolean>(gpu, "EnableVSync", true);
 			useUbershaders = toml::find_or<toml::boolean>(gpu, "UseUbershaders", ubershaderDefault);
 			accurateShaderMul = toml::find_or<toml::boolean>(gpu, "AccurateShaderMultiplication", false);
+			accelerateShaders = toml::find_or<toml::boolean>(gpu, "AccelerateShaders", accelerateShadersDefault);

 			forceShadergenForLights = toml::find_or<toml::boolean>(gpu, "ForceShadergenForLighting", true);
 			lightShadergenThreshold = toml::find_or<toml::integer>(gpu, "ShadergenLightThreshold", 1);
@ -79,7 +80,7 @@ void EmulatorConfig::load() {
 		if (audioResult.is_ok()) {
 			auto audio = audioResult.unwrap();

-			auto dspCoreName = toml::find_or<std::string>(audio, "DSPEmulation", "Null");
+			auto dspCoreName = toml::find_or<std::string>(audio, "DSPEmulation", "HLE");
 			dspType = Audio::DSPCore::typeFromString(dspCoreName);
 			audioEnabled = toml::find_or<toml::boolean>(audio, "EnableAudio", false);
 		}
@ -141,6 +142,7 @@ void EmulatorConfig::save() {
 	data["GPU"]["UseUbershaders"] = useUbershaders;
 	data["GPU"]["ForceShadergenForLighting"] = forceShadergenForLights;
 	data["GPU"]["ShadergenLightThreshold"] = lightShadergenThreshold;
+	data["GPU"]["AccelerateShaders"] = accelerateShaders;
 	data["GPU"]["EnableRenderdoc"] = enableRenderdoc;

 	data["Audio"]["DSPEmulation"] = std::string(Audio::DSPCore::typeToString(dspType));
--- a/src/core/PICA/draw_acceleration.cpp
+++ b/src/core/PICA/draw_acceleration.cpp
@ -0,0 +1,137 @@
+#include "PICA/draw_acceleration.hpp"
+
+#include <bit>
+#include <tuple>
+
+#include "PICA/gpu.hpp"
+#include "PICA/pica_simd.hpp"
+#include "PICA/regs.hpp"
+
+void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {  // Fills accel with the index range, loader data pointers and attribute layout for one draw
+	accel.indexed = indexed;
+	accel.totalAttribCount = totalAttribCount;
+	accel.enabledAttributeMask = 0;
+
+	const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16;
+	const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];  // Total # of vertices to transfer
+
+	if (indexed) {
+		u32 indexBufferConfig = regs[PICA::InternalRegs::IndexBufferConfig];
+		u32 indexBufferPointer = vertexBase + (indexBufferConfig & 0xfffffff);
+
+		u8* indexBuffer = getPointerPhys<u8>(indexBufferPointer);
+		u16 minimumIndex = std::numeric_limits<u16>::max();  // NOTE(review): never read (nor is maximumIndex below); also needs <limits>, which this file does not include directly
+		u16 maximumIndex = 0;
+
+		// Check whether the index buffer uses u16 indices or u8
+		accel.useShortIndices = Helpers::getBit<31>(indexBufferConfig);  // Indicates whether vert indices are 16-bit or 8-bit
+
+		// Calculate the minimum and maximum indices used in the index buffer, so we'll only upload the vertices in that range
+		if (accel.useShortIndices) {
+			std::tie(accel.minimumIndex, accel.maximumIndex) = PICA::IndexBuffer::analyze<true>(indexBuffer, vertexCount);
+		} else {
+			std::tie(accel.minimumIndex, accel.maximumIndex) = PICA::IndexBuffer::analyze<false>(indexBuffer, vertexCount);
+		}
+
+		accel.indexBuffer = indexBuffer;
+	} else {
+		accel.indexBuffer = nullptr;
+		accel.minimumIndex = regs[PICA::InternalRegs::VertexOffsetReg];
+		accel.maximumIndex = accel.minimumIndex + vertexCount - 1;
+	}
+
+	const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32);
+	const u64 inputAttrCfg = getVertexShaderInputConfig();
+
+	u32 attrCount = 0;  // NOTE(review): never used in this function
+	u32 loaderOffset = 0;
+	accel.vertexDataSize = 0;
+	accel.totalLoaderCount = 0;
+
+	for (int i = 0; i < PICA::DrawAcceleration::maxLoaderCount; i++) {
+		auto& loaderData = attributeInfo[i];  // Get information for this attribute loader
+
+		// This loader is empty, skip it
+		if (loaderData.componentCount == 0 || loaderData.size == 0) {
+			continue;
+		}
+
+		auto& loader = accel.loaders[accel.totalLoaderCount++];
+
+		// The size of the loader in bytes is equal to the bytes supplied for 1 vertex, multiplied by the number of vertices we'll be uploading
+		// Which is equal to maximumIndex - minimumIndex + 1
+		const u32 bytes = loaderData.size * (accel.maximumIndex - accel.minimumIndex + 1);
+		loader.size = bytes;
+
+		// Add it to the total vertex data size, aligned to 4 bytes.
+		accel.vertexDataSize += (bytes + 3) & ~3;
+
+		// Get a pointer to where this loader's data is stored, starting at the first vertex we'll upload (minimumIndex)
+		const u32 loaderAddress = vertexBase + loaderData.offset + (accel.minimumIndex * loaderData.size);
+		loader.data = getPointerPhys<u8>(loaderAddress);
+
+		u64 attrCfg = loaderData.getConfigFull();  // Get config1 | (config2 << 32)
+		u32 attributeOffset = 0;
+
+		for (int component = 0; component < loaderData.componentCount; component++) {
+			uint attributeIndex = (attrCfg >> (component * 4)) & 0xf;  // Get index of attribute in vertexCfg
+
+			// Vertex attributes used as padding
+			// 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively
+			if (attributeIndex >= 12) [[unlikely]] {
+				// Align attribute address up to a 4 byte boundary
+				attributeOffset = (attributeOffset + 3) & -4;
+				attributeOffset += (attributeIndex - 11) << 2;
+				continue;
+			}
+
+			const u32 attribInfo = (vertexCfg >> (attributeIndex * 4)) & 0xf;
+			const u32 attribType = attribInfo & 0x3;  //  Type of attribute (sbyte/ubyte/short/float)
+			const u32 size = (attribInfo >> 2) + 1;   // Total number of components
+
+			// Size of each component based on the attribute type
+			static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
+			const u32 inputReg = (inputAttrCfg >> (attributeIndex * 4)) & 0xf;
+			// Mark the attribute as enabled
+			accel.enabledAttributeMask |= 1 << inputReg;
+
+			auto& attr = accel.attributeInfo[inputReg];
+			attr.componentCount = size;
+			attr.offset = attributeOffset + loaderOffset;
+			attr.stride = loaderData.size;
+			attr.type = attribType;
+			attributeOffset += size * sizePerComponent[attribType];
+		}
+
+		loaderOffset += loader.size;  // NOTE(review): advances by the unaligned size, while vertexDataSize above sums 4-byte-aligned sizes - confirm which the renderer expects
+	}
+
+	u32 fixedAttributes = fixedAttribMask;
+	accel.fixedAttributes = 0;
+
+	// Fetch values for all fixed attributes using CTZ (std::countr_zero) on the fixed attribute mask to find the attributes that are actually fixed
+	while (fixedAttributes != 0) {
+		// Get index of next fixed attribute and turn it off
+		const u32 index = std::countr_zero<u32>(fixedAttributes);
+		const u32 mask = 1u << index;
+		fixedAttributes ^= mask;
+
+		// PICA register this fixed attribute is meant to go to
+		const u32 inputReg = (inputAttrCfg >> (index * 4)) & 0xf;
+		const u32 inputRegMask = 1u << inputReg;
+
+		// If this input reg is already used for a non-fixed attribute then it will not be replaced by a fixed attribute
+		if ((accel.enabledAttributeMask & inputRegMask) == 0) {
+			vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[index];
+			auto& attr = accel.attributeInfo[inputReg];
+
+			accel.fixedAttributes |= inputRegMask;
+
+			for (int i = 0; i < 4; i++) {
+				attr.fixedValue[i] = fixedAttr[i].toFloat32();
+			}
+		}
+	}
+
+	accel.canBeAccelerated = true;  // Set unconditionally: nothing in this function rejects a draw
+}
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@ -126,37 +126,62 @@ void GPU::reset() {
 	externalRegs[Framebuffer1Config] = static_cast<u32>(PICA::ColorFmt::RGB8);
 	externalRegs[Framebuffer1Select] = 0;

-	renderer->setUbershaderSetting(config.useUbershaders);
 	renderer->reset();
 }

-// Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
-// And whether we are going to use the shader JIT (second template parameter)
-void GPU::drawArrays(bool indexed) {
-	const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
-
-	if (indexed) {
-		if (shaderJITEnabled)
-			drawArrays<true, true>();
-		else
-			drawArrays<true, false>();
-	} else {
-		if (shaderJITEnabled)
-			drawArrays<false, true>();
-		else
-			drawArrays<false, false>();
-	}
-}
-
 static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;

-template <bool indexed, bool useShaderJIT>
-void GPU::drawArrays() {
-	if constexpr (useShaderJIT) {
-		shaderJIT.prepare(shaderUnit.vs);
+// Issue a draw: use the renderer's hardware shader path if it accepts the draw, otherwise call the correct version of the
+// drawArrays template based on whether this is an indexed draw (first template parameter) and whether we use the shader JIT or the interpreter (second template parameter)
+void GPU::drawArrays(bool indexed) {
+	PICA::DrawAcceleration accel;  // NOTE(review): left unfilled when accelerateShaders is off, yet still passed to prepareForDraw - presumably canBeAccelerated defaults to false; confirm
+
+	if (config.accelerateShaders) {
+		// If we are potentially going to use hw shaders, gather the information necessary to do vertex fetch, index buffering, etc on the GPU
+		// This includes parsing which vertices to upload, getting pointers to the index buffer data & vertex data, and so on
+		getAcceleratedDrawInfo(accel, indexed);
+	}
+
+	const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel);
+
+	if (hwShaders) {
+		// Hardware shaders have their own accelerated code path for draws, so they skip everything here
+		const PICA::PrimType primType = static_cast<PICA::PrimType>(Helpers::getBits<8, 2>(regs[PICA::InternalRegs::PrimitiveConfig]));
+		// Total # of vertices to render
+		const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];
+
+		// Note: In the hardware shader path the vertices span shouldn't actually be used as the renderer will perform its own attribute fetching
+		renderer->drawVertices(primType, std::span(vertices).first(vertexCount));
+	} else {
+		const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
+
+		if (indexed) {
+			if (shaderJITEnabled) {
+				drawArrays<true, ShaderExecMode::JIT>();
+			} else {
+				drawArrays<true, ShaderExecMode::Interpreter>();
+			}
+		} else {
+			if (shaderJITEnabled) {
+				drawArrays<false, ShaderExecMode::JIT>();
+			} else {
+				drawArrays<false, ShaderExecMode::Interpreter>();
+			}
+		}
+	}
+}
+
+template <bool indexed, ShaderExecMode mode>
+void GPU::drawArrays() {
+	if constexpr (mode == ShaderExecMode::JIT) {
+		shaderJIT.prepare(shaderUnit.vs);
+	} else if constexpr (mode == ShaderExecMode::Hardware) {
+		// Hardware shaders have their own accelerated code path for draws, so they're not meant to take this path
+		Helpers::panic("GPU::DrawArrays: Hardware shaders shouldn't take this path!");
+	}
+
+	// We can have up to 16 attributes, each one consisting of 4 floats
+	constexpr u32 maxAttrSizeInFloats = 16 * 4;

 	// Base address for vertex attributes
 	// The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible
@ -321,8 +346,6 @@ void GPU::drawArrays() {
 					}

 					// Fill the remaining attribute lanes with default parameters (1.0 for alpha/w, 0.0) for everything else
-					// Corgi does this although I'm not sure if it's actually needed for anything.
-					// TODO: Find out
 					while (component < 4) {
 						attribute[component] = (component == 3) ? f24::fromFloat32(1.0) : f24::fromFloat32(0.0);
 						component++;
@ -336,13 +359,13 @@ void GPU::drawArrays() {

 		// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
 		// Based on the SH_ATTRIBUTES_PERMUTATION registers.
-		// Ie it might attribute #0 to v2, #1 to v7, etc
+		// Ie it might map attribute #0 to v2, #1 to v7, etc
 		for (int j = 0; j < totalAttribCount; j++) {
 			const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
 			std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
 		}

-		if constexpr (useShaderJIT) {
+		if constexpr (mode == ShaderExecMode::JIT) {
 			shaderJIT.run(shaderUnit.vs);
 		} else {
 			shaderUnit.vs.run();
--- a/src/core/PICA/regs.cpp
+++ b/src/core/PICA/regs.cpp
@ -249,6 +249,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 						// If we've reached 3 verts, issue a draw call
 						// Handle rendering depending on the primitive type
 						if (immediateModeVertIndex == 3) {
+							renderer->prepareForDraw(shaderUnit, nullptr);
 							renderer->drawVertices(PICA::PrimType::TriangleList, immediateModeVertices);

 							switch (primType) {
@ -300,7 +301,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 		}

 		case VertexBoolUniform: {
-			shaderUnit.vs.boolUniform = value & 0xffff;
+			shaderUnit.vs.uploadBoolUniform(value & 0xffff);
 			break;
 		}

--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@ -1,5 +1,10 @@
 #include "PICA/shader_decompiler.hpp"

+#include <fmt/format.h>
+
+#include <array>
+#include <cassert>
+
 #include "config.hpp"

 using namespace PICA;
@ -13,11 +18,45 @@ void ControlFlow::analyze(const PICAShader& shader, u32 entrypoint) {
 	analysisFailed = false;

 	const Function* function = addFunction(shader, entrypoint, PICAShader::maxInstructionCount);
-	if (function == nullptr) {
+	if (function == nullptr || function->exitMode != ExitMode::AlwaysEnd) {
 		analysisFailed = true;
 	}
 }

+// Helpers (from Citra) for merging the exit modes of parallel/series code paths
+// Merges the exit modes of two parallel branches: Unknown defers to the other branch, equal modes are kept, anything else is Conditional.
+static ExitMode exitParallel(ExitMode a, ExitMode b) {
+	if (a == ExitMode::Unknown) {
+		return b;
+	}
+	else if (b == ExitMode::Unknown) {
+		return a;
+	}
+	else if (a == b) {
+		return a;
+	}
+	return ExitMode::Conditional;  // The two branches exit differently
+}
+
+// Cascades the exit modes of two blocks of code that run in series: a first, then b.
+static ExitMode exitSeries(ExitMode a, ExitMode b) {
+	assert(a != ExitMode::AlwaysEnd);  // The callers in this file return early instead of cascading when the first block always ends
+
+	if (a == ExitMode::Unknown) {
+		return ExitMode::Unknown;
+	}
+
+	if (a == ExitMode::AlwaysReturn) {
+		return b;  // a always falls through into b, so b alone decides the exit mode
+	}
+
+	if (b == ExitMode::Unknown || b == ExitMode::AlwaysEnd) {
+		return ExitMode::AlwaysEnd;
+	}
+
+	return ExitMode::Conditional;
+}
+
 ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 end, Function::Labels& labels) {
 	// Initialize exit mode to unknown by default, in order to detect things like unending loops
 	auto [it, inserted] = exitMap.emplace(AddressRange(start, end), ExitMode::Unknown);
@ -32,25 +71,132 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 		const u32 opcode = instruction >> 26;

 		switch (opcode) {
-			case ShaderOpcodes::JMPC: Helpers::panic("Unimplemented control flow operation (JMPC)");
-			case ShaderOpcodes::JMPU: Helpers::panic("Unimplemented control flow operation (JMPU)");
-			case ShaderOpcodes::IFU: Helpers::panic("Unimplemented control flow operation (IFU)");
-			case ShaderOpcodes::IFC: Helpers::panic("Unimplemented control flow operation (IFC)");
-			case ShaderOpcodes::CALL: Helpers::panic("Unimplemented control flow operation (CALL)");
-			case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)");
-			case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)");
-			case ShaderOpcodes::LOOP: Helpers::panic("Unimplemented control flow operation (LOOP)");
-			case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second;
+			case ShaderOpcodes::JMPC:
+			case ShaderOpcodes::JMPU: {
+				const u32 dest = getBits<10, 12>(instruction);
+				// Register this jump address to our outLabels set
+				labels.insert(dest);

+				// This opens up 2 parallel paths of execution
+				auto branchTakenExit = analyzeFunction(shader, dest, end, labels);
+				auto branchNotTakenExit = analyzeFunction(shader, pc + 1, end, labels);
+				it->second = exitParallel(branchTakenExit, branchNotTakenExit);
+				return it->second;
+			}
+
+			case ShaderOpcodes::IFU:
+			case ShaderOpcodes::IFC: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+
+				const Function* branchTakenFunc = addFunction(shader, pc + 1, dest);
+				// Check if analysis of the branch taken func failed and return unknown if it did
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				// Next analyze the not taken func
+				ExitMode branchNotTakenExitMode = ExitMode::AlwaysReturn;
+				if (num != 0) {
+					const Function* branchNotTakenFunc = addFunction(shader, dest, dest + num);
+					// Check if analysis failed and return unknown if it did
+					if (analysisFailed) {
+						it->second = ExitMode::Unknown;
+						return it->second;
+					}
+
+					branchNotTakenExitMode = branchNotTakenFunc->exitMode;
+				}
+
+				auto parallel = exitParallel(branchTakenFunc->exitMode, branchNotTakenExitMode);
+				// Both branches of the if/else always end the shader, so nothing after the conditional can execute
+				if (parallel == ExitMode::AlwaysEnd) {
+					it->second = parallel;
+					return it->second;
+				} else {
+					ExitMode afterConditional = analyzeFunction(shader, dest + num, end, labels);
+					ExitMode conditionalExitMode = exitSeries(parallel, afterConditional);
+					it->second = conditionalExitMode;
+					return it->second;
+				}
+				break;
+			}
+
+			case ShaderOpcodes::CALL: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* calledFunction = addFunction(shader, dest, dest + num);
+
+				// Check if analysis of the called function failed and return unknown if it did
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				if (calledFunction->exitMode == ExitMode::AlwaysEnd) {
+					it->second = ExitMode::AlwaysEnd;
+					return it->second;
+				}
+
+				// Exit mode of the remainder of this function, after we return from the callee
+				const ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels);
+				const ExitMode exitMode = exitSeries(calledFunction->exitMode, postCallExitMode);
+
+				it->second = exitMode;
+				return exitMode;
+			}
+
+			case ShaderOpcodes::CALLC:
+			case ShaderOpcodes::CALLU: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* calledFunction = addFunction(shader, dest, dest + num);
+
+				// Check if analysis of the called function failed and return unknown if it did
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				// Exit mode of the remainder of this function, after we return from the callee
+				const ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels);
+				const ExitMode exitMode = exitSeries(exitParallel(calledFunction->exitMode, ExitMode::AlwaysReturn), postCallExitMode);
+
+				it->second = exitMode;
+				return exitMode;
+			}
+
+			case ShaderOpcodes::LOOP: {
+				u32 dest = getBits<10, 12>(instruction);
+				const Function* loopFunction = addFunction(shader, pc + 1, dest + 1);
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				if (loopFunction->exitMode == ExitMode::AlwaysEnd) {
+					it->second = ExitMode::AlwaysEnd;
+					return it->second;
+				}
+
+				const ExitMode afterLoop = analyzeFunction(shader, dest + 1, end, labels);
+				const ExitMode exitMode = exitSeries(loopFunction->exitMode, afterLoop);
+				it->second = exitMode;
+				return it->second;
+			}
+
+			case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second;
 			default: break;
 		}
 	}

 	// A function without control flow instructions will always reach its "return point" and return
-	return ExitMode::AlwaysReturn;
+	it->second = ExitMode::AlwaysReturn;
+	return it->second;
 }

-void ShaderDecompiler::compileRange(const AddressRange& range) {
+std::pair<u32, bool> ShaderDecompiler::compileRange(const AddressRange& range) {
 	u32 pc = range.start;
 	const u32 end = range.end >= range.start ? range.end : PICAShader::maxInstructionCount;
 	bool finished = false;
@ -58,6 +204,8 @@ void ShaderDecompiler::compileRange(const AddressRange& range) {
 	while (pc < end && !finished) {
 		compileInstruction(pc, finished);
 	}
+
+	return std::make_pair(pc, finished);
 }

 const Function* ShaderDecompiler::findFunction(const AddressRange& range) {
@ -71,20 +219,43 @@ const Function* ShaderDecompiler::findFunction(const AddressRange& range) {
 }

 void ShaderDecompiler::writeAttributes() {
+	// Annoyingly, GLES does not support having an array as an input attribute, so declare each attribute separately for now
 	decompiledShader += R"(
-		layout(location = 0) in vec4 inputs[8];
+	layout(location = 0) in vec4 attr0;
+	layout(location = 1) in vec4 attr1;
+	layout(location = 2) in vec4 attr2;
+	layout(location = 3) in vec4 attr3;
+	layout(location = 4) in vec4 attr4;
+	layout(location = 5) in vec4 attr5;
+	layout(location = 6) in vec4 attr6;
+	layout(location = 7) in vec4 attr7;
+	layout(location = 8) in vec4 attr8;
+	layout(location = 9) in vec4 attr9;
+	layout(location = 10) in vec4 attr10;
+	layout(location = 11) in vec4 attr11;
+	layout(location = 12) in vec4 attr12;
+	layout(location = 13) in vec4 attr13;
+	layout(location = 14) in vec4 attr14;
+	layout(location = 15) in vec4 attr15;

-		layout(std140) uniform PICAShaderUniforms {
-			vec4 uniform_float[96];
-			uvec4 uniform_int;
-			uint uniform_bool;
-		};
-	
-		vec4 temp_registers[16];
-		vec4 dummy_vec = vec4(0.0);
+	layout(std140) uniform PICAShaderUniforms {
+		vec4 uniform_f[96];
+		uvec4 uniform_i;
+		uint uniform_bool;
+	};
+
+	vec4 temp[16];
+	vec4 out_regs[16];
+	vec4 dummy_vec = vec4(0.0);
+	ivec3 addr_reg = ivec3(0);
+	bvec2 cmp_reg = bvec2(false);
+
+	vec4 uniform_indexed(int source, int offset) {
+		int clipped_offs = (offset >= -128 && offset <= 127) ? offset : 0;
+		uint index = uint(clipped_offs + source) & 127u;
+		return (index < 96u) ? uniform_f[index] : vec4(1.0);
+	}
 )";
-
-	decompiledShader += "\n";
 }

 std::string ShaderDecompiler::decompile() {
@ -94,11 +265,14 @@ std::string ShaderDecompiler::decompile() {
 		return "";
 	}

-	decompiledShader = "";
+	compilationError = false;
+	decompiledShader.clear();
+	// Reserve some memory for the shader string to avoid memory allocations
+	decompiledShader.reserve(256 * 1024);

 	switch (api) {
 		case API::GL: decompiledShader += "#version 410 core\n"; break;
-		case API::GLES: decompiledShader += "#version 300 es\n"; break;
+		case API::GLES: decompiledShader += "#version 300 es\nprecision mediump float;\nprecision mediump int;\n"; break;
 		default: break;
 	}

@ -109,7 +283,7 @@ std::string ShaderDecompiler::decompile() {
 		decompiledShader += R"(
 			vec4 safe_mul(vec4 a, vec4 b) {
 				vec4 res = a * b;
-				return mix(res, mix(mix(vec4(0.0), res, isnan(rhs)), product, isnan(lhs)), isnan(res));
+				return mix(res, mix(mix(vec4(0.0), res, isnan(b)), res, isnan(a)), isnan(res));
 			}
 		)";
 	}
@ -121,17 +295,61 @@ std::string ShaderDecompiler::decompile() {

 	decompiledShader += "void pica_shader_main() {\n";
 	AddressRange mainFunctionRange(entrypoint, PICAShader::maxInstructionCount);
-	callFunction(*findFunction(mainFunctionRange));
-	decompiledShader += "}\n";
+	auto mainFunc = findFunction(mainFunctionRange);

-	for (auto& func : controlFlow.functions) {
-		if (func.outLabels.size() > 0) {
-			Helpers::panic("Function with out labels");
+	decompiledShader += mainFunc->getCallStatement() + ";\n}\n";
+
+	for (const Function& func : controlFlow.functions) {
+		if (func.outLabels.empty()) {
+			decompiledShader += fmt::format("bool {}() {{\n", func.getIdentifier());
+
+			auto [pc, finished] = compileRange(AddressRange(func.start, func.end));
+			if (!finished) {
+				decompiledShader += "return false;";
+			}
+
+			decompiledShader += "}\n";
+		} else {
+			auto labels = func.outLabels;
+			labels.insert(func.start);
+
+			// If a function has jumps and "labels", this needs to be emulated using a switch-case, with the variable being switched on being the
+			// current PC
+			decompiledShader += fmt::format("bool {}() {{\n", func.getIdentifier());
+			decompiledShader += fmt::format("uint pc = {}u;\n", func.start);
+			decompiledShader += "while(true){\nswitch(pc){\n";
+
+			for (u32 label : labels) {
+				decompiledShader += fmt::format("case {}u: {{", label);
+				// Fetch the next label whose address > label
+				auto it = labels.lower_bound(label + 1);
+				u32 next = (it == labels.end()) ? func.end : *it;
+
+				auto [endPC, finished] = compileRange(AddressRange(label, next));
+				if (endPC > next && !finished) {
+					labels.insert(endPC);
+					decompiledShader += fmt::format("pc = {}u; break;", endPC);
+				}
+
+				// Fallthrough to next label
+				decompiledShader += "}\n";
+			}
+
+			decompiledShader += "default: return false;\n";
+			// Exit the switch and loop
+			decompiledShader += "} }\n";
+
+			// Exit the function
+			decompiledShader += "return false;\n";
+			decompiledShader += "}\n";
 		}
+	}

-		decompiledShader += "void " + func.getIdentifier() + "() {\n";
-		compileRange(AddressRange(func.start, func.end));
-		decompiledShader += "}\n";
+	// We allow some leeway for "compilation errors" in addition to control flow errors, in cases where eg an unimplemented instruction
+	// or an instruction that we can't emulate in GLSL is found in the instruction stream. Just like control flow errors, these return an empty string
+	// and the renderer core will decide to use CPU shaders instead
+	if (compilationError) [[unlikely]] {
+		return "";
 	}

 	return decompiledShader;
@ -139,30 +357,41 @@ std::string ShaderDecompiler::decompile() {

 std::string ShaderDecompiler::getSource(u32 source, [[maybe_unused]] u32 index) const {
 	if (source < 0x10) {
-		return "inputs[" + std::to_string(source) + "]";
+		return "attr" + std::to_string(source);
 	} else if (source < 0x20) {
-		return "temp_registers[" + std::to_string(source - 0x10) + "]";
+		return "temp[" + std::to_string(source - 0x10) + "]";
 	} else {
 		const usize floatIndex = (source - 0x20) & 0x7f;

-		if (floatIndex >= 96) [[unlikely]] {
-			return "dummy_vec";
+		if (index == 0) {
+			if (floatIndex >= 96) [[unlikely]] {
+				return "dummy_vec";
+			}
+			return "uniform_f[" + std::to_string(floatIndex) + "]";
+		} else {
+			static constexpr std::array<const char*, 4> offsets = {"0", "addr_reg.x", "addr_reg.y", "addr_reg.z"};
+			return fmt::format("uniform_indexed({}, {})", floatIndex, offsets[index]);
 		}
-		return "uniform_float[" + std::to_string(floatIndex) + "]";
 	}
 }

 std::string ShaderDecompiler::getDest(u32 dest) const {
 	if (dest < 0x10) {
-		return "output_registers[" + std::to_string(dest) + "]";
+		return "out_regs[" + std::to_string(dest) + "]";
 	} else if (dest < 0x20) {
-		return "temp_registers[" + std::to_string(dest - 0x10) + "]";
+		return "temp[" + std::to_string(dest - 0x10) + "]";
 	} else {
 		return "dummy_vec";
 	}
 }

 std::string ShaderDecompiler::getSwizzlePattern(u32 swizzle) const {
+	// If the swizzle field is this value then the swizzle pattern is .xyzw so we don't need a shuffle
+	static constexpr uint noSwizzle = 0x1B;
+	if (swizzle == noSwizzle) {
+		return "";
+	}
+
 	static constexpr std::array<char, 4> names = {'x', 'y', 'z', 'w'};
 	std::string ret(".    ");
 	
@ -176,7 +405,6 @@ std::string ShaderDecompiler::getSwizzlePattern(u32 swizzle) const {

 std::string ShaderDecompiler::getDestSwizzle(u32 destinationMask) const {
 	std::string ret = ".";
-	
 	if (destinationMask & 0b1000) {
 		ret += "x";
 	}
@ -208,11 +436,12 @@ void ShaderDecompiler::setDest(u32 operandDescriptor, const std::string& dest, c
 		return;
 	}

-	decompiledShader += dest + destSwizzle + " = ";
-	if (writtenLaneCount == 1) {
-		decompiledShader += "float(" + value + ");\n";
-	} else {
-		decompiledShader += "vec" + std::to_string(writtenLaneCount) + "(" + value + ");\n";
+	// Don't write destination swizzle if all lanes are getting written to
+	decompiledShader += fmt::format("{}{} = ", dest, writtenLaneCount == 4 ? "" : destSwizzle);
+	if (writtenLaneCount <= 3) {
+		decompiledShader += fmt::format("({}){};\n", value, destSwizzle);
+	} else if (writtenLaneCount == 4) {
+		decompiledShader += fmt::format("{};\n", value);
 	}
 }

@ -246,26 +475,101 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {

 		std::string dest = getDest(destIndex);

-		if (idx != 0) {
-			Helpers::panic("GLSL recompiler: Indexed instruction");
-		}
-
-		if (invertSources) {
-			Helpers::panic("GLSL recompiler: Inverted instruction");
-		}
-
 		switch (opcode) {
 			case ShaderOpcodes::MOV: setDest(operandDescriptor, dest, src1); break;
-			case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, src1 + " + " + src2); break;
-			case ShaderOpcodes::MUL: setDest(operandDescriptor, dest, src1 + " * " + src2); break;
-			case ShaderOpcodes::MAX: setDest(operandDescriptor, dest, "max(" + src1 + ", " + src2 + ")"); break;
-			case ShaderOpcodes::MIN: setDest(operandDescriptor, dest, "min(" + src1 + ", " + src2 + ")"); break;
+			case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, fmt::format("{} + {}", src1, src2)); break;
+			case ShaderOpcodes::MUL:
+				if (!config.accurateShaderMul) {
+					setDest(operandDescriptor, dest, fmt::format("{} * {}", src1, src2));
+				} else {
+					setDest(operandDescriptor, dest, fmt::format("safe_mul({}, {})", src1, src2));
+				}
+				break;
+			case ShaderOpcodes::MAX: setDest(operandDescriptor, dest, fmt::format("max({}, {})", src1, src2)); break;
+			case ShaderOpcodes::MIN: setDest(operandDescriptor, dest, fmt::format("min({}, {})", src1, src2)); break;

-			case ShaderOpcodes::DP3: setDest(operandDescriptor, dest, "vec4(dot(" + src1 + ".xyz, " + src2 + ".xyz))"); break;
-			case ShaderOpcodes::DP4: setDest(operandDescriptor, dest, "vec4(dot(" + src1 + ", " + src2 + "))"); break;
-			case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, "vec4(inversesqrt(" + src1 + ".x))"); break;
+			case ShaderOpcodes::DP3:
+				if (!config.accurateShaderMul) {
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot({}.xyz, {}.xyz))", src1, src2));
+				} else {
+					// A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec3(1.0)
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul({}, {}).xyz, vec3(1.0)))", src1, src2));
+				}
+				break;
+			case ShaderOpcodes::DP4:
+				if (!config.accurateShaderMul) {
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot({}, {}))", src1, src2));
+				} else {
+					// A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec4(1.0)
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul({}, {}), vec4(1.0)))", src1, src2));
+				}
+				break;
+			case ShaderOpcodes::FLR: setDest(operandDescriptor, dest, fmt::format("floor({})", src1)); break;
+			case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, fmt::format("vec4(inversesqrt({}.x))", src1)); break;
+			case ShaderOpcodes::RCP: setDest(operandDescriptor, dest, fmt::format("vec4(1.0 / {}.x)", src1)); break;
+			case ShaderOpcodes::LG2: setDest(operandDescriptor, dest, fmt::format("vec4(log2({}.x))", src1)); break;
+			case ShaderOpcodes::EX2: setDest(operandDescriptor, dest, fmt::format("vec4(exp2({}.x))", src1)); break;

-			default: Helpers::panic("GLSL recompiler: Unknown common opcode: %X", opcode); break;
+			case ShaderOpcodes::SLT:
+			case ShaderOpcodes::SLTI: setDest(operandDescriptor, dest, fmt::format("vec4(lessThan({}, {}))", src1, src2)); break;
+
+			case ShaderOpcodes::SGE:
+			case ShaderOpcodes::SGEI: setDest(operandDescriptor, dest, fmt::format("vec4(greaterThanEqual({}, {}))", src1, src2)); break;
+
+			case ShaderOpcodes::DPH:
+			case ShaderOpcodes::DPHI:
+				if (!config.accurateShaderMul) {
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot(vec4({}.xyz, 1.0), {}))", src1, src2));
+				} else {
+					// A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec4(1.0)
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul(vec4({}.xyz, 1.0), {}), vec4(1.0)))", src1, src2));
+				}
+				break;
+
+			case ShaderOpcodes::CMP1:
+			case ShaderOpcodes::CMP2: {
+				static constexpr std::array<const char*, 8> operators = {
+					// The last 2 operators always return true and are handled specially
+					"==", "!=", "<", "<=", ">", ">=", "", "",
+				};
+
+				const u32 cmpY = getBits<21, 3>(instruction);
+				const u32 cmpX = getBits<24, 3>(instruction);
+
+				// Compare x first
+				if (cmpX >= 6) {
+					decompiledShader += "cmp_reg.x = true;\n";
+				} else {
+					decompiledShader += fmt::format("cmp_reg.x = {}.x {} {}.x;\n", src1, operators[cmpX], src2);
+				}
+
+				// Then compare Y
+				if (cmpY >= 6) {
+					decompiledShader += "cmp_reg.y = true;\n";
+				} else {
+					decompiledShader += fmt::format("cmp_reg.y = {}.y {} {}.y;\n", src1, operators[cmpY], src2);
+				}
+				break;
+			}
+
+			case ShaderOpcodes::MOVA: {
+				const bool writeX = getBit<3>(operandDescriptor);  // Should we write the x component of the address register?
+				const bool writeY = getBit<2>(operandDescriptor);
+
+				if (writeX && writeY) {
+					decompiledShader += fmt::format("addr_reg.xy = ivec2({}.xy);\n", src1);
+				} else if (writeX) {
+					decompiledShader += fmt::format("addr_reg.x = int({}.x);\n", src1);
+				} else if (writeY) {
+					decompiledShader += fmt::format("addr_reg.y = int({}.y);\n", src1);
+				}
+				break;
+			}
+
+			default:
+				Helpers::warn("GLSL recompiler: Unknown common opcode: %02X. Falling back to CPU shaders", opcode);
+				compilationError = true;
+				break;
 		}
 	} else if (opcode >= 0x30 && opcode <= 0x3F) { // MAD and MADI
 		const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x1f];
@ -299,23 +603,156 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 		src3 += getSwizzlePattern(swizzle3);

 		std::string dest = getDest(destIndex);
-
-		if (idx != 0) {
-			Helpers::panic("GLSL recompiler: Indexed instruction");
+		if (!config.accurateShaderMul) {
+			setDest(operandDescriptor, dest, fmt::format("{} * {} + {}", src1, src2, src3));
+		} else {
+			setDest(operandDescriptor, dest, fmt::format("safe_mul({}, {}) + {}", src1, src2, src3));
 		}
-
-		setDest(operandDescriptor, dest, src1 + " * " + src2 + " + " + src3);
 	} else {
 		switch (opcode) {
-			case ShaderOpcodes::END: finished = true; return;
-			default: Helpers::panic("GLSL recompiler: Unknown opcode: %X", opcode); break;
+			case ShaderOpcodes::JMPC: {
+				const u32 dest = getBits<10, 12>(instruction);
+				const u32 condOp = getBits<22, 2>(instruction);
+				const uint refY = getBit<24>(instruction);
+				const uint refX = getBit<25>(instruction);
+				const char* condition = getCondition(condOp, refX, refY);
+
+				decompiledShader += fmt::format("if ({}) {{ pc = {}u; break; }}\n", condition, dest);
+				break;
+			}
+
+			case ShaderOpcodes::JMPU: {
+				const u32 dest = getBits<10, 12>(instruction);
+				const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check
+				const u32 mask = 1u << bit;
+				const u32 test = (instruction & 1) ^ 1;  // If the LSB is 0 we jump if bit = 1, otherwise 0
+
+				decompiledShader += fmt::format("if ((uniform_bool & {}u) {} 0u) {{ pc = {}u; break; }}\n", mask, (test != 0) ? "!=" : "==", dest);
+				break;
+			}
+
+			case ShaderOpcodes::IFU:
+			case ShaderOpcodes::IFC: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* conditionalFunc = findFunction(AddressRange(pc + 1, dest));
+
+				if (opcode == ShaderOpcodes::IFC) {
+					const u32 condOp = getBits<22, 2>(instruction);
+					const uint refY = getBit<24>(instruction);
+					const uint refX = getBit<25>(instruction);
+					const char* condition = getCondition(condOp, refX, refY);
+
+					decompiledShader += fmt::format("if ({}) {{", condition);
+				} else {
+					const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check
+					const u32 mask = 1u << bit;
+
+					decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{", mask);
+				}
+
+				callFunction(*conditionalFunc);
+				decompiledShader += "}\n";
+
+				pc = dest;
+				if (num > 0) {
+					const Function* elseFunc = findFunction(AddressRange(dest, dest + num));
+					pc = dest + num;
+
+					decompiledShader += "else { ";
+					callFunction(*elseFunc);
+					decompiledShader += "}\n";
+
+					if (conditionalFunc->exitMode == ExitMode::AlwaysEnd && elseFunc->exitMode == ExitMode::AlwaysEnd) {
+						finished = true;
+						return;
+					}
+				}
+
+				return;
+			}
+
+			case ShaderOpcodes::CALL:
+			case ShaderOpcodes::CALLC:
+			case ShaderOpcodes::CALLU: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* calledFunc = findFunction(AddressRange(dest, dest + num));
+
+				// Handle conditions for CALLC/CALLU
+				if (opcode == ShaderOpcodes::CALLC) {
+					const u32 condOp = getBits<22, 2>(instruction);
+					const uint refY = getBit<24>(instruction);
+					const uint refX = getBit<25>(instruction);
+					const char* condition = getCondition(condOp, refX, refY);
+
+					decompiledShader += fmt::format("if ({}) {{", condition);
+				} else if (opcode == ShaderOpcodes::CALLU) {
+					const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check
+					const u32 mask = 1u << bit;
+
+					decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{", mask);
+				}
+
+				callFunction(*calledFunc);
+
+				// Close brackets for CALLC/CALLU
+				if (opcode != ShaderOpcodes::CALL) {
+					decompiledShader += "}";
+				}
+
+				if (opcode == ShaderOpcodes::CALL && calledFunc->exitMode == ExitMode::AlwaysEnd) {
+					finished = true;
+					return;
+				}
+				break;
+			}
+
+			case ShaderOpcodes::LOOP: {
+				const u32 dest = getBits<10, 12>(instruction);
+				const u32 uniformIndex = getBits<22, 2>(instruction);
+
+				// loop counter = uniform.y
+				decompiledShader += fmt::format("addr_reg.z = int((uniform_i[{}] >> 8u) & 0xFFu);\n", uniformIndex);
+				decompiledShader += fmt::format(
+					"for (uint loopCtr{} = 0u; loopCtr{} <= (uniform_i[{}] & 0xFFu); loopCtr{}++, addr_reg.z += int((uniform_i[{}] >> "
+					"16u) & 0xFFu)) {{\n",
+					pc, pc, uniformIndex, pc, uniformIndex
+				);
+
+				AddressRange range(pc + 1, dest + 1);
+				const Function* func = findFunction(range);
+				callFunction(*func);
+				decompiledShader += "}\n";
+
+				// Jump to the end of the loop. We don't want to compile the code inside the loop again.
+				// This will be incremented by 1 due to the pc++ at the end of this loop.
+				pc = dest;
+
+				if (func->exitMode == ExitMode::AlwaysEnd) {
+					finished = true;
+					return;
+				}
+				break;
+			}
+
+			case ShaderOpcodes::END:
+				decompiledShader += "return true;\n";
+				finished = true;
+				return;
+
+			case ShaderOpcodes::NOP: break;
+
+			default:
+				Helpers::warn("GLSL recompiler: Unknown opcode: %02X. Falling back to CPU shaders", opcode);
+				compilationError = true;
+				break;
 		}
 	}

 	pc++;
 }

-
 bool ShaderDecompiler::usesCommonEncoding(u32 instruction) const {
 	const u32 opcode = instruction >> 26;
 	switch (opcode) {
@ -339,16 +776,57 @@ bool ShaderDecompiler::usesCommonEncoding(u32 instruction) const {
 		case ShaderOpcodes::SLT:
 		case ShaderOpcodes::SLTI:
 		case ShaderOpcodes::SGE:
-		case ShaderOpcodes::SGEI: return true;
+		case ShaderOpcodes::SGEI:
+		case ShaderOpcodes::LITP: return true;

 		default: return false;
 	}
 }

-void ShaderDecompiler::callFunction(const Function& function) { decompiledShader += function.getCallStatement() + ";\n"; }
+void ShaderDecompiler::callFunction(const Function& function) {  // Appends a call to `function` to decompiledShader, shaped by the function's exit mode
+	switch (function.exitMode) {
+		// The callee always ends the shader: emit the call, then an unconditional "return true" so the caller reports that the shader ended
+		case ExitMode::AlwaysEnd: decompiledShader += function.getCallStatement() + ";\nreturn true;\n"; break;
+		// The callee may or may not end the shader: use its result as the condition and only "return true" from the caller when it is true
+		case ExitMode::Conditional: decompiledShader += fmt::format("if ({}) {{ return true; }}\n", function.getCallStatement()); break;
+		// Any other exit mode: emit a plain call statement and discard its result
+		default: decompiledShader += function.getCallStatement() + ";\n"; break;
+	}
+}

 std::string ShaderGen::decompileShader(PICAShader& shader, EmulatorConfig& config, u32 entrypoint, API api, Language language) {
 	ShaderDecompiler decompiler(shader, config, entrypoint, api, language);

 	return decompiler.decompile();
 }
+
+const char* ShaderDecompiler::getCondition(u32 cond, u32 refX, u32 refY) {  // Picks the GLSL bool expression over cmp_reg for a condition op and its reference bits
+	static constexpr std::array<const char*, 16> conditions = {
+		// ref(Y, X) = (0, 0). Every group of 4 is ordered by cond: 0 = x or y equals its ref, 1 = both equal their refs, 2 = x equals refX, 3 = y equals refY
+		"!all(cmp_reg)",
+		"all(not(cmp_reg))",
+		"!cmp_reg.x",
+		"!cmp_reg.y",
+
+		// ref(Y, X) = (0, 1)
+		"cmp_reg.x || !cmp_reg.y",
+		"cmp_reg.x && !cmp_reg.y",
+		"cmp_reg.x",
+		"!cmp_reg.y",
+
+		// ref(Y, X) = (1, 0)
+		"!cmp_reg.x || cmp_reg.y",
+		"!cmp_reg.x && cmp_reg.y",
+		"!cmp_reg.x",
+		"cmp_reg.y",
+
+		// ref(Y, X) = (1, 1)
+		"any(cmp_reg)",
+		"all(cmp_reg)",
+		"cmp_reg.x",
+		"cmp_reg.y",
+	};
+	const u32 key = (cond & 0b11) | (refX << 2) | (refY << 3);  // Bits 0-1: cond, bit 2: refX, bit 3: refY
+
+	return conditions[key];  // NOTE(review): assumes refX and refY are 0 or 1 (the visible callers pass single bits), otherwise key exceeds the table
+}
--- a/src/core/PICA/shader_gen_glsl.cpp
+++ b/src/core/PICA/shader_gen_glsl.cpp
@ -1,6 +1,14 @@
+#include <fmt/format.h>
+
+#include <utility>
+
 #include "PICA/pica_frag_config.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen.hpp"
+
+// We can include the driver headers here since they shouldn't have any actual API-specific code
+#include "renderer_gl/gl_driver.hpp"
+
 using namespace PICA;
 using namespace PICA::ShaderGen;

@ -34,6 +42,8 @@ static constexpr const char* uniformDefinition = R"(

 std::string FragmentGenerator::getDefaultVertexShader() {
 	std::string ret = "";
+	// Reserve some space (128KB) in the output string to avoid too many allocations later
+	ret.reserve(128 * 1024);

 	switch (api) {
 		case API::GL: ret += "#version 410 core"; break;
@ -94,7 +104,7 @@ std::string FragmentGenerator::getDefaultVertexShader() {
 	return ret;
 }

-std::string FragmentGenerator::generate(const FragmentConfig& config) {
+std::string FragmentGenerator::generate(const FragmentConfig& config, void* driverInfo) {
 	std::string ret = "";

 	switch (api) {
@ -103,6 +113,27 @@ std::string FragmentGenerator::generate(const FragmentConfig& config) {
 		default: break;
 	}

+	// For GLES we need to enable & use the framebuffer fetch extension in order to emulate logic ops
+	bool emitLogicOps = api == API::GLES && config.outConfig.logicOpMode != PICA::LogicOpMode::Copy && driverInfo != nullptr;
+
+	if (emitLogicOps) {
+		auto driver = static_cast<OpenGL::Driver*>(driverInfo);
+
+		// If the driver does not support framebuffer fetch at all, don't emit logic op code
+		if (!driver->supportFbFetch()) {
+			emitLogicOps = false;
+		}
+		
+		// Figure out which fb fetch extension we have and enable it
+		else {
+			if (driver->supportsExtFbFetch) {
+				ret += "\n#extension GL_EXT_shader_framebuffer_fetch : enable\n#define fb_color fragColor\n";
+			} else if (driver->supportsArmFbFetch) {
+				ret += "\n#extension GL_ARM_shader_framebuffer_fetch : enable\n#define fb_color gl_LastFragColorARM[0]\n";
+			}
+		}
+	}
+
 	bool unimplementedFlag = false;
 	if (api == API::GLES) {
 		ret += R"(
@ -192,10 +223,13 @@ std::string FragmentGenerator::generate(const FragmentConfig& config) {
 	}

 	compileFog(ret, config);
-
 	applyAlphaTest(ret, config);

-	ret += "fragColor = combinerOutput;\n}"; // End of main function
+	if (!emitLogicOps) {
+		ret += "fragColor = combinerOutput;\n}";  // End of main function
+	} else {
+		compileLogicOps(ret, config);
+	}

 	return ret;
 }
@ -671,3 +705,135 @@ void FragmentGenerator::compileFog(std::string& shader, const PICA::FragmentConf
 	shader += "float fog_factor = clamp(value.r + value.g * delta, 0.0, 1.0);";
 	shader += "combinerOutput.rgb = mix(fog_color, combinerOutput.rgb, fog_factor);";
 }
+
+std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader) {
+	// Builds the GLSL vertex shader that wraps picaSource: appends uniforms, output varyings and a main() that calls pica_shader_main() and forwards out_regs.
+	// First, calculate output register -> Fixed function fragment semantics based on vertConfig. This array contains the mappings for the 32 fixed function semantics (8 variables, with 4 lanes each).
+	// Each entry is a pair, containing the output reg to use for this semantic (first) and which lane of that register (second)
+	std::array<std::pair<int, int>, 32> outputMappings{};
+	// Output registers adjusted according to VS_OUTPUT_MASK, which handles enabling and disabling output attributes
+	std::array<u8, 16> vsOutputRegisters;
+
+	{
+		uint count = 0;
+		u16 outputMask = vertConfig.outputMask;
+
+		// See which registers are actually enabled and ignore the disabled ones
+		for (int i = 0; i < 16; i++) {
+			if (outputMask & 1) {
+				vsOutputRegisters[count++] = i;
+			}
+
+			outputMask >>= 1;
+		}
+
+		// For the others, map the index to a vs output directly (TODO: What does hw actually do?)
+		for (; count < 16; count++) {
+			vsOutputRegisters[count] = count;
+		}
+
+		for (int i = 0; i < vertConfig.outputCount; i++) {  // NOTE(review): assumes outputCount never exceeds the 16 vsOutputRegisters entries - confirm
+			const u32 config = vertConfig.outmaps[i];
+			for (int j = 0; j < 4; j++) {
+				const u32 mapping = (config >> (j * 8)) & 0x1F;  // Low 5 bits of byte j: index of the semantic that lane j of this output feeds
+				outputMappings[mapping] = std::make_pair(vsOutputRegisters[i], j);
+			}
+		}
+	}
+
+	auto getSemanticName = [&](u32 semanticIndex) {
+		auto [reg, lane] = outputMappings[semanticIndex];
+		return fmt::format("out_regs[{}][{}]", reg, lane);  // GLSL expression reading the register lane mapped to this semantic
+	};
+
+	std::string semantics = fmt::format(
+		R"(
+	vec4 a_coords = vec4({}, {}, {}, {});
+	vec4 a_quaternion = vec4({}, {}, {}, {});
+	vec4 a_vertexColour = vec4({}, {}, {}, {});
+	vec2 a_texcoord0 = vec2({}, {});
+	float a_texcoord0_w = {};
+	vec2 a_texcoord1 = vec2({}, {});
+	vec2 a_texcoord2 = vec2({}, {});
+	vec3 a_view = vec3({}, {}, {});
+)",
+		getSemanticName(0), getSemanticName(1), getSemanticName(2), getSemanticName(3), getSemanticName(4), getSemanticName(5), getSemanticName(6),
+		getSemanticName(7), getSemanticName(8), getSemanticName(9), getSemanticName(10), getSemanticName(11), getSemanticName(12),
+		getSemanticName(13), getSemanticName(16), getSemanticName(14), getSemanticName(15), getSemanticName(22), getSemanticName(23),  // Semantic 16 fills a_texcoord0_w, which is declared between texcoord0 and texcoord1
+		getSemanticName(18), getSemanticName(19), getSemanticName(20)
+	);
+
+	if (usingUbershader) {
+		Helpers::panic("Unimplemented: GetVertexShaderAccelerated for ubershader");
+		return picaSource;
+	} else {
+		// TODO: Uniforms and don't hardcode fixed-function semantic indices...
+		std::string ret = picaSource;
+		if (api == API::GLES) {
+			ret += "\n#define USING_GLES\n";
+		}
+
+		ret += uniformDefinition;
+
+		ret += R"(
+out vec4 v_quaternion;
+out vec4 v_colour;
+out vec3 v_texcoord0;
+out vec2 v_texcoord1;
+out vec3 v_view;
+out vec2 v_texcoord2;
+
+#ifndef USING_GLES
+	out float gl_ClipDistance[2];
+#endif
+
+void main() {
+	pica_shader_main();
+)";
+	// Declare the a_* locals from the mapped out_regs lanes; the rest of main() below passes them on to the fragment shader varyings
+	ret += semantics;
+	
+	ret += R"(
+	gl_Position = a_coords;
+	vec4 colourAbs = abs(a_vertexColour);
+	v_colour = min(colourAbs, vec4(1.f));
+
+	v_texcoord0 = vec3(a_texcoord0.x, 1.0 - a_texcoord0.y, a_texcoord0_w);
+	v_texcoord1 = vec2(a_texcoord1.x, 1.0 - a_texcoord1.y);
+	v_texcoord2 = vec2(a_texcoord2.x, 1.0 - a_texcoord2.y);
+	v_view = a_view;
+	v_quaternion = a_quaternion;
+
+#ifndef USING_GLES
+	gl_ClipDistance[0] = -a_coords.z;
+	gl_ClipDistance[1] = dot(clipCoords, a_coords);
+#endif
+})";
+		return ret;
+	}
+}
+
+void FragmentGenerator::compileLogicOps(std::string& shader, const PICA::FragmentConfig& config) {  // Appends the final fragColor write for the configured logic op and closes main()
+	if (api != API::GLES) [[unlikely]] {
+		Helpers::warn("Shadergen: Unsupported API for compileLogicOps");
+		shader += "fragColor = combinerOutput;\n}"; // End of main function, written without any logic op applied
+
+		return;
+	}
+	
+	shader += "fragColor = ";
+	switch (config.outConfig.logicOpMode) {
+		case PICA::LogicOpMode::Copy: shader += "combinerOutput"; break;
+		case PICA::LogicOpMode::Nop: shader += "fb_color"; break;  // fb_color is the macro generate() defines when it enables a framebuffer fetch extension
+		case PICA::LogicOpMode::Clear: shader += "vec4(0.0)"; break;
+		case PICA::LogicOpMode::Set: shader += "vec4(1.0)"; break;
+		case PICA::LogicOpMode::InvertedCopy: shader += "vec4(uvec4(combinerOutput * 255.0) ^ uvec4(0xFFu)) * (1.0 / 255.0)"; break;  // Scale to 0-255, flip the low 8 bits of each channel, scale back
+
+		default:
+			shader += "combinerOutput";  // Modes not handled above fall back to a plain copy, with a warning
+			Helpers::warn("Shadergen: Unimplemented logic op mode");
+			break;
+	}
+
+	shader += ";\n}"; // End of main function
+}
--- a/src/core/PICA/shader_unit.cpp
+++ b/src/core/PICA/shader_unit.cpp
@ -34,4 +34,5 @@ void PICAShader::reset() {

 	codeHashDirty = true;
 	opdescHashDirty = true;
+	uniformsDirty = true;
 }
--- a/src/core/audio/hle_core.cpp
+++ b/src/core/audio/hle_core.cpp
@ -76,6 +76,7 @@ namespace Audio {
 			source.reset();
 		}

+		mixer.reset();
 		// Note: Reset audio pipe AFTER resetting all pipes, otherwise the new data will be yeeted
 		resetAudioPipe();
 	}
@ -250,6 +251,8 @@ namespace Audio {

 			source.isBufferIDDirty = false;
 		}
+
+		performMix(read, write);
 	}

 	void HLE_DSP::updateSourceConfig(Source& source, HLE::SourceConfiguration::Configuration& config, s16_le* adpcmCoefficients) {
@ -465,6 +468,50 @@ namespace Audio {
 		}
 	}

+	void HLE_DSP::performMix(Audio::HLE::SharedMemory& readRegion, Audio::HLE::SharedMemory& writeRegion) {  // Refreshes the mixer settings from readRegion and stubs the DSP status in writeRegion
+		updateMixerConfig(readRegion);
+		// TODO: Do the actual audio mixing
+
+		auto& dspStatus = writeRegion.dspStatus;
+		// Stub the DSP status. It's unknown what the "unknown" field is but Citra sets it to 0, so we do too to be safe
+		dspStatus.droppedFrames = 0;
+		dspStatus.unknown = 0;
+	}
+
+	void HLE_DSP::updateMixerConfig(Audio::HLE::SharedMemory& sharedMem) {  // Copies each setting flagged dirty in sharedMem.dspConfiguration into mixer, then clears the flags
+		auto& config = sharedMem.dspConfiguration;
+		// No configs have been changed, so there's nothing to update
+		if (config.dirtyRaw == 0) {
+			return;
+		}
+
+		if (config.outputFormatDirty) {
+			mixer.channelFormat = config.outputFormat;
+		}
+		
+		if (config.masterVolumeDirty) {
+			mixer.volumes[0] = config.masterVolume;  // volumes[0] holds the master volume
+		}
+
+		if (config.auxVolume0Dirty) {
+			mixer.volumes[1] = config.auxVolumes[0];  // volumes[1] holds aux volume 0
+		}
+		
+		if (config.auxVolume1Dirty) {
+			mixer.volumes[2] = config.auxVolumes[1];  // volumes[2] holds aux volume 1
+		}
+
+		if (config.auxBusEnable0Dirty) {
+			mixer.enableAuxStages[0] = config.auxBusEnable[0] != 0;
+		}
+
+		if (config.auxBusEnable1Dirty) {
+			mixer.enableAuxStages[1] = config.auxBusEnable[1] != 0;
+		}
+
+		config.dirtyRaw = 0;  // Presumably dirtyRaw aliases all of the *Dirty flags, so this acknowledges every change at once - confirm against the struct
+	}
+
 	HLE_DSP::SampleBuffer HLE_DSP::decodePCM8(const u8* data, usize sampleCount, Source& source) {
 		SampleBuffer decodedSamples(sampleCount);

@ -585,7 +632,7 @@ namespace Audio {
 		AAC::Message response;

 		switch (request.command) {
-			case AAC::Command::EncodeDecode:
+			case AAC::Command::EncodeDecode: {
 				// Dummy response to stop games from hanging
 				response.resultCode = AAC::ResultCode::Success;
 				response.decodeResponse.channelCount = 2;
@ -596,10 +643,13 @@ namespace Audio {
 				response.command = request.command;
 				response.mode = request.mode;

-				// We've already got an AAC decoder but it's currently disabled until mixing & output is properly implemented
-				// TODO: Uncomment this when the time comes
-				// aacDecoder->decode(response, request, [this](u32 paddr) { return getPointerPhys<u8>(paddr); });
+				// TODO: Make this a toggle in config.toml. Currently we have it off by default until we finish the DSP mixer.
+				constexpr bool enableAAC = false;
+				if (enableAAC) {
+					aacDecoder->decode(response, request, [this](u32 paddr) { return getPointerPhys<u8>(paddr); });
+				}
 				break;
+			}

 			case AAC::Command::Init:
 			case AAC::Command::Shutdown:
--- a/src/core/kernel/memory_management.cpp
+++ b/src/core/kernel/memory_management.cpp
@ -136,7 +136,7 @@ void Kernel::mapMemoryBlock() {
 				break;

 			case KernelHandles::FontSharedMemHandle:
-				mem.copySharedFont(ptr);
+				mem.copySharedFont(ptr, addr);
 				break;

 			case KernelHandles::CSNDSharedMemHandle:
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@ -7,6 +7,7 @@

 #include "config_mem.hpp"
 #include "resource_limits.hpp"
+#include "services/fonts.hpp"
 #include "services/ptm.hpp"

 CMRC_DECLARE(ConsoleFonts);
@ -51,7 +52,7 @@ void Memory::reset() {
 		if (e.handle == KernelHandles::FontSharedMemHandle) {
 			// Read font size from the cmrc filesystem the font is stored in
 			auto fonts = cmrc::ConsoleFonts::get_filesystem();
-			e.size = fonts.open("CitraSharedFontUSRelocated.bin").size();
+			e.size = fonts.open("SharedFontReplacement.bin").size();
 		}

 		e.mapped = false;
@ -520,10 +521,13 @@ Regions Memory::getConsoleRegion() {
 	return region;
 }

-void Memory::copySharedFont(u8* pointer) {
+void Memory::copySharedFont(u8* pointer, u32 vaddr) {
 	auto fonts = cmrc::ConsoleFonts::get_filesystem();
-	auto font = fonts.open("CitraSharedFontUSRelocated.bin");
+	auto font = fonts.open("SharedFontReplacement.bin");  // Font file read from the embedded ConsoleFonts filesystem
 	std::memcpy(pointer, font.begin(), font.size());
+
+	// Relocate the copied font in place for vaddr, the address it's being loaded to
+	HLE::Fonts::relocateSharedFont(pointer, vaddr);
 }

 std::optional<u64> Memory::getProgramID() {
--- a/src/core/renderer_gl/gl_state.cpp
+++ b/src/core/renderer_gl/gl_state.cpp
@ -73,10 +73,7 @@ void GLStateManager::resetVAO() {
 }

 void GLStateManager::resetBuffers() {
-	boundVBO = 0;
 	boundUBO = 0;
-
-	glBindBuffer(GL_ARRAY_BUFFER, 0);
 	glBindBuffer(GL_UNIFORM_BUFFER, 0);
 }

--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@ -2,13 +2,16 @@

 #include <stb_image_write.h>

+#include <bit>
 #include <cmrc/cmrc.hpp>

-#include "config.hpp"
 #include "PICA/float_types.hpp"
-#include "PICA/pica_frag_uniforms.hpp"
 #include "PICA/gpu.hpp"
+#include "PICA/pica_frag_uniforms.hpp"
+#include "PICA/pica_simd.hpp"
 #include "PICA/regs.hpp"
+#include "PICA/shader_decompiler.hpp"
+#include "config.hpp"
 #include "math_util.hpp"

 CMRC_DECLARE(RendererGL);
@ -24,7 +27,7 @@ void RendererGL::reset() {
 	colourBufferCache.reset();
 	textureCache.reset();

-	clearShaderCache();
+	shaderCache.clear();

 	// Init the colour/depth buffer settings to some random defaults on reset
 	colourBufferLoc = 0;
@ -77,40 +80,56 @@ void RendererGL::initGraphicsContextInternal() {
 	gl.useProgram(displayProgram);
 	glUniform1i(OpenGL::uniformLocation(displayProgram, "u_texture"), 0);  // Init sampler object

+	// Create stream buffers for vertex, index and uniform buffers
+	static constexpr usize hwIndexBufferSize = 2_MB;
+	static constexpr usize hwVertexBufferSize = 16_MB;
+
+	hwIndexBuffer = StreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, hwIndexBufferSize);
+	hwVertexBuffer = StreamBuffer::Create(GL_ARRAY_BUFFER, hwVertexBufferSize);
+
 	// Allocate memory for the shadergen fragment uniform UBO
 	glGenBuffers(1, &shadergenFragmentUBO);
 	gl.bindUBO(shadergenFragmentUBO);
 	glBufferData(GL_UNIFORM_BUFFER, sizeof(PICA::FragmentUniforms), nullptr, GL_DYNAMIC_DRAW);

-	vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW);
-	gl.bindVBO(vbo);
-	vao.create();
-	gl.bindVAO(vao);
+	// Allocate memory for the accelerated vertex shader uniform UBO
+	glGenBuffers(1, &hwShaderUniformUBO);
+	gl.bindUBO(hwShaderUniformUBO);
+	glBufferData(GL_UNIFORM_BUFFER, PICAShader::totalUniformSize(), nullptr, GL_DYNAMIC_DRAW);
+
+	vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize * 2, GL_STREAM_DRAW);
+	vbo.bind();
+	// Initialize the VAO used when not using hw shaders
+	defaultVAO.create();
+	gl.bindVAO(defaultVAO);

 	// Position (x, y, z, w) attributes
-	vao.setAttributeFloat<float>(0, 4, sizeof(Vertex), offsetof(Vertex, s.positions));
-	vao.enableAttribute(0);
+	defaultVAO.setAttributeFloat<float>(0, 4, sizeof(Vertex), offsetof(Vertex, s.positions));
+	defaultVAO.enableAttribute(0);
 	// Quaternion attribute
-	vao.setAttributeFloat<float>(1, 4, sizeof(Vertex), offsetof(Vertex, s.quaternion));
-	vao.enableAttribute(1);
+	defaultVAO.setAttributeFloat<float>(1, 4, sizeof(Vertex), offsetof(Vertex, s.quaternion));
+	defaultVAO.enableAttribute(1);
 	// Colour attribute
-	vao.setAttributeFloat<float>(2, 4, sizeof(Vertex), offsetof(Vertex, s.colour));
-	vao.enableAttribute(2);
+	defaultVAO.setAttributeFloat<float>(2, 4, sizeof(Vertex), offsetof(Vertex, s.colour));
+	defaultVAO.enableAttribute(2);
 	// UV 0 attribute
-	vao.setAttributeFloat<float>(3, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord0));
-	vao.enableAttribute(3);
+	defaultVAO.setAttributeFloat<float>(3, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord0));
+	defaultVAO.enableAttribute(3);
 	// UV 1 attribute
-	vao.setAttributeFloat<float>(4, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord1));
-	vao.enableAttribute(4);
+	defaultVAO.setAttributeFloat<float>(4, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord1));
+	defaultVAO.enableAttribute(4);
 	// UV 0 W-component attribute
-	vao.setAttributeFloat<float>(5, 1, sizeof(Vertex), offsetof(Vertex, s.texcoord0_w));
-	vao.enableAttribute(5);
+	defaultVAO.setAttributeFloat<float>(5, 1, sizeof(Vertex), offsetof(Vertex, s.texcoord0_w));
+	defaultVAO.enableAttribute(5);
 	// View
-	vao.setAttributeFloat<float>(6, 3, sizeof(Vertex), offsetof(Vertex, s.view));
-	vao.enableAttribute(6);
+	defaultVAO.setAttributeFloat<float>(6, 3, sizeof(Vertex), offsetof(Vertex, s.view));
+	defaultVAO.enableAttribute(6);
 	// UV 2 attribute
-	vao.setAttributeFloat<float>(7, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord2));
-	vao.enableAttribute(7);
+	defaultVAO.setAttributeFloat<float>(7, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord2));
+	defaultVAO.enableAttribute(7);
+
+	// Initialize the VAO used for hw shaders
+	hwShaderVAO.create();

 	dummyVBO.create();
 	dummyVAO.create();
@ -165,8 +184,18 @@ void RendererGL::initGraphicsContextInternal() {
 	OpenGL::clearColor();
 	OpenGL::setViewport(oldViewport[0], oldViewport[1], oldViewport[2], oldViewport[3]);

+	// Initialize fixed attributes
+	for (int i = 0; i < fixedAttrValues.size(); i++) {
+		fixedAttrValues[i] = {0.f, 0.f, 0.f, 0.f};
+		glVertexAttrib4f(i, 0.0, 0.0, 0.0, 0.0);
+	}
+
 	reset();

+	// Populate our driver info structure
+	driverInfo.supportsExtFbFetch = (GLAD_GL_EXT_shader_framebuffer_fetch != 0);
+	driverInfo.supportsArmFbFetch = (GLAD_GL_ARM_shader_framebuffer_fetch != 0);
+
 	// Initialize the default vertex shader used with shadergen
 	std::string defaultShadergenVSSource = fragShaderGen.getDefaultVertexShader();
 	defaultShadergenVs.create({defaultShadergenVSSource.c_str(), defaultShadergenVSSource.size()}, OpenGL::Vertex);
@ -414,29 +443,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 		OpenGL::Triangle,
 	};

-	bool usingUbershader = enableUbershader;
-	if (usingUbershader) {
-		const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0;
-		const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1;
-
-		// Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using moret han N lights via shadergen
-		// This way we generate fewer shaders overall than with full shadergen, but don't tank performance 
-		if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) {
-			usingUbershader = false;
-		}
-	}
-		
-	if (usingUbershader) {
-		gl.useProgram(triangleProgram);
-	} else {
-		OpenGL::Program& program = getSpecializedShader();
-		gl.useProgram(program);
-	}
-
 	const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
 	gl.disableScissor();
-	gl.bindVBO(vbo);
-	gl.bindVAO(vao);
+
+	// If we're using accelerated shaders, the hw VAO, VBO and EBO objects will have already been bound in prepareForDraw
+	if (!usingAcceleratedShader) {
+		vbo.bind();
+		gl.bindVAO(defaultVAO);
+	}

 	gl.enableClipPlane(0);  // Clipping plane 0 is always enabled
 	if (regs[PICA::InternalRegs::ClipEnable] & 1) {
@ -454,38 +468,9 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 	const int depthFunc = getBits<4, 3>(depthControl);
 	const int colourMask = getBits<8, 4>(depthControl);
 	gl.setColourMask(colourMask & 1, colourMask & 2, colourMask & 4, colourMask & 8);
-
 	static constexpr std::array<GLenum, 8> depthModes = {GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL};

-	// Update ubershader uniforms
-	if (usingUbershader) {
-		const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
-		const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
-		const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1;
-
-		if (oldDepthScale != depthScale) {
-			oldDepthScale = depthScale;
-			glUniform1f(ubershaderData.depthScaleLoc, depthScale);
-		}
-
-		if (oldDepthOffset != depthOffset) {
-			oldDepthOffset = depthOffset;
-			glUniform1f(ubershaderData.depthOffsetLoc, depthOffset);
-		}
-
-		if (oldDepthmapEnable != depthMapEnable) {
-			oldDepthmapEnable = depthMapEnable;
-			glUniform1i(ubershaderData.depthmapEnableLoc, depthMapEnable);
-		}
-
-		// Upload PICA Registers as a single uniform. The shader needs access to the rasterizer registers (for depth, starting from index 0x48)
-		// The texturing and the fragment lighting registers. Therefore we upload them all in one go to avoid multiple slow uniform updates
-		glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]);
-		setupUbershaderTexEnv();
-	}
-
 	bindTexturesToSlots();
-
 	if (gpu.fogLUTDirty) {
 		updateFogLUT();
 	}
@ -528,8 +513,32 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v

 	setupStencilTest(stencilEnable);

-	vbo.bufferVertsSub(vertices);
-	OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
+	if (!usingAcceleratedShader) {
+		vbo.bufferVertsSub(vertices);
+		OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
+	} else {
+		if (performIndexedRender) {
+			// When doing indexed rendering, use glDrawRangeElementsBaseVertex to issue the indexed draw
+			hwIndexBuffer->Bind();
+
+			if (glDrawRangeElementsBaseVertex != nullptr) [[likely]] {
+				glDrawRangeElementsBaseVertex(
+					primitiveTopology, minimumIndex, maximumIndex, GLsizei(vertices.size()), usingShortIndices ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE,
+					hwIndexBufferOffset, -GLint(minimumIndex)
+				);
+			} else {
+				// If glDrawRangeElementsBaseVertex is not available then prepareForDraw will have subtracted the base vertex from the index buffer
+				// for us, so just use glDrawRangeElements
+				glDrawRangeElements(
+					primitiveTopology, 0, GLint(maximumIndex - minimumIndex), GLsizei(vertices.size()),
+					usingShortIndices ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE, hwIndexBufferOffset
+				);
+			}
+		} else {
+			// When doing non-indexed rendering, just use glDrawArrays
+			OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
+		}
+	}
 }

 void RendererGL::display() {
@ -836,34 +845,53 @@ std::optional<ColourBuffer> RendererGL::getColourBuffer(u32 addr, PICA::ColorFmt
 }

 OpenGL::Program& RendererGL::getSpecializedShader() {
-	constexpr uint uboBlockBinding = 2;
+	constexpr uint vsUBOBlockBinding = 1;
+	constexpr uint fsUBOBlockBinding = 2;

 	PICA::FragmentConfig fsConfig(regs);
+	// If we're not on GLES, ignore the logic op configuration and don't generate redundant shaders for it, since we use hw logic ops
+#ifndef USING_GLES
+	fsConfig.outConfig.logicOpMode = PICA::LogicOpMode(0);
+#endif

-	CachedProgram& programEntry = shaderCache[fsConfig];
+	OpenGL::Shader& fragShader = shaderCache.fragmentShaderCache[fsConfig];
+	if (!fragShader.exists()) {
+		std::string fs = fragShaderGen.generate(fsConfig);
+		fragShader.create({fs.c_str(), fs.size()}, OpenGL::Fragment);
+	}
+
+	// Get the handle of the current vertex shader
+	OpenGL::Shader& vertexShader = usingAcceleratedShader ? *generatedVertexShader : defaultShadergenVs;
+	// And form the key for looking up a shader program
+	const u64 programKey = (u64(vertexShader.handle()) << 32) | u64(fragShader.handle());
+
+	CachedProgram& programEntry = shaderCache.programCache[programKey];
 	OpenGL::Program& program = programEntry.program;

 	if (!program.exists()) {
-		std::string fs = fragShaderGen.generate(fsConfig);
-
-		OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment);
-		program.create({defaultShadergenVs, fragShader});
+		program.create({vertexShader, fragShader});
 		gl.useProgram(program);

-		fragShader.free();
-
 		// Init sampler objects. Texture 0 goes in texture unit 0, texture 1 in TU 1, texture 2 in TU 2, and the light maps go in TU 3
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex0"), 0);
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1);
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2);
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3);

-		// Set up the binding for our UBO. Sadly we can't specify it in the shader like normal people,
+		// Set up the binding for our UBOs. Sadly we can't specify it in the shader like normal people,
 		// As it's an OpenGL 4.2 feature that MacOS doesn't support...
-		uint uboIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms");
-		glUniformBlockBinding(program.handle(), uboIndex, uboBlockBinding);
+		uint fsUBOIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms");
+		glUniformBlockBinding(program.handle(), fsUBOIndex, fsUBOBlockBinding);
+
+		if (usingAcceleratedShader) {
+			uint vertexUBOIndex = glGetUniformBlockIndex(program.handle(), "PICAShaderUniforms");
+			glUniformBlockBinding(program.handle(), vertexUBOIndex, vsUBOBlockBinding);
+		}
+	}
+	glBindBufferBase(GL_UNIFORM_BUFFER, fsUBOBlockBinding, shadergenFragmentUBO);
+	if (usingAcceleratedShader) {
+		glBindBufferBase(GL_UNIFORM_BUFFER, vsUBOBlockBinding, hwShaderUniformUBO);
 	}
-	glBindBufferBase(GL_UNIFORM_BUFFER, uboBlockBinding, shadergenFragmentUBO);

 	// Upload uniform data to our shader's UBO
 	PICA::FragmentUniforms uniforms;
@ -953,6 +981,101 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
 	return program;
 }

+// Picks and binds the shader program that the upcoming draw will use.
+// shaderUnit: provides the PICA vertex shader (shaderUnit.vs) and its uniforms.
+// accel: draw acceleration info; may be nullptr, in which case accelerated shaders are not used.
+// Returns true if the draw will use a hardware-accelerated vertex shader, false otherwise.
+// Side effects visible here: updates usingAcceleratedShader, generatedVertexShader, performIndexedRender,
+// minimumIndex and maximumIndex, clears shaderUnit.vs.uniformsDirty after uploading uniforms, and binds a GL program.
+bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) {
+	// First we figure out if we will be using an ubershader
+	bool usingUbershader = emulatorConfig->useUbershaders;
+	if (usingUbershader) {
+		const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0;
+		const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1;
+
+		// Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using more than N lights via shadergen
+		// This way we generate fewer shaders overall than with full shadergen, but don't tank performance
+		if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) {
+			usingUbershader = false;
+		}
+	}
+
+	// Then we figure out if we will use hw accelerated shaders, and try to fetch our shader
+	// TODO: Ubershader support for accelerated shaders
+	usingAcceleratedShader = emulatorConfig->accelerateShaders && !usingUbershader && accel != nullptr && accel->canBeAccelerated;
+
+	if (usingAcceleratedShader) {
+		// usingUbershader is always false inside this branch (see the condition above)
+		PICA::VertConfig vertexConfig(shaderUnit.vs, regs, usingUbershader);
+
+		// operator[] inserts an empty optional the first time a given config is looked up
+		std::optional<OpenGL::Shader>& shader = shaderCache.vertexShaderCache[vertexConfig];
+		// If the optional is false, we have never tried to recompile the shader before. Try to recompile it and see if it works.
+		if (!shader.has_value()) {
+			// Initialize shader to a "null" shader (handle == 0)
+			// This also means a failed recompilation is remembered and not retried for this config
+			shader = OpenGL::Shader();
+
+			std::string picaShaderSource = PICA::ShaderGen::decompileShader(
+				shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint,
+				Helpers::isAndroid() ? PICA::ShaderGen::API::GLES : PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL
+			);
+
+			// Empty source means compilation error, if the source is not empty then we convert the recompiled PICA code into a valid shader and upload
+			// it to the GPU
+			if (!picaShaderSource.empty()) {
+				std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, vertexConfig, usingUbershader);
+				shader->create({vertexShaderSource}, OpenGL::Vertex);
+			}
+		}
+
+		// Shader generation did not work out, so set usingAcceleratedShader to false
+		if (!shader->exists()) {
+			usingAcceleratedShader = false;
+		} else {
+			// Remember which vertex shader to link with; getSpecializedShader reads generatedVertexShader
+			generatedVertexShader = &(*shader);
+			gl.bindUBO(hwShaderUniformUBO);
+
+			// Only re-upload the PICA vertex shader uniforms when they have changed since the last upload
+			if (shaderUnit.vs.uniformsDirty) {
+				shaderUnit.vs.uniformsDirty = false;
+				glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer());
+			}
+
+			// Stash the index information that drawVertices needs for issuing the draw call
+			performIndexedRender = accel->indexed;
+			minimumIndex = GLsizei(accel->minimumIndex);
+			maximumIndex = GLsizei(accel->maximumIndex);
+
+			// Upload vertex data and index buffer data to our GPU
+			accelerateVertexUpload(shaderUnit, accel);
+		}
+	}
+
+	if (!usingUbershader) {
+		// Specialized path: fetch (or build) the program matching the current state and bind it
+		OpenGL::Program& program = getSpecializedShader();
+		gl.useProgram(program);
+	} else { // Bind ubershader & load ubershader uniforms
+		gl.useProgram(triangleProgram);
+
+		const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
+		const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
+		const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1;
+
+		// The old* members cache the last uploaded value so unchanged uniforms are not re-sent
+		if (oldDepthScale != depthScale) {
+			oldDepthScale = depthScale;
+			glUniform1f(ubershaderData.depthScaleLoc, depthScale);
+		}
+
+		if (oldDepthOffset != depthOffset) {
+			oldDepthOffset = depthOffset;
+			glUniform1f(ubershaderData.depthOffsetLoc, depthOffset);
+		}
+
+		if (oldDepthmapEnable != depthMapEnable) {
+			oldDepthmapEnable = depthMapEnable;
+			glUniform1i(ubershaderData.depthmapEnableLoc, depthMapEnable);
+		}
+
+		// Upload PICA Registers as a single uniform. The shader needs access to the rasterizer registers (for depth, starting from index 0x48)
+		// The texturing and the fragment lighting registers. Therefore we upload them all in one go to avoid multiple slow uniform updates
+		glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]);
+		setupUbershaderTexEnv();
+	}
+
+	return usingAcceleratedShader;
+}
+
 void RendererGL::screenshot(const std::string& name) {
 	constexpr uint width = 400;
 	constexpr uint height = 2 * 240;
@ -966,7 +1089,7 @@ void RendererGL::screenshot(const std::string& name) {

 	// Flip the image vertically
 	for (int y = 0; y < height; y++) {
-		memcpy(&flippedPixels[y * width * 4], &pixels[(height - y - 1) * width * 4], width * 4);
+		std::memcpy(&flippedPixels[y * width * 4], &pixels[(height - y - 1) * width * 4], width * 4);
 		// Swap R and B channels
 		for (int x = 0; x < width; x++) {
 			std::swap(flippedPixels[y * width * 4 + x * 4 + 0], flippedPixels[y * width * 4 + x * 4 + 2]);
@ -978,21 +1101,12 @@ void RendererGL::screenshot(const std::string& name) {
 	stbi_write_png(name.c_str(), width, height, 4, flippedPixels.data(), 0);
 }

-void RendererGL::clearShaderCache() {
-	for (auto& shader : shaderCache) {
-		CachedProgram& cachedProgram = shader.second;
-		cachedProgram.program.free();
-	}
-
-	shaderCache.clear();
-}
-
 void RendererGL::deinitGraphicsContext() {
 	// Invalidate all surface caches since they'll no longer be valid
 	textureCache.reset();
 	depthBufferCache.reset();
 	colourBufferCache.reset();
-	clearShaderCache();
+	shaderCache.clear();

 	// All other GL objects should be invalidated automatically and be recreated by the next call to initGraphicsContext
 	// TODO: Make it so that depth and colour buffers get written back to 3DS memory
@ -1041,3 +1155,99 @@ void RendererGL::initUbershader(OpenGL::Program& program) {
 	glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2);
 	glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3);
 }
+
+// Uploads the index and vertex data for a hardware-accelerated draw and configures the vertex attribute state of hwShaderVAO.
+// shaderUnit: currently unused by this function.
+// accel: describes the draw (index buffer, attribute loaders, per-attribute layout and fixed attribute values). Must not be null.
+// Side effects: updates usingShortIndices and hwIndexBufferOffset (indexed draws only), previousAttributeMask and fixedAttrValues,
+// and leaves hwVertexBuffer and hwShaderVAO bound.
+void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) {
+	// Maps the PICA attribute type field (0-3) to the matching GL component type
+	static constexpr GLenum attributeFormats[4] = {
+		GL_BYTE,           // 0: Signed byte
+		GL_UNSIGNED_BYTE,  // 1: Unsigned byte
+		GL_SHORT,          // 2: Short
+		GL_FLOAT,          // 3: Float
+	};
+
+	// Update index buffer if necessary
+	if (accel->indexed) {
+		usingShortIndices = accel->useShortIndices;
+		const u32 indexCount = regs[PICA::InternalRegs::VertexCountReg];
+		const usize indexBufferSize = indexCount * (usingShortIndices ? sizeof(u16) : sizeof(u8));
+
+		hwIndexBuffer->Bind();
+		auto indexBufferRes = hwIndexBuffer->Map(4, indexBufferSize);
+		// The draw call takes the index buffer offset as a pointer-typed value, so store it in that form
+		hwIndexBufferOffset = reinterpret_cast<void*>(usize(indexBufferRes.buffer_offset));
+
+		std::memcpy(indexBufferRes.pointer, accel->indexBuffer, indexBufferSize);
+		// If we don't have glDrawRangeElementsBaseVertex, we must subtract the base index value from our index buffer manually
+		if (glDrawRangeElementsBaseVertex == nullptr) [[unlikely]] {
+			usingShortIndices ? PICA::IndexBuffer::subtractBaseIndex<true>((u8*)indexBufferRes.pointer, indexCount, accel->minimumIndex)
+							  : PICA::IndexBuffer::subtractBaseIndex<false>((u8*)indexBufferRes.pointer, indexCount, accel->minimumIndex);
+		}
+
+		hwIndexBuffer->Unmap(indexBufferSize);
+	}
+
+	hwVertexBuffer->Bind();
+	auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize);
+	u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer);
+	const u32 vertexBufferOffset = vertexBufferRes.buffer_offset;
+
+	gl.bindVAO(hwShaderVAO);
+
+	// Enable or disable vertex attributes as needed
+	const u32 currentAttributeMask = accel->enabledAttributeMask;
+	// Use bitwise xor to calculate which attributes changed
+	u32 attributeMaskDiff = currentAttributeMask ^ previousAttributeMask;
+
+	while (attributeMaskDiff != 0) {
+		// Take the highest set bit of the diff mask (an attribute whose enable state changed) and clear it from the mask
+		const u32 index = 31 - std::countl_zero<u32>(attributeMaskDiff);
+		const u32 mask = 1u << index;
+		attributeMaskDiff ^= mask;
+
+		if ((currentAttributeMask & mask) != 0) {
+			// Attribute was disabled and is now enabled
+			hwShaderVAO.enableAttribute(index);
+		} else {
+			// Attribute was enabled and is now disabled
+			hwShaderVAO.disableAttribute(index);
+		}
+	}
+
+	previousAttributeMask = currentAttributeMask;
+
+	// Upload the data for each (enabled) attribute loader into our vertex buffer
+	// The loaders are packed back to back, in order, inside the mapped region
+	for (int i = 0; i < accel->totalLoaderCount; i++) {
+		auto& loader = accel->loaders[i];
+
+		std::memcpy(vertexData, loader.data, loader.size);
+		vertexData += loader.size;
+	}
+
+	hwVertexBuffer->Unmap(accel->vertexDataSize);
+
+	// Iterate over the 16 PICA input registers and configure how they should be fetched.
+	for (int i = 0; i < 16; i++) {
+		const auto& attrib = accel->attributeInfo[i];
+		const u32 attributeMask = 1u << i;
+
+		if (accel->fixedAttributes & attributeMask) {
+			auto& attrValue = fixedAttrValues[i];
+			// This is a fixed attribute, so set its fixed value, but only if it actually needs to be updated
+			if (attrValue[0] != attrib.fixedValue[0] || attrValue[1] != attrib.fixedValue[1] || attrValue[2] != attrib.fixedValue[2] ||
+				attrValue[3] != attrib.fixedValue[3]) {
+				std::memcpy(attrValue.data(), attrib.fixedValue.data(), sizeof(attrib.fixedValue));
+				glVertexAttrib4f(i, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]);
+			}
+		} else if (accel->enabledAttributeMask & attributeMask) {
+			// Attribute offsets are relative to the start of this draw's data, so add the offset of the mapped region in the vertex buffer
+			glVertexAttribPointer(
+				i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride,
+				reinterpret_cast<GLvoid*>(vertexBufferOffset + attrib.offset)
+			);
+		}
+	}
+}
--- a/src/core/services/fonts.cpp
+++ b/src/core/services/fonts.cpp
@ -0,0 +1,109 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+// Adapted from https://github.com/PabloMK7/citra/blob/master/src/core/hle/service/apt/bcfnt/bcfnt.cpp
+
+#include "services/fonts.hpp"
+
+#include <cstring>
+
+namespace HLE::Fonts {
+	// Rewrites the offsets stored inside the shared font's FINF, CMAP, CWDH and TGLP sections so that they are based on newAddress.
+	// sharedFont: start of the shared font buffer (modified in place); the CFNT header is read from byte 0x80 of it.
+	// newAddress: the base that the stored offsets should be relative to after relocation.
+	void relocateSharedFont(u8* sharedFont, u32 newAddress) {
+		constexpr u32 cfntStart = 0x80;
+
+		CFNT fontHeader;
+		std::memcpy(&fontHeader, sharedFont + cfntStart, sizeof(fontHeader));
+
+		// Both passes below walk the sections starting right after the CFNT header
+		const u32 firstSectionOffset = cfntStart + fontHeader.headerSize;
+		const auto hasMagic = [](const SectionHeader& section, const char* magic) { return std::memcmp(section.magic, magic, 4) == 0; };
+
+		// Where the first section of each kind actually sits inside the buffer
+		u32 cmapLocation = 0;
+		u32 cwdhLocation = 0;
+		u32 tglpLocation = 0;
+		// Where the FINF section claims those sections are (pointing at their headers)
+		u32 expectedCmap = 0;
+		u32 expectedCwdh = 0;
+		u32 expectedTglp = 0;
+
+		// Pass 1: locate the sections, so the base the font is currently relocated to can be detected automatically
+		u32 cursor = firstSectionOffset;
+		for (uint i = 0; i < fontHeader.numBlocks; i++) {
+			const u8* sectionData = sharedFont + cursor;
+
+			SectionHeader section;
+			std::memcpy(&section, sectionData, sizeof(section));
+
+			if (hasMagic(section, "CMAP")) {
+				if (cmapLocation == 0) {
+					cmapLocation = cursor;
+				}
+			} else if (hasMagic(section, "CWDH")) {
+				if (cwdhLocation == 0) {
+					cwdhLocation = cursor;
+				}
+			} else if (hasMagic(section, "TGLP")) {
+				if (tglpLocation == 0) {
+					tglpLocation = cursor;
+				}
+			} else if (hasMagic(section, "FINF")) {
+				FINF finf;
+				std::memcpy(&finf, sectionData, sizeof(finf));
+
+				expectedCmap = finf.cmapOffset - sizeof(SectionHeader);
+				expectedCwdh = finf.cwdhOffset - sizeof(SectionHeader);
+				expectedTglp = finf.tglpOffset - sizeof(SectionHeader);
+			}
+
+			cursor += section.sectionSize;
+		}
+
+		// All three section kinds should agree on the base the font currently uses
+		const u32 previousBase = expectedCmap - cmapLocation;
+		const bool cwdhAgrees = (expectedCwdh - cwdhLocation) == previousBase;
+		const bool tglpAgrees = (expectedTglp - tglpLocation) == previousBase;
+		if (!cwdhAgrees || !tglpAgrees) {
+			Helpers::warn("You shouldn't be seeing this. Shared Font file offsets might be borked?");
+		}
+
+		const u32 relocationDelta = newAddress - previousBase;
+
+		// Pass 2: walk the sections again from the start and patch every stored offset
+		cursor = firstSectionOffset;
+		for (uint i = 0; i < fontHeader.numBlocks; i++) {
+			u8* sectionData = sharedFont + cursor;
+
+			SectionHeader section;
+			std::memcpy(&section, sectionData, sizeof(section));
+
+			if (hasMagic(section, "FINF")) {
+				FINF finf;
+				std::memcpy(&finf, sectionData, sizeof(finf));
+
+				finf.cmapOffset += relocationDelta;
+				finf.cwdhOffset += relocationDelta;
+				finf.tglpOffset += relocationDelta;
+
+				std::memcpy(sectionData, &finf, sizeof(finf));
+			} else if (hasMagic(section, "CMAP")) {
+				CMAP cmap;
+				std::memcpy(&cmap, sectionData, sizeof(cmap));
+
+				// A next-offset of 0 is left untouched
+				if (cmap.nextCmapOffset != 0) {
+					cmap.nextCmapOffset += relocationDelta;
+				}
+
+				std::memcpy(sectionData, &cmap, sizeof(cmap));
+			} else if (hasMagic(section, "CWDH")) {
+				CWDH cwdh;
+				std::memcpy(&cwdh, sectionData, sizeof(cwdh));
+
+				// A next-offset of 0 is left untouched
+				if (cwdh.nextCwdhOffset != 0) {
+					cwdh.nextCwdhOffset += relocationDelta;
+				}
+
+				std::memcpy(sectionData, &cwdh, sizeof(cwdh));
+			} else if (hasMagic(section, "TGLP")) {
+				TGLP tglp;
+				std::memcpy(&tglp, sectionData, sizeof(tglp));
+
+				tglp.sheetDataOffset += relocationDelta;
+				std::memcpy(sectionData, &tglp, sizeof(tglp));
+			}
+
+			// Advance using the size read before patching
+			cursor += section.sectionSize;
+		}
+	}
+}  // namespace HLE::Fonts
--- a/src/core/services/fonts/CitraSharedFontUSRelocated.bin
+++ b/src/core/services/fonts/CitraSharedFontUSRelocated.bin
--- a/src/jni_driver.cpp
+++ b/src/jni_driver.cpp
@ -8,6 +8,7 @@
 #include "renderer_gl/renderer_gl.hpp"
 #include "services/hid.hpp"
 #include "android_utils.hpp"
+#include "sdl_sensors.hpp"

 std::unique_ptr<Emulator> emulator = nullptr;
 HIDService* hidService = nullptr;
@ -43,6 +44,7 @@ extern "C" {
 AlberFunction(void, functionName) (JNIEnv* env, jobject obj, type value) { emulator->getConfig().settingName = value; }

 MAKE_SETTING(setShaderJitEnabled, jboolean, shaderJitEnabled)
+MAKE_SETTING(setAccurateShaderMulEnable, jboolean, accurateShaderMul)

 #undef MAKE_SETTING

@ -87,6 +89,7 @@ AlberFunction(void, Finalize)(JNIEnv* env, jobject obj) {
 	emulator = nullptr;
 	hidService = nullptr;
 	renderer = nullptr;
+	romLoaded = false;
 }

 AlberFunction(jboolean, HasRomLoaded)(JNIEnv* env, jobject obj) { return romLoaded; }
@ -110,6 +113,19 @@ AlberFunction(void, TouchScreenUp)(JNIEnv* env, jobject obj) { hidService->relea
 AlberFunction(void, KeyUp)(JNIEnv* env, jobject obj, jint keyCode) { hidService->releaseKey((u32)keyCode); }
 AlberFunction(void, KeyDown)(JNIEnv* env, jobject obj, jint keyCode) { hidService->pressKey((u32)keyCode); }

+// JNI entry point: converts a gyroscope reading with Sensors::SDL::convertRotation and forwards it to the HID service.
+// Note the mapping: the converted x component is used as pitch, y as roll and z as yaw.
+AlberFunction(void, SetGyro)(JNIEnv* env, jobject obj, jfloat roll, jfloat pitch, jfloat yaw) {
+	const auto converted = Sensors::SDL::convertRotation({static_cast<float>(roll), static_cast<float>(pitch), static_cast<float>(yaw)});
+
+	hidService->setPitch(static_cast<s16>(converted.x));
+	hidService->setRoll(static_cast<s16>(converted.y));
+	hidService->setYaw(static_cast<s16>(converted.z));
+}
+
+// JNI entry point: converts a raw accelerometer reading with Sensors::SDL::convertAcceleration and forwards it to the HID service.
+AlberFunction(void, SetAccel)(JNIEnv* env, jobject obj, jfloat rawX, jfloat rawY, jfloat rawZ) {
+	float rawReading[3] = {static_cast<float>(rawX), static_cast<float>(rawY), static_cast<float>(rawZ)};
+	auto converted = Sensors::SDL::convertAcceleration(rawReading);
+
+	hidService->setAccel(converted.x, converted.y, converted.z);
+}
+
 AlberFunction(void, SetCirclepadAxis)(JNIEnv* env, jobject obj, jint x, jint y) {
 	hidService->setCirclepadX((s16)x);
 	hidService->setCirclepadY((s16)y);
@ -139,4 +155,4 @@ int AndroidUtils::openDocument(const char* path, const char* perms) {
    env->DeleteLocalRef(jmode);

    return (int)result;
-}
+}
--- a/src/libretro_core.cpp
+++ b/src/libretro_core.cpp
@ -163,13 +163,14 @@ static int fetchVariableRange(std::string key, int min, int max) {

 static void configInit() {
 	static const retro_variable values[] = {
-		{"panda3ds_use_shader_jit", EmulatorConfig::shaderJitDefault ? "Enable shader JIT; enabled|disabled"
-																	  : "Enable shader JIT; disabled|enabled"},
+		{"panda3ds_use_shader_jit", EmulatorConfig::shaderJitDefault ? "Enable shader JIT; enabled|disabled" : "Enable shader JIT; disabled|enabled"},
+		{"panda3ds_accelerate_shaders",
+		 EmulatorConfig::accelerateShadersDefault ? "Run 3DS shaders on the GPU; enabled|disabled" : "Run 3DS shaders on the GPU; disabled|enabled"},
 		{"panda3ds_accurate_shader_mul", "Enable accurate shader multiplication; disabled|enabled"},
 		{"panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault ? "Use ubershaders (No stutter, maybe slower); enabled|disabled"
 																	  : "Use ubershaders (No stutter, maybe slower); disabled|enabled"},
 		{"panda3ds_use_vsync", "Enable VSync; enabled|disabled"},
-		{"panda3ds_dsp_emulation", "DSP emulation; Null|HLE|LLE"},
+		{"panda3ds_dsp_emulation", "DSP emulation; HLE|LLE|Null"},
 		{"panda3ds_use_audio", "Enable audio; disabled|enabled"},
 		{"panda3ds_use_virtual_sd", "Enable virtual SD card; enabled|disabled"},
 		{"panda3ds_write_protect_virtual_sd", "Write protect virtual SD card; disabled|enabled"},
@ -197,6 +198,8 @@ static void configUpdate() {
 	config.sdWriteProtected = fetchVariableBool("panda3ds_write_protect_virtual_sd", false);
 	config.accurateShaderMul = fetchVariableBool("panda3ds_accurate_shader_mul", false);
 	config.useUbershaders = fetchVariableBool("panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault);
+	config.accelerateShaders = fetchVariableBool("panda3ds_accelerate_shaders", EmulatorConfig::accelerateShadersDefault);
+
 	config.forceShadergenForLights = fetchVariableBool("panda3ds_ubershader_lighting_override", true);
 	config.lightShadergenThreshold = fetchVariableRange("panda3ds_ubershader_lighting_override_threshold", 1, 8);
 	config.discordRpcEnabled = false;
--- a/src/lua.cpp
+++ b/src/lua.cpp
@ -130,6 +130,32 @@ MAKE_MEMORY_FUNCTIONS(32)
 MAKE_MEMORY_FUNCTIONS(64)
 #undef MAKE_MEMORY_FUNCTIONS

+// Lua binding: reads 32 bits from the address in argument 1 and pushes them reinterpreted as a float. Returns 1 result.
+static int readFloatThunk(lua_State* L) {
+	const auto address = static_cast<u32>(lua_tonumber(L, 1));
+	const u32 rawBits = LuaManager::g_emulator->getMemory().read32(address);
+
+	lua_pushnumber(L, static_cast<lua_Number>(Helpers::bit_cast<float, u32>(rawBits)));
+	return 1;
+}
+
+// Lua binding: converts argument 2 to a float and writes its 32-bit pattern to the address in argument 1. Returns no results.
+static int writeFloatThunk(lua_State* L) {
+	const auto address = static_cast<u32>(lua_tonumber(L, 1));
+	const auto number = static_cast<float>(lua_tonumber(L, 2));
+
+	const u32 rawBits = Helpers::bit_cast<u32, float>(number);
+	LuaManager::g_emulator->getMemory().write32(address, rawBits);
+	return 0;
+}
+
+// Lua binding: reads 64 bits from the address in argument 1 and pushes them reinterpreted as a double. Returns 1 result.
+static int readDoubleThunk(lua_State* L) {
+	const auto address = static_cast<u32>(lua_tonumber(L, 1));
+	const u64 rawBits = LuaManager::g_emulator->getMemory().read64(address);
+
+	lua_pushnumber(L, static_cast<lua_Number>(Helpers::bit_cast<double, u64>(rawBits)));
+	return 1;
+}
+
+// Lua binding: converts argument 2 to a double and writes its 64-bit pattern to the address in argument 1. Returns no results.
+static int writeDoubleThunk(lua_State* L) {
+	const auto address = static_cast<u32>(lua_tonumber(L, 1));
+	const auto number = static_cast<double>(lua_tonumber(L, 2));
+
+	const u64 rawBits = Helpers::bit_cast<u64, double>(number);
+	LuaManager::g_emulator->getMemory().write64(address, rawBits);
+	return 0;
+}
+
 static int getAppIDThunk(lua_State* L) {
 	std::optional<u64> id = LuaManager::g_emulator->getMemory().getProgramID();
 	
@ -248,10 +274,14 @@ static constexpr luaL_Reg functions[] = {
 	{ "__read16", read16Thunk },
 	{ "__read32", read32Thunk },
 	{ "__read64", read64Thunk },
+	{ "__readFloat", readFloatThunk },
+	{ "__readDouble", readDoubleThunk },
 	{ "__write8", write8Thunk} ,
 	{ "__write16", write16Thunk },
 	{ "__write32", write32Thunk },
 	{ "__write64", write64Thunk },
+	{ "__writeFloat", writeFloatThunk },
+	{ "__writeDouble", writeDoubleThunk },
 	{ "__getAppID", getAppIDThunk },
 	{ "__pause", pauseThunk }, 
 	{ "__resume", resumeThunk },
@ -273,10 +303,15 @@ void LuaManager::initializeThunks() {
 		read16 = function(addr) return GLOBALS.__read16(addr) end,
 		read32 = function(addr) return GLOBALS.__read32(addr) end,
 		read64 = function(addr) return GLOBALS.__read64(addr) end,
+		readFloat = function(addr) return GLOBALS.__readFloat(addr) end,
+		readDouble = function(addr) return GLOBALS.__readDouble(addr) end,
+
 		write8 = function(addr, value) GLOBALS.__write8(addr, value) end,
 		write16 = function(addr, value) GLOBALS.__write16(addr, value) end,
 		write32 = function(addr, value) GLOBALS.__write32(addr, value) end,
 		write64 = function(addr, value) GLOBALS.__write64(addr, value) end,
+		writeFloat = function(addr, value) GLOBALS.__writeFloat(addr, value) end,
+		writeDouble = function(addr, value) GLOBALS.__writeDouble(addr, value) end,

 		getAppID = function()
 			local ffi = require("ffi")
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/AlberDriver.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/AlberDriver.java
@ -24,13 +24,16 @@ public class AlberDriver {
 	public static native void KeyUp(int code);
 	public static native void SetCirclepadAxis(int x, int y);
 	public static native void TouchScreenUp();
-	public static native void TouchScreenDown(int x, int y);
+	public static native void TouchScreenDown(int x, int y);;
+	public static native void SetGyro(float roll, float pitch, float yaw);
+	public static native void SetAccel(float x, float y, float z);
 	public static native void Pause();
 	public static native void Resume();
 	public static native void LoadLuaScript(String script);
 	public static native byte[] GetSmdh();

 	public static native void setShaderJitEnabled(boolean enable);
+	public static native void setAccurateShaderMulEnable(boolean enable);

 	public static int openDocument(String path, String mode) {
 		try {
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/GameActivity.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/GameActivity.java
@ -3,11 +3,22 @@ package com.panda3ds.pandroid.app;
 import android.app.ActivityManager;
 import android.app.PictureInPictureParams;
 import android.content.Intent;
+import android.content.res.Configuration;
+import android.hardware.Sensor;
+import android.hardware.SensorEvent;
+import android.hardware.SensorEventListener;
+import android.hardware.SensorManager;
+import android.opengl.Matrix;
 import android.os.Build;
 import android.os.Bundle;
+import android.renderscript.Matrix3f;
+import android.renderscript.Matrix4f;
+import android.util.Log;
 import android.util.Rational;
+import android.view.Display;
 import android.view.KeyEvent;
 import android.view.MotionEvent;
+import android.view.Surface;
 import android.view.View;
 import android.view.ViewGroup;
 import android.view.WindowManager;
@ -25,6 +36,7 @@ import com.panda3ds.pandroid.app.game.EmulatorCallback;
 import com.panda3ds.pandroid.data.config.GlobalConfig;
 import com.panda3ds.pandroid.input.InputHandler;
 import com.panda3ds.pandroid.input.InputMap;
+import com.panda3ds.pandroid.math.Vector3;
 import com.panda3ds.pandroid.utils.Constants;
 import com.panda3ds.pandroid.view.PandaGlSurfaceView;
 import com.panda3ds.pandroid.view.PandaLayoutController;
@ -32,7 +44,7 @@ import com.panda3ds.pandroid.view.ds.DsLayoutManager;
 import com.panda3ds.pandroid.view.renderer.ConsoleRenderer;
 import com.panda3ds.pandroid.view.utils.PerformanceView;

-public class GameActivity extends BaseActivity implements EmulatorCallback {
+public class GameActivity extends BaseActivity implements EmulatorCallback, SensorEventListener {
 	private final DrawerFragment drawerFragment = new DrawerFragment();
 	private final AlberInputListener inputListener = new AlberInputListener(this);
 	private ConsoleRenderer renderer;
@ -74,6 +86,19 @@ public class GameActivity extends BaseActivity implements EmulatorCallback {
 			((FrameLayout) findViewById(R.id.panda_gl_frame)).addView(view, new FrameLayout.LayoutParams(ViewGroup.LayoutParams.WRAP_CONTENT, ViewGroup.LayoutParams.WRAP_CONTENT));
 		}
 		swapScreens(GlobalConfig.get(GlobalConfig.KEY_CURRENT_DS_LAYOUT));
+		registerSensors();
+	}
+
+	/**
+	 * Registers this activity as a listener for the default accelerometer and gyroscope.
+	 * A sensor that the device does not provide (getDefaultSensor returns null) is skipped.
+	 */
+	private void registerSensors() {
+		SensorManager sensorManager = (SensorManager) getSystemService(SENSOR_SERVICE);
+		Sensor accel = sensorManager.getDefaultSensor(Sensor.TYPE_ACCELEROMETER);
+		if (accel != null) {
+			// SENSOR_DELAY_GAME has the value 1, the rate previously passed as a bare literal
+			sensorManager.registerListener(this, accel, SensorManager.SENSOR_DELAY_GAME);
+		}
+		Sensor gyro = sensorManager.getDefaultSensor(Sensor.TYPE_GYROSCOPE);
+		if (gyro != null) {
+			sensorManager.registerListener(this, gyro, SensorManager.SENSOR_DELAY_GAME);
+		}
 	}

 	private void changeOverlayVisibility(boolean visible) {
@ -85,7 +110,7 @@ public class GameActivity extends BaseActivity implements EmulatorCallback {
 	@Override
 	protected void onResume() {
 		super.onResume();
-                getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
+		getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
 		getWindow().getDecorView().setSystemUiVisibility(View.SYSTEM_UI_FLAG_FULLSCREEN | View.SYSTEM_UI_FLAG_HIDE_NAVIGATION);
 		getWindow().addFlags(WindowManager.LayoutParams.FLAG_FULLSCREEN);
 		InputHandler.reset();
@ -94,6 +119,7 @@ public class GameActivity extends BaseActivity implements EmulatorCallback {
 		if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O_MR1) {
 			getTheme().applyStyle(R.style.GameActivityNavigationBar, true);
 		}
+		registerSensors();
 	}

 	private void enablePIP() {
@ -113,6 +139,7 @@ public class GameActivity extends BaseActivity implements EmulatorCallback {
 	protected void onPause() {
 		super.onPause();

+		((SensorManager)getSystemService(SENSOR_SERVICE)).unregisterListener(this);
 		InputHandler.reset();
 		if (GlobalConfig.get(GlobalConfig.KEY_PICTURE_IN_PICTURE)) {
 			if (Build.VERSION.SDK_INT > Build.VERSION_CODES.O) {
@ -174,10 +201,48 @@ public class GameActivity extends BaseActivity implements EmulatorCallback {

 	@Override
 	protected void onDestroy() {
+		((SensorManager)getSystemService(SENSOR_SERVICE)).unregisterListener(this);
 		if (AlberDriver.HasRomLoaded()) {
 			AlberDriver.Finalize();
 		}

 		super.onDestroy();
 	}
+
+	/**
+	 * Reports the current display rotation as an angle in degrees.
+	 *
+	 * @return 90, 180 or -90 for Surface.ROTATION_90, ROTATION_180 and ROTATION_270 respectively;
+	 *         0 for any other rotation, or when the decor view or its display is unavailable.
+	 */
+	private float getDeviceRotationAngle() {
+		View decorView = getWindow().getDecorView();
+		if (decorView == null || decorView.getDisplay() == null) {
+			return 0.0f;
+		}
+
+		int rotation = decorView.getDisplay().getRotation();
+		if (rotation == Surface.ROTATION_90) {
+			return 90.0f;
+		} else if (rotation == Surface.ROTATION_180) {
+			return 180.0f;
+		} else if (rotation == Surface.ROTATION_270) {
+			return -90.0f;
+		}
+		return 0.0f;
+	}
+
+	/**
+	 * Forwards accelerometer and gyroscope readings to the emulator core.
+	 * Each reading is first rotated around Z by the display rotation angle (converted to radians).
+	 * Events are dropped while no ROM is loaded, and other sensor types are ignored.
+	 */
+	@Override
+	public void onSensorChanged(SensorEvent event) {
+		if (!AlberDriver.HasRomLoaded()) {
+			return;
+		}
+
+		int type = event.sensor.getType();
+		boolean isAccel = type == Sensor.TYPE_ACCELEROMETER;
+		boolean isGyro = type == Sensor.TYPE_GYROSCOPE;
+		if (!isAccel && !isGyro) {
+			return;
+		}
+
+		float[] values = event.values;
+		Vector3 reading = new Vector3(values[0], values[1], values[2]);
+		float radians = (float) (getDeviceRotationAngle() * (Math.PI / 180.0f));
+		reading.rotateByEuler(new Vector3(0, 0, radians));
+
+		if (isAccel) {
+			AlberDriver.SetAccel(reading.x, reading.y, reading.z);
+		} else {
+			AlberDriver.SetGyro(reading.x, reading.y, reading.z);
+		}
+	}
+
+	@Override
+	public void onAccuracyChanged(Sensor sensor, int accuracy) {}  // No-op: sensor accuracy changes are not acted upon
 }
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/base/BasePreferenceFragment.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/base/BasePreferenceFragment.java
@ -26,6 +26,10 @@ public abstract class BasePreferenceFragment extends PreferenceFragmentCompat {
 		((SwitchPreferenceCompat)findPreference(id)).setChecked(value);
 	}

+	protected void setSummaryValue(String id,String text) {  // Sets the summary text of the preference whose key is `id`
+		findPreference(id).setSummary(text);  // NOTE(review): the lookup result is dereferenced without a null check - confirm every id passed here exists in the preference XML
+	}
+
 	protected void setActivityTitle(@StringRes int titleId) {
 		ActionBar header = ((AppCompatActivity) requireActivity()).getSupportActionBar();
 		if (header != null) {
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/AdvancedPreferences.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/AdvancedPreferences.java
@ -22,6 +22,7 @@ public class AdvancedPreferences extends BasePreferenceFragment {

        setItemClick("performanceMonitor", pref -> GlobalConfig.set(GlobalConfig.KEY_SHOW_PERFORMANCE_OVERLAY, ((SwitchPreferenceCompat) pref).isChecked()));
        setItemClick("shaderJit", pref -> GlobalConfig.set(GlobalConfig.KEY_SHADER_JIT, ((SwitchPreferenceCompat) pref).isChecked()));
+        setItemClick("accurateShaderMul", pref -> GlobalConfig.set(GlobalConfig.KEY_ACCURATE_SHADER_MULTIPLY, ((SwitchPreferenceCompat) pref).isChecked()));
        setItemClick("loggerService", pref -> {
            boolean checked = ((SwitchPreferenceCompat) pref).isChecked();
            Context ctx = PandroidApplication.getAppContext();
@ -46,5 +47,6 @@ public class AdvancedPreferences extends BasePreferenceFragment {
        ((SwitchPreferenceCompat) findPreference("performanceMonitor")).setChecked(GlobalConfig.get(GlobalConfig.KEY_SHOW_PERFORMANCE_OVERLAY));
        ((SwitchPreferenceCompat) findPreference("loggerService")).setChecked(GlobalConfig.get(GlobalConfig.KEY_LOGGER_SERVICE));
        ((SwitchPreferenceCompat) findPreference("shaderJit")).setChecked(GlobalConfig.get(GlobalConfig.KEY_SHADER_JIT));
+        ((SwitchPreferenceCompat) findPreference("accurateShaderMul")).setChecked(GlobalConfig.get(GlobalConfig.KEY_ACCURATE_SHADER_MULTIPLY));
    }
 }
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/GeneralPreferences.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/GeneralPreferences.java
@ -1,7 +1,13 @@
 package com.panda3ds.pandroid.app.preferences;

+import android.net.Uri;
 import android.os.Bundle;
+import android.util.Log;
+import android.widget.Toast;

+import androidx.activity.result.ActivityResultCallback;
+import androidx.activity.result.ActivityResultLauncher;
+import androidx.activity.result.contract.ActivityResultContracts;
 import androidx.annotation.Nullable;
 import androidx.preference.SwitchPreferenceCompat;

@ -10,8 +16,11 @@ import com.panda3ds.pandroid.app.PreferenceActivity;
 import com.panda3ds.pandroid.app.base.BasePreferenceFragment;
 import com.panda3ds.pandroid.app.preferences.screen_editor.ScreenLayoutsPreference;
 import com.panda3ds.pandroid.data.config.GlobalConfig;
+import com.panda3ds.pandroid.utils.FileUtils;

-public class GeneralPreferences extends BasePreferenceFragment {
+public class GeneralPreferences extends BasePreferenceFragment implements ActivityResultCallback<Uri> {
+    private final ActivityResultContracts.OpenDocument openFolderContract = new ActivityResultContracts.OpenDocument();
+    private ActivityResultLauncher<String[]> pickFileRequest;
    @Override
    public void onCreatePreferences(@Nullable Bundle savedInstanceState, @Nullable String rootKey) {
        setPreferencesFromResource(R.xml.general_preference, rootKey);
@ -21,6 +30,11 @@ public class GeneralPreferences extends BasePreferenceFragment {
        setItemClick("behavior.pictureInPicture", (pref)-> GlobalConfig.set(GlobalConfig.KEY_PICTURE_IN_PICTURE, ((SwitchPreferenceCompat)pref).isChecked()));
        setActivityTitle(R.string.general);
        refresh();
+
+        setItemClick("games.aes_key", pref -> pickFileRequest.launch(new String[]{ "text/plain" }));
+        setItemClick("games.seed_db", pref -> pickFileRequest.launch(new String[]{ "application/octet-stream" }));
+
+        pickFileRequest = registerForActivityResult(openFolderContract, this);
    }

    @Override
@ -31,5 +45,45 @@ public class GeneralPreferences extends BasePreferenceFragment {

    private void refresh() {
        setSwitchValue("behavior.pictureInPicture", GlobalConfig.get(GlobalConfig.KEY_PICTURE_IN_PICTURE));
+        setSummaryValue("games.aes_key", String.format(getString(FileUtils.exists(FileUtils.getPrivatePath()+"/sysdata/aes_keys.txt") ? R.string.file_available : R.string.file_not_available), "aes_keys.txt"));
+        setSummaryValue("games.seed_db", String.format(getString(FileUtils.exists(FileUtils.getPrivatePath()+"/sysdata/seeddb.bin") ? R.string.file_available : R.string.file_not_available), "seeddb.bin"));
    }
+
+	/** Unregisters the file picker launcher, if one is registered, and clears the reference. */
+	@Override
+	public void onDestroy() {
+		super.onDestroy();
+		if (pickFileRequest == null) {
+			return;
+		}
+
+		pickFileRequest.unregister();
+		pickFileRequest = null;
+	}
+
+	@Override
+	public void onActivityResult(Uri result) {
+		if (result != null) {
+			String path = result.toString();
+			Log.w("File", path + " -> " + FileUtils.getName(path));
+			switch (String.valueOf(FileUtils.getName(path))) {
+				case "aes_keys.txt":
+				case "seeddb.bin": {
+					String name = FileUtils.getName(path);
+					if (FileUtils.getLength(path) < 1024 * 256) {
+						String sysdataFolder = FileUtils.getPrivatePath() + "/sysdata";
+						if (!FileUtils.exists(sysdataFolder)) {
+							FileUtils.createDir(FileUtils.getPrivatePath(), "sysdata");
+						}
+						if (FileUtils.exists(sysdataFolder + "/" + name)) {
+							FileUtils.delete(sysdataFolder + "/" + name);
+						}
+						FileUtils.copyFile(path, FileUtils.getPrivatePath() + "/sysdata/", name);
+						Toast.makeText(getActivity(), String.format(getString(R.string.file_imported), name), Toast.LENGTH_LONG).show();
+					} else {
+						Toast.makeText(getActivity(), R.string.invalid_file, Toast.LENGTH_LONG).show();
+					}
+				} break;
+				default: Toast.makeText(getActivity(), R.string.invalid_file, Toast.LENGTH_LONG).show(); break;
+			}
+			refresh();
+		}
+	}
 }
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/screen_editor/ScreenEditorPreference.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/preferences/screen_editor/ScreenEditorPreference.java
@ -23,7 +23,7 @@ public class ScreenEditorPreference extends Fragment {
    @Override
    public View onCreateView(@NonNull LayoutInflater inflater, @Nullable ViewGroup container, @Nullable Bundle savedInstanceState) {
        layout = new LinearLayout(container.getContext());
-        layout.setSystemUiVisibility(View.SYSTEM_UI_FLAG_HIDE_NAVIGATION|View.SYSTEM_UI_FLAG_FULLSCREEN|View.SYSTEM_UI_FLAG_IMMERSIVE);
+        layout.setSystemUiVisibility(View.SYSTEM_UI_FLAG_FULLSCREEN|View.SYSTEM_UI_FLAG_IMMERSIVE);
        return layout;
    }

--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/provider/AppDataDocumentProvider.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/app/provider/AppDataDocumentProvider.java
@ -95,7 +95,7 @@ public class AppDataDocumentProvider extends DocumentsProvider {
    private void includeFile(MatrixCursor cursor, File file) {
        int flags = 0;
        if (file.isDirectory()) {
-            flags = Document.FLAG_DIR_SUPPORTS_CREATE;
+            flags = Document.FLAG_DIR_SUPPORTS_CREATE | Document.FLAG_SUPPORTS_DELETE;
        } else {
            flags = Document.FLAG_SUPPORTS_WRITE | Document.FLAG_SUPPORTS_REMOVE | Document.FLAG_SUPPORTS_DELETE;
        }
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/data/config/GlobalConfig.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/data/config/GlobalConfig.java
@ -22,6 +22,7 @@ public class GlobalConfig {
    public static DataModel data;

    public static final Key<Boolean> KEY_SHADER_JIT = new Key<>("emu.shader_jit", true);
+    public static final Key<Boolean> KEY_ACCURATE_SHADER_MULTIPLY = new Key<>("emu.accurate_shader_mul", false);
    public static final Key<Boolean> KEY_PICTURE_IN_PICTURE = new Key<>("app.behavior.pictureInPicture", false);
    public static final Key<Boolean> KEY_SHOW_PERFORMANCE_OVERLAY = new Key<>("dev.performanceOverlay", false);
    public static final Key<Boolean> KEY_LOGGER_SERVICE = new Key<>("dev.loggerService", false);
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/math/Quaternion.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/math/Quaternion.java
@ -0,0 +1,31 @@
+package com.panda3ds.pandroid.math;
+
+/** Mutable quaternion with public float components x, y, z and w. */
+public class Quaternion {
+	public float x, y, z, w;
+
+	public Quaternion(float x, float y, float z, float w) {
+		this.x = x;
+		this.y = y;
+		this.z = z;
+		this.w = w;
+	}
+
+	/**
+	 * Overwrites this quaternion with the rotation described by the given per-axis angles.
+	 *
+	 * @param euler angles for the x, y and z axes; they are fed to Math.cos/Math.sin, so radians
+	 * @return this quaternion, to allow chaining
+	 */
+	public Quaternion fromEuler(Vector3 euler) {
+		// Every term below uses the half angle of one axis
+		double halfX = euler.x / 2.0;
+		double halfY = euler.y / 2.0;
+		double halfZ = euler.z / 2.0;
+
+		double cosX = Math.cos(halfX);
+		double cosY = Math.cos(halfY);
+		double cosZ = Math.cos(halfZ);
+
+		double sinX = Math.sin(halfX);
+		double sinY = Math.sin(halfY);
+		double sinZ = Math.sin(halfZ);
+
+		this.x = (float) (sinX * cosY * cosZ + cosX * sinY * sinZ);
+		this.y = (float) (cosX * sinY * cosZ - sinX * cosY * sinZ);
+		this.z = (float) (cosX * cosY * sinZ + sinX * sinY * cosZ);
+		this.w = (float) (cosX * cosY * cosZ - sinX * sinY * sinZ);
+		return this;
+	}
+}
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/math/Vector3.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/math/Vector3.java
@ -0,0 +1,32 @@
+package com.panda3ds.pandroid.math;
+
+/** Mutable three-component float vector that can be rotated in place. */
+public class Vector3 {
+	// Scratch quaternion reused by rotateByEuler so that rotating does not allocate
+	private final Quaternion quaternion = new Quaternion(0, 0, 0, 0);
+	public float x, y, z;
+
+	public Vector3(float x, float y, float z) {
+		this.x = x;
+		this.y = y;
+		this.z = z;
+	}
+
+	/**
+	 * Rotates this vector in place by the rotation described by {@code euler}.
+	 * The angles are converted with Quaternion.fromEuler and applied as q * v * conjugate(q).
+	 *
+	 * @param euler per-axis rotation angles, as expected by Quaternion.fromEuler
+	 * @return this vector, to allow chaining
+	 */
+	public Vector3 rotateByEuler(Vector3 euler) {
+		this.quaternion.fromEuler(euler);
+
+		float x = this.x, y = this.y, z = this.z;
+		float qx = this.quaternion.x;
+		float qy = this.quaternion.y;
+		float qz = this.quaternion.z;
+		float qw = this.quaternion.w;
+
+		// First half: q * v, treating v as a quaternion whose w component is 0
+		float ix = qw * x + qy * z - qz * y;
+		float iy = qw * y + qz * x - qx * z;
+		float iz = qw * z + qx * y - qy * x;
+		// The scalar part is -(q.xyz dot v); every axis contributes its own product
+		float iw = -qx * x - qy * y - qz * z;
+
+		// Second half: multiply the intermediate result by the conjugate (-qx, -qy, -qz, qw)
+		this.x = ix * qw + iw * -qx + iy * -qz - iz * -qy;
+		this.y = iy * qw + iw * -qy + iz * -qx - ix * -qz;
+		this.z = iz * qw + iw * -qz + ix * -qy - iy * -qx;
+		return this;
+	}
+}
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/utils/FileUtils.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/utils/FileUtils.java
@ -230,6 +230,10 @@ public class FileUtils {
        return parseFile(path).lastModified();
    }

+    public static long getLength(String path) {
+        return parseFile(path).length();
+    }
+
    public static String[] listFiles(String path) {
        DocumentFile folder = parseFile(path);
        DocumentFile[] files = folder.listFiles();
--- a/src/pandroid/app/src/main/java/com/panda3ds/pandroid/view/PandaGlRenderer.java
+++ b/src/pandroid/app/src/main/java/com/panda3ds/pandroid/view/PandaGlRenderer.java
@ -93,6 +93,7 @@ public class PandaGlRenderer implements GLSurfaceView.Renderer, ConsoleRenderer

 		AlberDriver.Initialize();
 		AlberDriver.setShaderJitEnabled(GlobalConfig.get(GlobalConfig.KEY_SHADER_JIT));
+		AlberDriver.setAccurateShaderMulEnable(GlobalConfig.get(GlobalConfig.KEY_ACCURATE_SHADER_MULTIPLY));

 		// If loading the ROM failed, display an error message and early exit
 		if (!AlberDriver.LoadRom(romPath)) {
--- a/src/pandroid/app/src/main/res/values-pt-rBR/strings.xml
+++ b/src/pandroid/app/src/main/res/values-pt-rBR/strings.xml
@ -90,4 +90,12 @@
    <string name="behavior">Comportamento</string>
    <string name="invalid_game">Jogo invalido</string>
    <string name="tools">Ferramentas</string>
+    <string name="pref_accurate_shader_title">Multiplicação precisa de shader</string>
+    <string name="pref_accurate_shader_summary">Usar cálculos mais precisos para shaders</string>
+    <string name="pref_game_crypto_keys">Importar chaves</string>
+    <string name="file_available">%s disponível</string>
+    <string name="file_not_available">%s não disponível</string>
+    <string name="pref_game_seed_db_keys">Importar SeedDB</string>
+    <string name="invalid_file">Arquivo inválido</string>
+    <string name="file_imported">%s Importado</string>
 </resources>
--- a/src/pandroid/app/src/main/res/values/strings.xml
+++ b/src/pandroid/app/src/main/res/values/strings.xml
@ -96,4 +96,12 @@
    <string name="region_taiwan">Taiwan</string>
    <string name="behavior">Behavior</string>
    <string name="invalid_game">Invalid game</string>
+    <string name="pref_accurate_shader_title">Accurate shader multiplication</string>
+    <string name="pref_accurate_shader_summary">Can improve rendering at a small performance loss</string>
+    <string name="pref_game_crypto_keys">Import keys</string>
+    <string name="file_imported">%s imported</string>
+    <string name="file_available">%s available</string>
+    <string name="file_not_available">%s not available</string>
+    <string name="pref_game_seed_db_keys">Import SeedDB</string>
+    <string name="invalid_file">Invalid file</string>
 </resources>
--- a/src/pandroid/app/src/main/res/xml/advanced_preferences.xml
+++ b/src/pandroid/app/src/main/res/xml/advanced_preferences.xml
@ -28,5 +28,11 @@
            app:summary="@string/pref_shader_jit_summary"
            app:iconSpaceReserved="false"/>

+        <SwitchPreferenceCompat
+            app:key="accurateShaderMul"
+            app:title="@string/pref_accurate_shader_title"
+            app:summary="@string/pref_accurate_shader_summary"
+            app:iconSpaceReserved="false"/>
+
    </PreferenceCategory>
 </PreferenceScreen>
--- a/src/pandroid/app/src/main/res/xml/general_preference.xml
+++ b/src/pandroid/app/src/main/res/xml/general_preference.xml
@ -23,6 +23,16 @@
            app:title="@string/pref_game_folders"
            app:summary="@string/pref_game_folders_summary"
            app:iconSpaceReserved="false"/>
+        <Preference
+            android:key="games.aes_key"
+            app:title="@string/pref_game_crypto_keys"
+            app:summary="@string/pref_game_crypto_keys"
+            app:iconSpaceReserved="false"/>
+        <Preference
+            android:key="games.seed_db"
+            app:title="@string/pref_game_seed_db_keys"
+            app:summary="@string/pref_game_crypto_keys"
+            app:iconSpaceReserved="false"/>
    </PreferenceCategory>
    <PreferenceCategory
        app:title="@string/behavior"
--- a/third_party/duckstation/gl/stream_buffer.cpp
+++ b/third_party/duckstation/gl/stream_buffer.cpp
@ -0,0 +1,288 @@
+// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#include "gl/stream_buffer.h"
+
+#include <array>
+#include <cstdio>
+
+#include "align.hpp"
+
+// Stores the target, buffer name and size; the GL buffer itself is created by the caller.
+OpenGLStreamBuffer::OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size)
+	: m_target(target), m_buffer_id(buffer_id), m_size(size) {}
+
+// Deletes the GL buffer object this instance wraps.
+OpenGLStreamBuffer::~OpenGLStreamBuffer() {
+	glDeleteBuffers(1, &m_buffer_id);
+}
+
+// Binds the wrapped buffer to its target.
+void OpenGLStreamBuffer::Bind() {
+	glBindBuffer(m_target, m_buffer_id);
+}
+
+// Binds buffer 0 to the target.
+void OpenGLStreamBuffer::Unbind() {
+	glBindBuffer(m_target, 0);
+}
+
+// Attaches a debug label to the GL buffer. The body is compiled out unless GPU_DEBUG_INFO is defined.
+void OpenGLStreamBuffer::SetDebugName(std::string_view name) {
+#ifdef GPU_DEBUG_INFO
+	// Nothing to do when the glObjectLabel entry point is null
+	if (!glObjectLabel) {
+		return;
+	}
+
+	const GLsizei label_length = static_cast<GLsizei>(name.length());
+	const GLchar* label = static_cast<const GLchar*>(name.data());
+	glObjectLabel(GL_BUFFER, GetGLBufferId(), label_length, label);
+#endif
+}
+
+namespace {
+	// Stages writes in a CPU-side buffer and uploads them with glBufferSubData() on Unmap().
+	// Preferred for drivers which don't support {ARB,EXT}_buffer_storage.
+	class BufferSubDataStreamBuffer final : public OpenGLStreamBuffer {
+	  public:
+		~BufferSubDataStreamBuffer() override {
+			Common::alignedFree(m_cpu_buffer);
+		}
+
+		// Always hands out the start of the staging buffer; min_size is not consulted.
+		MappingResult Map(u32 alignment, u32 min_size) override {
+			const u32 space_aligned = m_size / alignment;
+			return MappingResult{static_cast<void*>(m_cpu_buffer), 0, 0, space_aligned};
+		}
+
+		// Uploads the first used_size bytes of the staging buffer to offset 0. Always returns 0.
+		u32 Unmap(u32 used_size) override {
+			if (used_size != 0) {
+				glBindBuffer(m_target, m_buffer_id);
+				glBufferSubData(m_target, 0, used_size, m_cpu_buffer);
+			}
+
+			return 0;
+		}
+
+		u32 GetChunkSize() const override {
+			return m_size;
+		}
+
+		// Creates the GL buffer and its wrapper; returns an empty pointer if GL reported an error.
+		static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size) {
+			// Read back any pending error first so the check below reflects the calls made here
+			glGetError();
+
+			GLuint buffer_id;
+			glGenBuffers(1, &buffer_id);
+			glBindBuffer(target, buffer_id);
+			glBufferData(target, size, nullptr, GL_STREAM_DRAW);
+
+			if (glGetError() == GL_NO_ERROR) {
+				return std::unique_ptr<OpenGLStreamBuffer>(new BufferSubDataStreamBuffer(target, buffer_id, size));
+			}
+
+			glBindBuffer(target, 0);
+			glDeleteBuffers(1, &buffer_id);
+			return {};
+		}
+
+	  private:
+		BufferSubDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size)
+			: OpenGLStreamBuffer(target, buffer_id, size), m_cpu_buffer(static_cast<u8*>(Common::alignedMalloc(size, 32))) {
+			if (m_cpu_buffer == nullptr) {
+				Panic("Failed to allocate CPU storage for GL buffer");
+			}
+		}
+
+		u8* m_cpu_buffer;
+	};
+
+	// Stages writes in a CPU-side buffer and re-specifies the GL buffer with glBufferData() on every Unmap(),
+	// orphaning the previous storage. Used on Mali where BufferSubData forces a sync.
+	class BufferDataStreamBuffer final : public OpenGLStreamBuffer {
+	  public:
+		~BufferDataStreamBuffer() override {
+			Common::alignedFree(m_cpu_buffer);
+		}
+
+		// Always hands out the start of the staging buffer; min_size is not consulted.
+		MappingResult Map(u32 alignment, u32 min_size) override {
+			const u32 space_aligned = m_size / alignment;
+			return MappingResult{static_cast<void*>(m_cpu_buffer), 0, 0, space_aligned};
+		}
+
+		// Replaces the buffer contents with the first used_size bytes of the staging buffer. Always returns 0.
+		u32 Unmap(u32 used_size) override {
+			if (used_size != 0) {
+				glBindBuffer(m_target, m_buffer_id);
+				glBufferData(m_target, used_size, m_cpu_buffer, GL_STREAM_DRAW);
+			}
+
+			return 0;
+		}
+
+		u32 GetChunkSize() const override {
+			return m_size;
+		}
+
+		// Creates the GL buffer and its wrapper; returns an empty pointer if GL reported an error.
+		static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size) {
+			// Read back any pending error first so the check below reflects the calls made here
+			glGetError();
+
+			GLuint buffer_id;
+			glGenBuffers(1, &buffer_id);
+			glBindBuffer(target, buffer_id);
+			glBufferData(target, size, nullptr, GL_STREAM_DRAW);
+
+			if (glGetError() == GL_NO_ERROR) {
+				return std::unique_ptr<OpenGLStreamBuffer>(new BufferDataStreamBuffer(target, buffer_id, size));
+			}
+
+			glBindBuffer(target, 0);
+			glDeleteBuffers(1, &buffer_id);
+			return {};
+		}
+
+	  private:
+		BufferDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size)
+			: OpenGLStreamBuffer(target, buffer_id, size), m_cpu_buffer(static_cast<u8*>(Common::alignedMalloc(size, 32))) {
+			if (m_cpu_buffer == nullptr) {
+				Panic("Failed to allocate CPU storage for GL buffer");
+			}
+		}
+
+		u8* m_cpu_buffer;
+	};
+
+	// Base class for implementations which require syncing. Tracks one GL fence per block of the buffer.
+	class SyncingStreamBuffer : public OpenGLStreamBuffer {
+	  public:
+		enum : u32 { NUM_SYNC_POINTS = 16 };  // number of fence slots; the buffer is divided into this many blocks
+
+		virtual ~SyncingStreamBuffer() override {
+			for (u32 i = m_available_block_index; i <= m_used_block_index; i++) {  // NOTE(review): with the initial indices (available = NUM_SYNC_POINTS, used = 0) this loop never runs - confirm no fences are left undeleted
+				glDeleteSync(m_sync_objects[i]);
+			}
+		}
+
+	  protected:
+		SyncingStreamBuffer(GLenum target, GLuint buffer_id, u32 size)
+			: OpenGLStreamBuffer(target, buffer_id, size), m_bytes_per_block((size + (NUM_SYNC_POINTS)-1) / NUM_SYNC_POINTS) {}  // block size is size / NUM_SYNC_POINTS, rounded up
+
+		ALWAYS_INLINE u32 GetSyncIndexForOffset(u32 offset) { return offset / m_bytes_per_block; }  // index of the block containing the byte offset
+
+		ALWAYS_INLINE void AddSyncsForOffset(u32 offset) {  // inserts a fence for every block from m_used_block_index up to (not including) the block holding offset
+			const u32 end = GetSyncIndexForOffset(offset);
+			for (; m_used_block_index < end; m_used_block_index++) {
+				if (m_sync_objects[m_used_block_index]) {
+					Helpers::warn("GL stream buffer: Fence slot we're trying to insert is already in use");
+				}
+
+				m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+			}
+		}
+
+		ALWAYS_INLINE void WaitForSync(GLsync& sync) {  // waits on the fence with no timeout, then deletes it and clears the slot
+			glClientWaitSync(sync, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
+			glDeleteSync(sync);
+			sync = nullptr;
+		}
+
+		ALWAYS_INLINE void EnsureSyncsWaitedForOffset(u32 offset) {  // waits on every block from m_available_block_index through the block holding offset
+			const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);  // clamp so the index never runs past the last slot
+			for (; m_available_block_index < end; m_available_block_index++) {
+				if (!m_sync_objects[m_available_block_index]) [[unlikely]] {
+					Helpers::warn("GL stream buffer: Fence slot we're trying to wait on is not in use");
+				}
+
+				WaitForSync(m_sync_objects[m_available_block_index]);
+			}
+		}
+
+		void AllocateSpace(u32 size) {  // makes size bytes at m_position safe to write, rewinding to offset 0 when they do not fit before the end
+			// add sync objects for writes since the last allocation
+			AddSyncsForOffset(m_position);
+
+			// wait for sync objects for the space we want to use
+			EnsureSyncsWaitedForOffset(m_position + size);
+
+			// wrap-around?
+			if ((m_position + size) > m_size) {
+				// current position ... buffer end
+				AddSyncsForOffset(m_size);
+
+				// rewind, and try again
+				m_position = 0;
+
+				// wait for the sync at the start of the buffer
+				WaitForSync(m_sync_objects[0]);
+				m_available_block_index = 1;
+
+				// and however much more we need to satisfy the allocation
+				EnsureSyncsWaitedForOffset(size);
+				m_used_block_index = 0;
+			}
+		}
+
+		u32 GetChunkSize() const override { return m_size / NUM_SYNC_POINTS; }
+
+		u32 m_position = 0;  // current write offset into the buffer, in bytes
+		u32 m_used_block_index = 0;  // next block index AddSyncsForOffset will place a fence in
+		u32 m_available_block_index = NUM_SYNC_POINTS;  // next block index EnsureSyncsWaitedForOffset will wait on; starts past the last slot
+		u32 m_bytes_per_block;  // bytes covered by one fence slot
+		std::array<GLsync, NUM_SYNC_POINTS> m_sync_objects{};  // one fence handle per block, value-initialised to null
+	};
+
+	// Stream buffer backed by immutable storage (glBufferStorage) that stays persistently mapped.
+	// Writes go straight into the mapped pointer; reuse of a region is guarded by the fences in SyncingStreamBuffer.
+	class BufferStorageStreamBuffer : public SyncingStreamBuffer {
+	  public:
+		~BufferStorageStreamBuffer() override {
+			// Unmap the persistent mapping; the buffer has to be bound to its target for glUnmapBuffer
+			glBindBuffer(m_target, m_buffer_id);
+			glUnmapBuffer(m_target);
+			glBindBuffer(m_target, 0);
+		}
+
+		// Reserves at least min_size bytes at the (alignment-rounded) current position and returns
+		// the write pointer, the byte offset, and the offset/remaining space in units of alignment.
+		MappingResult Map(u32 alignment, u32 min_size) override {
+			if (m_position > 0) m_position = Common::alignUp(m_position, alignment);
+
+			AllocateSpace(min_size);
+			// The request must fit inside the blocks whose fences have already been waited on
+			if ((m_position + min_size) > (m_available_block_index * m_bytes_per_block)) [[unlikely]] {
+				Helpers::panic("GL stream buffer: Requested size passed to Map does not fit in the available space");
+			}
+
+			const u32 free_space_in_block = ((m_available_block_index * m_bytes_per_block) - m_position);
+			return MappingResult{static_cast<void*>(m_mapped_ptr + m_position), m_position, m_position / alignment, free_space_in_block / alignment};
+		}
+
+		// Commits used_size bytes written since Map() and advances the write position.
+		// Returns the position the written data starts at.
+		u32 Unmap(u32 used_size) override {
+			if ((m_position + used_size) > m_size) [[unlikely]] {
+				Helpers::panic("GL stream buffer: Invalid size passed to Unmap");
+			}
+
+			// A non-coherent mapping was created with GL_MAP_FLUSH_EXPLICIT_BIT, so flush the written range
+			if (!m_coherent) {
+				if (GLAD_GL_VERSION_4_5 || GLAD_GL_ARB_direct_state_access) {
+					glFlushMappedNamedBufferRange(m_buffer_id, m_position, used_size);
+				} else {
+					Bind();
+					glFlushMappedBufferRange(m_target, m_position, used_size);
+				}
+			}
+
+			const u32 prev_position = m_position;
+			m_position += used_size;
+			return prev_position;
+		}
+
+		// Creates the storage and maps it persistently. Returns an empty pointer if GL reported an
+		// error while allocating the storage.
+		static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size, bool coherent = true) {
+			// Read back any pending error first so the check below reflects the calls made here
+			glGetError();
+
+			GLuint buffer_id;
+			glGenBuffers(1, &buffer_id);
+			glBindBuffer(target, buffer_id);
+
+			const u32 flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
+			const u32 map_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT);
+			if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage)
+				glBufferStorage(target, size, nullptr, flags);
+			else if (GLAD_GL_EXT_buffer_storage)
+				glBufferStorageEXT(target, size, nullptr, flags);
+
+			GLenum err = glGetError();
+			if (err != GL_NO_ERROR) {
+				glBindBuffer(target, 0);
+				glDeleteBuffers(1, &buffer_id);
+				return {};
+			}
+
+			u8* mapped_ptr = static_cast<u8*>(glMapBufferRange(target, 0, size, map_flags));
+			AssertMsg(mapped_ptr, "Persistent buffer was mapped");
+
+			return std::unique_ptr<OpenGLStreamBuffer>(new BufferStorageStreamBuffer(target, buffer_id, size, mapped_ptr, coherent));
+		}
+
+	  private:
+		BufferStorageStreamBuffer(GLenum target, GLuint buffer_id, u32 size, u8* mapped_ptr, bool coherent)
+			: SyncingStreamBuffer(target, buffer_id, size), m_mapped_ptr(mapped_ptr), m_coherent(coherent) {}
+
+		u8* m_mapped_ptr;  // base address of the persistent mapping
+		bool m_coherent;   // false when the mapping needs explicit flushes in Unmap()
+	};
+
+}  // namespace
+
+// Picks a stream buffer implementation: persistent-mapped buffer storage when the GL version or
+// extensions offer it and creation succeeds, otherwise the glBufferData()-based fallback.
+std::unique_ptr<OpenGLStreamBuffer> OpenGLStreamBuffer::Create(GLenum target, u32 size) {
+	const bool has_buffer_storage = GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage;
+	if (has_buffer_storage) {
+		std::unique_ptr<OpenGLStreamBuffer> storage_buffer = BufferStorageStreamBuffer::Create(target, size);
+		if (storage_buffer) {
+			return storage_buffer;
+		}
+	}
+
+	// BufferSubData is slower on all drivers except NVIDIA...
+#if 0
+	const char* vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
+	if (std::strcmp(vendor, "ARM") == 0 || std::strcmp(vendor, "Qualcomm") == 0) {
+		// Mali and Adreno drivers can't do sub-buffer tracking...
+		return BufferDataStreamBuffer::Create(target, size);
+	}
+
+	return BufferSubDataStreamBuffer::Create(target, size);
+#else
+	return BufferDataStreamBuffer::Create(target, size);
+#endif
+}
--- a/third_party/duckstation/gl/stream_buffer.h
+++ b/third_party/duckstation/gl/stream_buffer.h
@ -0,0 +1,53 @@
+// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#pragma once
+
+#include <glad/gl.h>
+// Comment to avoid clang-format reordering the glad header
+
+#include <memory>
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+#include "duckstation_compat.h"
+#include "helpers.hpp"
+
+class OpenGLStreamBuffer {  // Abstract wrapper around one GL buffer object that is written through Map()/Unmap()
+  public:
+	virtual ~OpenGLStreamBuffer();
+
+	ALWAYS_INLINE GLuint GetGLBufferId() const { return m_buffer_id; }
+	ALWAYS_INLINE GLenum GetGLTarget() const { return m_target; }
+	ALWAYS_INLINE u32 GetSize() const { return m_size; }
+
+	void Bind();  // binds the buffer to its target
+	void Unbind();  // binds buffer 0 to the target
+
+	void SetDebugName(std::string_view name);  // labels the buffer via glObjectLabel when GPU_DEBUG_INFO is defined
+
+	struct MappingResult {
+		void* pointer;  // where the caller writes its data
+		u32 buffer_offset;  // byte offset of pointer within the buffer
+		u32 index_aligned;  // offset / alignment, suitable for base vertex
+		u32 space_aligned;  // remaining space / alignment
+	};
+
+	virtual MappingResult Map(u32 alignment, u32 min_size) = 0;  // returns a region to write into; pair each call with Unmap()
+
+	/// Returns the position in the buffer *before* the start of used_size.
+	virtual u32 Unmap(u32 used_size) = 0;
+
+	/// Returns the minimum granularity of blocks which sync objects will be created around.
+	virtual u32 GetChunkSize() const = 0;
+
+	static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size);  // factory that selects a concrete implementation
+
+  protected:
+	OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size);
+
+	GLenum m_target;  // GL binding target used for every bind
+	GLuint m_buffer_id;  // GL buffer name; deleted in the destructor
+	u32 m_size;  // size passed at construction
+};
--- a/third_party/fmt
+++ b/third_party/fmt
@ -0,0 +1 @@
+Subproject commit f8581bcecf317e8753887b68187c9ef1ba0524f4
				`@ -0,0 +1 @@`
				`Subproject commit f8581bcecf317e8753887b68187c9ef1ba0524f4`