Moar shader decompiler (#559)

* Renderer: Add prepareForDraw callback * Add fmt submodule and port shader decompiler instructions to it * Add shader acceleration setting * Hook up vertex shaders to shader cache * Shader decompiler: Fix redundant compilations * Shader Decompiler: Fix vertex attribute upload * Shader compiler: Simplify generated code for reading and faster compilation * Further simplify shader decompiler output * Shader decompiler: More smallen-ing * Shader decompiler: Get PICA uniforms uploaded to the GPU * Shader decompiler: Readd clipping * Shader decompiler: Actually `break` on control flow instructions * Shader decompiler: More control flow handling * Shader decompiler: Fix destination mask * Shader Decomp: Remove pair member capture in lambda (unsupported on NDK) * Disgusting changes to handle the fact that hw shader shaders are 2x as big * Shader decompiler: Implement proper output semantic mapping * Moar instructions * Shader decompiler: Add FLR/SLT/SLTI/SGE/SGEI * Shader decompiler: Add register indexing * Shader decompiler: Optimize mova with both x and y masked * Shader decompiler: Add DPH/DPHI * Fix shader caching being broken * PICA decompiler: Cache VS uniforms * Simply vertex cache code * Simplify vertex cache code * Shader decompiler: Add loops * Shader decompiler: Implement safe multiplication * Shader decompiler: Implement LG2/EX2 * Shader decompiler: More control flow * Shader decompiler: Fix JMPU condition * Shader decompiler: Convert main function to void * PICA: Start implementing GPU vertex fetch * More hw VAO work * More hw VAO work * More GPU vertex fetch code * Add GL Stream Buffer from Duckstation * GL: Actually upload data to stream buffers * GPU: Cleanup immediate mode handling * Get first renders working with accelerated draws * Shader decompiler: Fix control flow analysis bugs * HW shaders: Accelerate indexed draws * Shader decompiler: Add support for compilation errors * GLSL decompiler: Fall back for LITP * Add Renderdoc scope classes * Fix 
control flow analysis bug * HW shaders: Fix attribute fetch * Rewriting hw vertex fetch * Stream buffer: Fix copy-paste mistake * HW shaders: Fix indexed rendering * HW shaders: Add padding attributes * HW shaders: Avoid redundant glVertexAttrib4f calls * HW shaders: Fix loops * HW shaders: Make generated shaders slightly smaller * Fix libretro build * HW shaders: Fix android * Remove redundant ubershader checks * Set accelerate shader default to true * Shader decompiler: Don't declare VS input attributes as an array * Change ubuntu-latest to Ubuntu 24.04 because Microsoft screwed up their CI again * fix merge conflict bug
2025-06-07 19:41:38 +12:00 · 2024-10-19 16:53:51 +03:00 · 2024-10-19 16:53:51 +03:00 · 49a94a13c5
commit 49a94a13c5
parent afaf18f124
34 changed files with 1877 additions and 253 deletions
--- a/src/config.cpp
+++ b/src/config.cpp
@ -67,6 +67,7 @@ void EmulatorConfig::load() {
 			vsyncEnabled = toml::find_or<toml::boolean>(gpu, "EnableVSync", true);
 			useUbershaders = toml::find_or<toml::boolean>(gpu, "UseUbershaders", ubershaderDefault);
 			accurateShaderMul = toml::find_or<toml::boolean>(gpu, "AccurateShaderMultiplication", false);
+			accelerateShaders = toml::find_or<toml::boolean>(gpu, "AccelerateShaders", accelerateShadersDefault);

 			forceShadergenForLights = toml::find_or<toml::boolean>(gpu, "ForceShadergenForLighting", true);
 			lightShadergenThreshold = toml::find_or<toml::integer>(gpu, "ShadergenLightThreshold", 1);
@ -141,6 +142,7 @@ void EmulatorConfig::save() {
 	data["GPU"]["UseUbershaders"] = useUbershaders;
 	data["GPU"]["ForceShadergenForLighting"] = forceShadergenForLights;
 	data["GPU"]["ShadergenLightThreshold"] = lightShadergenThreshold;
+	data["GPU"]["AccelerateShaders"] = accelerateShaders;
 	data["GPU"]["EnableRenderdoc"] = enableRenderdoc;

 	data["Audio"]["DSPEmulation"] = std::string(Audio::DSPCore::typeToString(dspType));
--- a/src/core/PICA/draw_acceleration.cpp
+++ b/src/core/PICA/draw_acceleration.cpp
@ -0,0 +1,148 @@
+#include "PICA/draw_acceleration.hpp"
+
+#include <bit>
+#include <limits>
+
+#include "PICA/gpu.hpp"
+#include "PICA/regs.hpp"
+
+void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {  // Fills `accel` with the data gathered here for an accelerated draw: index range, attribute loaders, per-attribute layout and fixed attributes
+	accel.indexed = indexed;
+	accel.totalAttribCount = totalAttribCount;
+	accel.enabledAttributeMask = 0;
+	// Base address of the vertex data and the number of vertices to draw, both read from the PICA registers
+	const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16;
+	const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];  // Total # of vertices to transfer
+
+	if (indexed) {
+		u32 indexBufferConfig = regs[PICA::InternalRegs::IndexBufferConfig];
+		u32 indexBufferPointer = vertexBase + (indexBufferConfig & 0xfffffff);  // The index buffer address is an offset from the vertex base
+
+		u8* indexBuffer = getPointerPhys<u8>(indexBufferPointer);
+		u16 minimumIndex = std::numeric_limits<u16>::max();  // Start at the extremes so the first index read replaces both bounds
+		u16 maximumIndex = 0;
+
+		// Check whether the index buffer uses u16 indices or u8
+		accel.useShortIndices = Helpers::getBit<31>(indexBufferConfig);  // Indicates whether vert indices are 16-bit or 8-bit
+
+		// Scan the index buffer for the smallest and largest index it references, so that only vertices in [minimumIndex, maximumIndex] are uploaded
+		if (accel.useShortIndices) {
+			u16* indexBuffer16 = reinterpret_cast<u16*>(indexBuffer);  // NOTE(review): assumes the index buffer pointer is suitably aligned for u16 reads — confirm
+
+			for (int i = 0; i < vertexCount; i++) {
+				u16 index = indexBuffer16[i];
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+		} else {
+			for (int i = 0; i < vertexCount; i++) {
+				u16 index = u16(indexBuffer[i]);  // Widen 8-bit indices so both paths share the same u16 bounds
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+		}
+
+		accel.indexBuffer = indexBuffer;
+		accel.minimumIndex = minimumIndex;
+		accel.maximumIndex = maximumIndex;
+	} else {
+		accel.indexBuffer = nullptr;  // Non-indexed draw: the range is vertexCount consecutive vertices starting at the vertex offset register
+		accel.minimumIndex = regs[PICA::InternalRegs::VertexOffsetReg];
+		accel.maximumIndex = accel.minimumIndex + vertexCount - 1;  // NOTE(review): this would wrap around if vertexCount were 0 — confirm callers never issue empty draws
+	}
+
+	const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32);  // 64-bit attribute format word, read below as one 4-bit field per attribute
+	const u64 inputAttrCfg = getVertexShaderInputConfig();  // Read below as one 4-bit field per attribute, giving the shader input register it maps to
+
+	u32 attrCount = 0;  // NOTE(review): never read or written again in this function
+	u32 loaderOffset = 0;  // Running byte offset of the current loader's data, added to every attribute offset of that loader
+	accel.vertexDataSize = 0;
+	accel.totalLoaderCount = 0;
+
+	for (int i = 0; i < PICA::DrawAcceleration::maxLoaderCount; i++) {
+		auto& loaderData = attributeInfo[i];  // Get information for this attribute loader
+
+		// This loader is empty, skip it
+		if (loaderData.componentCount == 0 || loaderData.size == 0) {
+			continue;
+		}
+
+		auto& loader = accel.loaders[accel.totalLoaderCount++];  // Only non-empty loaders are appended, so accel.loaders is densely packed
+
+		// The size of the loader in bytes is equal to the bytes supplied for 1 vertex, multiplied by the number of vertices we'll be uploading
+		// Which is equal to maximumIndex - minimumIndex + 1
+		const u32 bytes = loaderData.size * (accel.maximumIndex - accel.minimumIndex + 1);
+		loader.size = bytes;
+
+		// Add it to the total vertex data size, aligned to 4 bytes.
+		accel.vertexDataSize += (bytes + 3) & ~3;
+		// Get a pointer to where this loader's data starts for the first vertex we upload (minimumIndex)
+		// Get a pointer to the data where this loader's data is stored
+		const u32 loaderAddress = vertexBase + loaderData.offset + (accel.minimumIndex * loaderData.size);
+		loader.data = getPointerPhys<u8>(loaderAddress);
+
+		u64 attrCfg = loaderData.getConfigFull();  // Get config1 | (config2 << 32)
+		u32 attributeOffset = 0;  // Byte offset of the current component within one vertex of this loader
+
+		for (int component = 0; component < loaderData.componentCount; component++) {
+			uint attributeIndex = (attrCfg >> (component * 4)) & 0xf;  // Get index of attribute in vertexCfg
+
+			// Vertex attributes used as padding
+			// 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively
+			if (attributeIndex >= 12) [[unlikely]] {
+				// Align attribute address up to a 4 byte boundary
+				attributeOffset = (attributeOffset + 3) & -4;
+				attributeOffset += (attributeIndex - 11) << 2;  // (index - 11) * 4 bytes of padding
+				continue;
+			}
+
+			const u32 attribInfo = (vertexCfg >> (attributeIndex * 4)) & 0xf;
+			const u32 attribType = attribInfo & 0x3;  //  Type of attribute (sbyte/ubyte/short/float)
+			const u32 size = (attribInfo >> 2) + 1;   // Total number of components
+
+			// Size of each component based on the attribute type
+			static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
+			const u32 inputReg = (inputAttrCfg >> (attributeIndex * 4)) & 0xf;  // Shader input register this attribute is mapped to
+			// Mark the attribute as enabled
+			accel.enabledAttributeMask |= 1 << inputReg;
+
+			auto& attr = accel.attributeInfo[inputReg];  // Attribute info is stored per input register, not per attribute index
+			attr.componentCount = size;
+			attr.offset = attributeOffset + loaderOffset;
+			attr.stride = loaderData.size;
+			attr.type = attribType;
+			attributeOffset += size * sizePerComponent[attribType];
+		}
+
+		loaderOffset += loader.size;  // NOTE(review): advances by the unaligned size while vertexDataSize above accumulates 4-byte-aligned sizes — confirm the renderer lays loaders out to match
+	}
+
+	u32 fixedAttributes = fixedAttribMask;
+	accel.fixedAttributes = 0;
+
+	// Fetch values for all fixed attributes, using count-trailing-zeros on the fixed attribute mask to find the attributes that are actually fixed
+	while (fixedAttributes != 0) {
+		// Get index of next fixed attribute and turn it off
+		const u32 index = std::countr_zero<u32>(fixedAttributes);  // Index of the lowest set bit
+		const u32 mask = 1u << index;
+		fixedAttributes ^= mask;
+
+		// PICA register this fixed attribute is meant to go to
+		const u32 inputReg = (inputAttrCfg >> (index * 4)) & 0xf;
+		const u32 inputRegMask = 1u << inputReg;
+
+		// If this input reg is already used for a non-fixed attribute then it will not be replaced by a fixed attribute
+		if ((accel.enabledAttributeMask & inputRegMask) == 0) {
+			vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[index];
+			auto& attr = accel.attributeInfo[inputReg];
+
+			accel.fixedAttributes |= inputRegMask;  // fixedAttributes is a mask of input registers, not of attribute indices
+
+			for (int i = 0; i < 4; i++) {
+				attr.fixedValue[i] = fixedAttr[i].toFloat32();  // Store each lane of the fixed attribute as a 32-bit float
+			}
+		}
+	}
+
+	accel.canBeAccelerated = true;  // Set unconditionally: nothing in this function rejects a draw
+}
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@ -117,37 +117,62 @@ void GPU::reset() {
 	externalRegs[Framebuffer1Config] = static_cast<u32>(PICA::ColorFmt::RGB8);
 	externalRegs[Framebuffer1Select] = 0;

-	renderer->setUbershaderSetting(config.useUbershaders);
 	renderer->reset();
 }

-// Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
-// And whether we are going to use the shader JIT (second template parameter)
-void GPU::drawArrays(bool indexed) {
-	const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
-
-	if (indexed) {
-		if (shaderJITEnabled)
-			drawArrays<true, true>();
-		else
-			drawArrays<true, false>();
-	} else {
-		if (shaderJITEnabled)
-			drawArrays<false, true>();
-		else
-			drawArrays<false, false>();
-	}
-}
-
 static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;

-template <bool indexed, bool useShaderJIT>
-void GPU::drawArrays() {
-	if constexpr (useShaderJIT) {
-		shaderJIT.prepare(shaderUnit.vs);
+// Entry point for draws: asks the renderer whether this draw will use hardware shaders, and if so hands the draw straight to the renderer
+// Otherwise calls the correct drawArrays template based on whether this is an indexed draw (first template parameter) and whether the shader JIT or the interpreter runs the vertex shader (second template parameter)
+void GPU::drawArrays(bool indexed) {
+	PICA::DrawAcceleration accel;
+
+	if (config.accelerateShaders) {
+		// If we are potentially going to use hw shaders, gather the info necessary to do vertex fetch, index buffering, etc on the GPU
+		// This includes parsing which vertices to upload, getting pointers to the index buffer data & vertex data, and so on
+		getAcceleratedDrawInfo(accel, indexed);
 	}

-	setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]);
+	const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel);  // NOTE(review): accel is passed even when accelerateShaders is off and it was never filled in — confirm the renderer does not read it in that case
+
+	if (hwShaders) {
+		// Hardware shaders have their own accelerated code path for draws, so they skip everything here
+		const PICA::PrimType primType = static_cast<PICA::PrimType>(Helpers::getBits<8, 2>(regs[PICA::InternalRegs::PrimitiveConfig]));
+		// Total # of vertices to render
+		const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];
+
+		// Note: In the hardware shader path the vertices span shouldn't actually be used as the renderer will perform its own attribute fetching
+		renderer->drawVertices(primType, std::span(vertices).first(vertexCount));  // NOTE(review): first() requires vertexCount <= Renderer::vertexBufferSize — confirm this is guaranteed
+	} else {
+		const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;  // JIT is used only if it is both available and enabled in the config
+
+		if (indexed) {
+			if (shaderJITEnabled) {
+				drawArrays<true, ShaderExecMode::JIT>();
+			} else {
+				drawArrays<true, ShaderExecMode::Interpreter>();
+			}
+		} else {
+			if (shaderJITEnabled) {
+				drawArrays<false, ShaderExecMode::JIT>();
+			} else {
+				drawArrays<false, ShaderExecMode::Interpreter>();
+			}
+		}
+	}
+}
+
+template <bool indexed, ShaderExecMode mode>
+void GPU::drawArrays() {
+	if constexpr (mode == ShaderExecMode::JIT) {
+		shaderJIT.prepare(shaderUnit.vs);
+	} else if constexpr (mode == ShaderExecMode::Hardware) {
+		// Hardware shaders have their own accelerated code path for draws, so they're not meant to take this path
+		Helpers::panic("GPU::DrawArrays: Hardware shaders shouldn't take this path!");
+	}
+
+	// We can have up to 16 attributes, each one consisting of 4 floats
+	constexpr u32 maxAttrSizeInFloats = 16 * 4;

 	// Base address for vertex attributes
 	// The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible
@ -312,8 +337,6 @@ void GPU::drawArrays() {
 					}

 					// Fill the remaining attribute lanes with default parameters (1.0 for alpha/w, 0.0) for everything else
-					// Corgi does this although I'm not sure if it's actually needed for anything.
-					// TODO: Find out
 					while (component < 4) {
 						attribute[component] = (component == 3) ? f24::fromFloat32(1.0) : f24::fromFloat32(0.0);
 						component++;
@ -327,13 +350,13 @@ void GPU::drawArrays() {

 		// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
 		// Based on the SH_ATTRIBUTES_PERMUTATION registers.
-		// Ie it might attribute #0 to v2, #1 to v7, etc
+		// Ie it might map attribute #0 to v2, #1 to v7, etc
 		for (int j = 0; j < totalAttribCount; j++) {
 			const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
 			std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
 		}

-		if constexpr (useShaderJIT) {
+		if constexpr (mode == ShaderExecMode::JIT) {
 			shaderJIT.run(shaderUnit.vs);
 		} else {
 			shaderUnit.vs.run();
--- a/src/core/PICA/regs.cpp
+++ b/src/core/PICA/regs.cpp
@ -249,6 +249,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 						// If we've reached 3 verts, issue a draw call
 						// Handle rendering depending on the primitive type
 						if (immediateModeVertIndex == 3) {
+							renderer->prepareForDraw(shaderUnit, nullptr);
 							renderer->drawVertices(PICA::PrimType::TriangleList, immediateModeVertices);

 							switch (primType) {
@ -300,7 +301,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 		}

 		case VertexBoolUniform: {
-			shaderUnit.vs.boolUniform = value & 0xffff;
+			shaderUnit.vs.uploadBoolUniform(value & 0xffff);
 			break;
 		}

--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@ -1,5 +1,10 @@
 #include "PICA/shader_decompiler.hpp"

+#include <fmt/format.h>
+
+#include <array>
+#include <cassert>
+
 #include "config.hpp"

 using namespace PICA;
@ -13,11 +18,45 @@ void ControlFlow::analyze(const PICAShader& shader, u32 entrypoint) {
 	analysisFailed = false;

 	const Function* function = addFunction(shader, entrypoint, PICAShader::maxInstructionCount);
-	if (function == nullptr) {
+	if (function == nullptr || function->exitMode != ExitMode::AlwaysEnd) {
 		analysisFailed = true;
 	}
 }

+// Helpers for merging parallel/series exit methods from Citra
+// Merges exit method of two parallel branches: an Unknown branch defers to the other one, identical modes are kept, and any other combination is Conditional
+static ExitMode exitParallel(ExitMode a, ExitMode b) {
+	if (a == ExitMode::Unknown) {
+		return b;  // Nothing is known about a, so the result is whatever b does
+	}
+	else if (b == ExitMode::Unknown) {
+		return a;  // Same as above with the roles swapped
+	}
+	else if (a == b) {
+		return a;  // Both branches exit the same way
+	}
+	return ExitMode::Conditional;  // The branches disagree, so how we exit depends on which one is taken
+}
+
+// Cascades exit method of two blocks of code, where block a runs first and block b runs after it. a must not be AlwaysEnd, since nothing could run after it
+static ExitMode exitSeries(ExitMode a, ExitMode b) {
+	assert(a != ExitMode::AlwaysEnd);  // Callers in this file return early on AlwaysEnd before calling this
+
+	if (a == ExitMode::Unknown) {
+		return ExitMode::Unknown;  // An unknown first block makes the whole sequence unknown
+	}
+
+	if (a == ExitMode::AlwaysReturn) {
+		return b;  // a always falls through into b, so b alone decides the exit mode
+	}
+
+	if (b == ExitMode::Unknown || b == ExitMode::AlwaysEnd) {  // Reached only when a is none of Unknown/AlwaysReturn/AlwaysEnd (e.g. Conditional)
+		return ExitMode::AlwaysEnd;
+	}
+
+	return ExitMode::Conditional;
+}
+
 ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 end, Function::Labels& labels) {
 	// Initialize exit mode to unknown by default, in order to detect things like unending loops
 	auto [it, inserted] = exitMap.emplace(AddressRange(start, end), ExitMode::Unknown);
@ -32,25 +71,132 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 		const u32 opcode = instruction >> 26;

 		switch (opcode) {
-			case ShaderOpcodes::JMPC: Helpers::panic("Unimplemented control flow operation (JMPC)");
-			case ShaderOpcodes::JMPU: Helpers::panic("Unimplemented control flow operation (JMPU)");
-			case ShaderOpcodes::IFU: Helpers::panic("Unimplemented control flow operation (IFU)");
-			case ShaderOpcodes::IFC: Helpers::panic("Unimplemented control flow operation (IFC)");
-			case ShaderOpcodes::CALL: Helpers::panic("Unimplemented control flow operation (CALL)");
-			case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)");
-			case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)");
-			case ShaderOpcodes::LOOP: Helpers::panic("Unimplemented control flow operation (LOOP)");
-			case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second;
+			case ShaderOpcodes::JMPC:
+			case ShaderOpcodes::JMPU: {
+				const u32 dest = getBits<10, 12>(instruction);
+				// Register this jump address to our outLabels set
+				labels.insert(dest);

+				// This opens up 2 parallel paths of execution
+				auto branchTakenExit = analyzeFunction(shader, dest, end, labels);
+				auto branchNotTakenExit = analyzeFunction(shader, pc + 1, end, labels);
+				it->second = exitParallel(branchTakenExit, branchNotTakenExit);
+				return it->second;
+			}
+
+			case ShaderOpcodes::IFU:
+			case ShaderOpcodes::IFC: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+
+				const Function* branchTakenFunc = addFunction(shader, pc + 1, dest);
+				// Check if analysis of the branch taken func failed and return unknown if it did
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				// Next analyze the not taken func
+				ExitMode branchNotTakenExitMode = ExitMode::AlwaysReturn;
+				if (num != 0) {
+					const Function* branchNotTakenFunc = addFunction(shader, dest, dest + num);
+					// Check if analysis failed and return unknown if it did
+					if (analysisFailed) {
+						it->second = ExitMode::Unknown;
+						return it->second;
+					}
+
+					branchNotTakenExitMode = branchNotTakenFunc->exitMode;
+				}
+
+				auto parallel = exitParallel(branchTakenFunc->exitMode, branchNotTakenExitMode);
+				// Both branches of the if/else end, so there's nothing left to analyze after the conditional
+				if (parallel == ExitMode::AlwaysEnd) {
+					it->second = parallel;
+					return it->second;
+				} else {
+					ExitMode afterConditional = analyzeFunction(shader, dest + num, end, labels);
+					ExitMode conditionalExitMode = exitSeries(parallel, afterConditional);
+					it->second = conditionalExitMode;
+					return it->second;
+				}
+				break;
+			}
+
+			case ShaderOpcodes::CALL: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* calledFunction = addFunction(shader, dest, dest + num);
+
+				// Check if analysis of the called function failed and return unknown if it did
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				if (calledFunction->exitMode == ExitMode::AlwaysEnd) {
+					it->second = ExitMode::AlwaysEnd;
+					return it->second;
+				}
+
+				// Exit mode of the remainder of this function, after we return from the callee
+				const ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels);
+				const ExitMode exitMode = exitSeries(calledFunction->exitMode, postCallExitMode);
+
+				it->second = exitMode;
+				return exitMode;
+			}
+
+			case ShaderOpcodes::CALLC:
+			case ShaderOpcodes::CALLU: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* calledFunction = addFunction(shader, dest, dest + num);
+
+				// Check if analysis of the called function failed and return unknown if it did
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				// Exit mode of the remainder of this function, after we return from the callee
+				const ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels);
+				const ExitMode exitMode = exitSeries(exitParallel(calledFunction->exitMode, ExitMode::AlwaysReturn), postCallExitMode);
+
+				it->second = exitMode;
+				return exitMode;
+			}
+
+			case ShaderOpcodes::LOOP: {
+				u32 dest = getBits<10, 12>(instruction);
+				const Function* loopFunction = addFunction(shader, pc + 1, dest + 1);
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				if (loopFunction->exitMode == ExitMode::AlwaysEnd) {
+					it->second = ExitMode::AlwaysEnd;
+					return it->second;
+				}
+
+				const ExitMode afterLoop = analyzeFunction(shader, dest + 1, end, labels);
+				const ExitMode exitMode = exitSeries(loopFunction->exitMode, afterLoop);
+				it->second = exitMode;
+				return it->second;
+			}
+
+			case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second;
 			default: break;
 		}
 	}

 	// A function without control flow instructions will always reach its "return point" and return
-	return ExitMode::AlwaysReturn;
+	it->second = ExitMode::AlwaysReturn;
+	return it->second;
 }

-void ShaderDecompiler::compileRange(const AddressRange& range) {
+std::pair<u32, bool> ShaderDecompiler::compileRange(const AddressRange& range) {
 	u32 pc = range.start;
 	const u32 end = range.end >= range.start ? range.end : PICAShader::maxInstructionCount;
 	bool finished = false;
@ -58,6 +204,8 @@ void ShaderDecompiler::compileRange(const AddressRange& range) {
 	while (pc < end && !finished) {
 		compileInstruction(pc, finished);
 	}
+
+	return std::make_pair(pc, finished);
 }

 const Function* ShaderDecompiler::findFunction(const AddressRange& range) {
@ -71,20 +219,43 @@ const Function* ShaderDecompiler::findFunction(const AddressRange& range) {
 }

 void ShaderDecompiler::writeAttributes() {
+	// Annoyingly, GLES does not support having an array as an input attribute, so declare each attribute separately for now
 	decompiledShader += R"(
-		layout(location = 0) in vec4 inputs[8];
+	layout(location = 0) in vec4 attr0;
+	layout(location = 1) in vec4 attr1;
+	layout(location = 2) in vec4 attr2;
+	layout(location = 3) in vec4 attr3;
+	layout(location = 4) in vec4 attr4;
+	layout(location = 5) in vec4 attr5;
+	layout(location = 6) in vec4 attr6;
+	layout(location = 7) in vec4 attr7;
+	layout(location = 8) in vec4 attr8;
+	layout(location = 9) in vec4 attr9;
+	layout(location = 10) in vec4 attr10;
+	layout(location = 11) in vec4 attr11;
+	layout(location = 12) in vec4 attr12;
+	layout(location = 13) in vec4 attr13;
+	layout(location = 14) in vec4 attr14;
+	layout(location = 15) in vec4 attr15;

-		layout(std140) uniform PICAShaderUniforms {
-			vec4 uniform_float[96];
-			uvec4 uniform_int;
-			uint uniform_bool;
-		};
-	
-		vec4 temp_registers[16];
-		vec4 dummy_vec = vec4(0.0);
+	layout(std140) uniform PICAShaderUniforms {
+		vec4 uniform_f[96];
+		uvec4 uniform_i;
+		uint uniform_bool;
+	};
+
+	vec4 temp[16];
+	vec4 out_regs[16];
+	vec4 dummy_vec = vec4(0.0);
+	ivec3 addr_reg = ivec3(0);
+	bvec2 cmp_reg = bvec2(false);
+
+	vec4 uniform_indexed(int source, int offset) {
+		int clipped_offs = (offset >= -128 && offset <= 127) ? offset : 0;
+		uint index = uint(clipped_offs + source) & 127u;
+		return (index < 96u) ? uniform_f[index] : vec4(1.0);
+	}
 )";
-
-	decompiledShader += "\n";
 }

 std::string ShaderDecompiler::decompile() {
@ -94,11 +265,14 @@ std::string ShaderDecompiler::decompile() {
 		return "";
 	}

-	decompiledShader = "";
+	compilationError = false;
+	decompiledShader.clear();
+	// Reserve some memory for the shader string to avoid memory allocations
+	decompiledShader.reserve(256 * 1024);

 	switch (api) {
 		case API::GL: decompiledShader += "#version 410 core\n"; break;
-		case API::GLES: decompiledShader += "#version 300 es\n"; break;
+		case API::GLES: decompiledShader += "#version 300 es\nprecision mediump float;\nprecision mediump int;\n"; break;
 		default: break;
 	}

@ -109,7 +283,7 @@ std::string ShaderDecompiler::decompile() {
 		decompiledShader += R"(
 			vec4 safe_mul(vec4 a, vec4 b) {
 				vec4 res = a * b;
-				return mix(res, mix(mix(vec4(0.0), res, isnan(rhs)), product, isnan(lhs)), isnan(res));
+				return mix(res, mix(mix(vec4(0.0), res, isnan(b)), res, isnan(a)), isnan(res));
 			}
 		)";
 	}
@ -121,17 +295,61 @@ std::string ShaderDecompiler::decompile() {

 	decompiledShader += "void pica_shader_main() {\n";
 	AddressRange mainFunctionRange(entrypoint, PICAShader::maxInstructionCount);
-	callFunction(*findFunction(mainFunctionRange));
-	decompiledShader += "}\n";
+	auto mainFunc = findFunction(mainFunctionRange);

-	for (auto& func : controlFlow.functions) {
-		if (func.outLabels.size() > 0) {
-			Helpers::panic("Function with out labels");
+	decompiledShader += mainFunc->getCallStatement() + ";\n}\n";
+
+	for (const Function& func : controlFlow.functions) {
+		if (func.outLabels.empty()) {
+			decompiledShader += fmt::format("bool {}() {{\n", func.getIdentifier());
+
+			auto [pc, finished] = compileRange(AddressRange(func.start, func.end));
+			if (!finished) {
+				decompiledShader += "return false;";
+			}
+
+			decompiledShader += "}\n";
+		} else {
+			auto labels = func.outLabels;
+			labels.insert(func.start);
+
+			// If a function has jumps and "labels", this needs to be emulated using a switch-case, with the variable being switched on being the
+			// current PC
+			decompiledShader += fmt::format("bool {}() {{\n", func.getIdentifier());
+			decompiledShader += fmt::format("uint pc = {}u;\n", func.start);
+			decompiledShader += "while(true){\nswitch(pc){\n";
+
+			for (u32 label : labels) {
+				decompiledShader += fmt::format("case {}u: {{", label);
+				// Fetch the next label whose address > label
+				auto it = labels.lower_bound(label + 1);
+				u32 next = (it == labels.end()) ? func.end : *it;
+
+				auto [endPC, finished] = compileRange(AddressRange(label, next));
+				if (endPC > next && !finished) {
+					labels.insert(endPC);
+					decompiledShader += fmt::format("pc = {}u; break;", endPC);
+				}
+
+				// Fallthrough to next label
+				decompiledShader += "}\n";
+			}
+
+			decompiledShader += "default: return false;\n";
+			// Exit the switch and loop
+			decompiledShader += "} }\n";
+
+			// Exit the function
+			decompiledShader += "return false;\n";
+			decompiledShader += "}\n";
 		}
+	}

-		decompiledShader += "void " + func.getIdentifier() + "() {\n";
-		compileRange(AddressRange(func.start, func.end));
-		decompiledShader += "}\n";
+	// We allow some leeway for "compilation errors" in addition to control flow errors, in cases where eg an unimplemented instruction
+	// or an instruction that we can't emulate in GLSL is found in the instruction stream. Just like control flow errors, these return an empty string
+	// and the renderer core will decide to use CPU shaders instead
+	if (compilationError) [[unlikely]] {
+		return "";
 	}

 	return decompiledShader;
@ -139,30 +357,41 @@ std::string ShaderDecompiler::decompile() {

 std::string ShaderDecompiler::getSource(u32 source, [[maybe_unused]] u32 index) const {
 	if (source < 0x10) {
-		return "inputs[" + std::to_string(source) + "]";
+		return "attr" + std::to_string(source);
 	} else if (source < 0x20) {
-		return "temp_registers[" + std::to_string(source - 0x10) + "]";
+		return "temp[" + std::to_string(source - 0x10) + "]";
 	} else {
 		const usize floatIndex = (source - 0x20) & 0x7f;

-		if (floatIndex >= 96) [[unlikely]] {
-			return "dummy_vec";
+		if (index == 0) {
+			if (floatIndex >= 96) [[unlikely]] {
+				return "dummy_vec";
+			}
+			return "uniform_f[" + std::to_string(floatIndex) + "]";
+		} else {
+			static constexpr std::array<const char*, 4> offsets = {"0", "addr_reg.x", "addr_reg.y", "addr_reg.z"};
+			return fmt::format("uniform_indexed({}, {})", floatIndex, offsets[index]);
 		}
-		return "uniform_float[" + std::to_string(floatIndex) + "]";
 	}
 }

 std::string ShaderDecompiler::getDest(u32 dest) const {
 	if (dest < 0x10) {
-		return "output_registers[" + std::to_string(dest) + "]";
+		return "out_regs[" + std::to_string(dest) + "]";
 	} else if (dest < 0x20) {
-		return "temp_registers[" + std::to_string(dest - 0x10) + "]";
+		return "temp[" + std::to_string(dest - 0x10) + "]";
 	} else {
 		return "dummy_vec";
 	}
 }

 std::string ShaderDecompiler::getSwizzlePattern(u32 swizzle) const {
+	// If the swizzle field is this value then the swizzle pattern is .xyzw so we don't need a shuffle
+	static constexpr uint noSwizzle = 0x1B;
+	if (swizzle == noSwizzle) {
+		return "";
+	}
+
 	static constexpr std::array<char, 4> names = {'x', 'y', 'z', 'w'};
 	std::string ret(".    ");
 	
@ -176,7 +405,6 @@ std::string ShaderDecompiler::getSwizzlePattern(u32 swizzle) const {

 std::string ShaderDecompiler::getDestSwizzle(u32 destinationMask) const {
 	std::string ret = ".";
-	
 	if (destinationMask & 0b1000) {
 		ret += "x";
 	}
@ -208,11 +436,12 @@ void ShaderDecompiler::setDest(u32 operandDescriptor, const std::string& dest, c
 		return;
 	}

-	decompiledShader += dest + destSwizzle + " = ";
-	if (writtenLaneCount == 1) {
-		decompiledShader += "float(" + value + ");\n";
-	} else {
-		decompiledShader += "vec" + std::to_string(writtenLaneCount) + "(" + value + ");\n";
+	// Don't write destination swizzle if all lanes are getting written to
+	decompiledShader += fmt::format("{}{} = ", dest, writtenLaneCount == 4 ? "" : destSwizzle);
+	if (writtenLaneCount <= 3) {
+		decompiledShader += fmt::format("({}){};\n", value, destSwizzle);
+	} else if (writtenLaneCount == 4) {
+		decompiledShader += fmt::format("{};\n", value);
 	}
 }

@ -246,26 +475,101 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {

 		std::string dest = getDest(destIndex);

-		if (idx != 0) {
-			Helpers::panic("GLSL recompiler: Indexed instruction");
-		}
-
-		if (invertSources) {
-			Helpers::panic("GLSL recompiler: Inverted instruction");
-		}
-
 		switch (opcode) {
 			case ShaderOpcodes::MOV: setDest(operandDescriptor, dest, src1); break;
-			case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, src1 + " + " + src2); break;
-			case ShaderOpcodes::MUL: setDest(operandDescriptor, dest, src1 + " * " + src2); break;
-			case ShaderOpcodes::MAX: setDest(operandDescriptor, dest, "max(" + src1 + ", " + src2 + ")"); break;
-			case ShaderOpcodes::MIN: setDest(operandDescriptor, dest, "min(" + src1 + ", " + src2 + ")"); break;
+			case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, fmt::format("{} + {}", src1, src2)); break;
+			case ShaderOpcodes::MUL:
+				if (!config.accurateShaderMul) {
+					setDest(operandDescriptor, dest, fmt::format("{} * {}", src1, src2));
+				} else {
+					setDest(operandDescriptor, dest, fmt::format("safe_mul({}, {})", src1, src2));
+				}
+				break;
+			case ShaderOpcodes::MAX: setDest(operandDescriptor, dest, fmt::format("max({}, {})", src1, src2)); break;
+			case ShaderOpcodes::MIN: setDest(operandDescriptor, dest, fmt::format("min({}, {})", src1, src2)); break;

-			case ShaderOpcodes::DP3: setDest(operandDescriptor, dest, "vec4(dot(" + src1 + ".xyz, " + src2 + ".xyz))"); break;
-			case ShaderOpcodes::DP4: setDest(operandDescriptor, dest, "vec4(dot(" + src1 + ", " + src2 + "))"); break;
-			case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, "vec4(inversesqrt(" + src1 + ".x))"); break;
+			case ShaderOpcodes::DP3:
+				if (!config.accurateShaderMul) {
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot({}.xyz, {}.xyz))", src1, src2));
+				} else {
+					// A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec3(1.0)
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul({}, {}).xyz, vec3(1.0)))", src1, src2));
+				}
+				break;
+			case ShaderOpcodes::DP4:
+				if (!config.accurateShaderMul) {
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot({}, {}))", src1, src2));
+				} else {
+					// A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec4(1.0)
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul({}, {}), vec4(1.0)))", src1, src2));
+				}
+				break;
+			case ShaderOpcodes::FLR: setDest(operandDescriptor, dest, fmt::format("floor({})", src1)); break;
+			case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, fmt::format("vec4(inversesqrt({}.x))", src1)); break;
+			case ShaderOpcodes::RCP: setDest(operandDescriptor, dest, fmt::format("vec4(1.0 / {}.x)", src1)); break;
+			case ShaderOpcodes::LG2: setDest(operandDescriptor, dest, fmt::format("vec4(log2({}.x))", src1)); break;
+			case ShaderOpcodes::EX2: setDest(operandDescriptor, dest, fmt::format("vec4(exp2({}.x))", src1)); break;

-			default: Helpers::panic("GLSL recompiler: Unknown common opcode: %X", opcode); break;
+			case ShaderOpcodes::SLT:
+			case ShaderOpcodes::SLTI: setDest(operandDescriptor, dest, fmt::format("vec4(lessThan({}, {}))", src1, src2)); break;
+
+			case ShaderOpcodes::SGE:
+			case ShaderOpcodes::SGEI: setDest(operandDescriptor, dest, fmt::format("vec4(greaterThanEqual({}, {}))", src1, src2)); break;
+
+			case ShaderOpcodes::DPH:
+			case ShaderOpcodes::DPHI:
+				if (!config.accurateShaderMul) {
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot(vec4({}.xyz, 1.0), {}))", src1, src2));
+				} else {
+					// A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec4(1.0)
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul(vec4({}.xyz, 1.0), {}), vec4(1.0)))", src1, src2));
+				}
+				break;
+
+			case ShaderOpcodes::CMP1:
+			case ShaderOpcodes::CMP2: {
+				static constexpr std::array<const char*, 8> operators = {
+					// The last 2 operators always return true and are handled specially
+					"==", "!=", "<", "<=", ">", ">=", "", "",
+				};
+
+				const u32 cmpY = getBits<21, 3>(instruction);
+				const u32 cmpX = getBits<24, 3>(instruction);
+
+				// Compare x first
+				if (cmpX >= 6) {
+					decompiledShader += "cmp_reg.x = true;\n";
+				} else {
+					decompiledShader += fmt::format("cmp_reg.x = {}.x {} {}.x;\n", src1, operators[cmpX], src2);
+				}
+
+				// Then compare Y
+				if (cmpY >= 6) {
+					decompiledShader += "cmp_reg.y = true;\n";
+				} else {
+					decompiledShader += fmt::format("cmp_reg.y = {}.y {} {}.y;\n", src1, operators[cmpY], src2);
+				}
+				break;
+			}
+
+			case ShaderOpcodes::MOVA: {
+				const bool writeX = getBit<3>(operandDescriptor);  // Should we write the x component of the address register?
+				const bool writeY = getBit<2>(operandDescriptor);
+
+				if (writeX && writeY) {
+					decompiledShader += fmt::format("addr_reg.xy = ivec2({}.xy);\n", src1);
+				} else if (writeX) {
+					decompiledShader += fmt::format("addr_reg.x = int({}.x);\n", src1);
+				} else if (writeY) {
+					decompiledShader += fmt::format("addr_reg.y = int({}.y);\n", src1);
+				}
+				break;
+			}
+
+			default:
+				Helpers::warn("GLSL recompiler: Unknown common opcode: %02X. Falling back to CPU shaders", opcode);
+				compilationError = true;
+				break;
 		}
 	} else if (opcode >= 0x30 && opcode <= 0x3F) { // MAD and MADI
 		const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x1f];
@ -299,23 +603,156 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 		src3 += getSwizzlePattern(swizzle3);

 		std::string dest = getDest(destIndex);
-
-		if (idx != 0) {
-			Helpers::panic("GLSL recompiler: Indexed instruction");
+		if (!config.accurateShaderMul) {
+			setDest(operandDescriptor, dest, fmt::format("{} * {} + {}", src1, src2, src3));
+		} else {
+			setDest(operandDescriptor, dest, fmt::format("safe_mul({}, {}) + {}", src1, src2, src3));
 		}
-
-		setDest(operandDescriptor, dest, src1 + " * " + src2 + " + " + src3);
 	} else {
 		switch (opcode) {
-			case ShaderOpcodes::END: finished = true; return;
-			default: Helpers::panic("GLSL recompiler: Unknown opcode: %X", opcode); break;
+			case ShaderOpcodes::JMPC: {
+				const u32 dest = getBits<10, 12>(instruction);
+				const u32 condOp = getBits<22, 2>(instruction);
+				const uint refY = getBit<24>(instruction);
+				const uint refX = getBit<25>(instruction);
+				const char* condition = getCondition(condOp, refX, refY);
+
+				decompiledShader += fmt::format("if ({}) {{ pc = {}u; break; }}\n", condition, dest);
+				break;
+			}
+
+			case ShaderOpcodes::JMPU: {
+				const u32 dest = getBits<10, 12>(instruction);
+				const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check
+				const u32 mask = 1u << bit;
+				const u32 test = (instruction & 1) ^ 1;  // If the LSB is 0 we jump if bit = 1, otherwise 0
+
+				decompiledShader += fmt::format("if ((uniform_bool & {}u) {} 0u) {{ pc = {}u; break; }}\n", mask, (test != 0) ? "!=" : "==", dest);
+				break;
+			}
+
+			case ShaderOpcodes::IFU:
+			case ShaderOpcodes::IFC: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* conditionalFunc = findFunction(AddressRange(pc + 1, dest));
+
+				if (opcode == ShaderOpcodes::IFC) {
+					const u32 condOp = getBits<22, 2>(instruction);
+					const uint refY = getBit<24>(instruction);
+					const uint refX = getBit<25>(instruction);
+					const char* condition = getCondition(condOp, refX, refY);
+
+					decompiledShader += fmt::format("if ({}) {{", condition);
+				} else {
+					const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check
+					const u32 mask = 1u << bit;
+
+					decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{", mask);
+				}
+
+				callFunction(*conditionalFunc);
+				decompiledShader += "}\n";
+
+				pc = dest;
+				if (num > 0) {
+					const Function* elseFunc = findFunction(AddressRange(dest, dest + num));
+					pc = dest + num;
+
+					decompiledShader += "else { ";
+					callFunction(*elseFunc);
+					decompiledShader += "}\n";
+
+					if (conditionalFunc->exitMode == ExitMode::AlwaysEnd && elseFunc->exitMode == ExitMode::AlwaysEnd) {
+						finished = true;
+						return;
+					}
+				}
+
+				return;
+			}
+
+			case ShaderOpcodes::CALL:
+			case ShaderOpcodes::CALLC:
+			case ShaderOpcodes::CALLU: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* calledFunc = findFunction(AddressRange(dest, dest + num));
+
+				// Handle conditions for CALLC/CALLU
+				if (opcode == ShaderOpcodes::CALLC) {
+					const u32 condOp = getBits<22, 2>(instruction);
+					const uint refY = getBit<24>(instruction);
+					const uint refX = getBit<25>(instruction);
+					const char* condition = getCondition(condOp, refX, refY);
+
+					decompiledShader += fmt::format("if ({}) {{", condition);
+				} else if (opcode == ShaderOpcodes::CALLU) {
+					const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check
+					const u32 mask = 1u << bit;
+
+					decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{", mask);
+				}
+
+				callFunction(*calledFunc);
+
+				// Close brackets for CALLC/CALLU
+				if (opcode != ShaderOpcodes::CALL) {
+					decompiledShader += "}";
+				}
+
+				if (opcode == ShaderOpcodes::CALL && calledFunc->exitMode == ExitMode::AlwaysEnd) {
+					finished = true;
+					return;
+				}
+				break;
+			}
+
+			case ShaderOpcodes::LOOP: {
+				const u32 dest = getBits<10, 12>(instruction);
+				const u32 uniformIndex = getBits<22, 2>(instruction);
+
+				// loop counter = uniform.y
+				decompiledShader += fmt::format("addr_reg.z = int((uniform_i[{}] >> 8u) & 0xFFu);\n", uniformIndex);
+				decompiledShader += fmt::format(
+					"for (uint loopCtr{} = 0u; loopCtr{} <= (uniform_i[{}] & 0xFFu); loopCtr{}++, addr_reg.z += int((uniform_i[{}] >> "
+					"16u) & 0xFFu)) {{\n",
+					pc, pc, uniformIndex, pc, uniformIndex
+				);
+
+				AddressRange range(pc + 1, dest + 1);
+				const Function* func = findFunction(range);
+				callFunction(*func);
+				decompiledShader += "}\n";
+
+				// Jump to the end of the loop. We don't want to compile the code inside the loop again.
+				// This will be incremented by 1 due to the pc++ at the end of compileInstruction.
+				pc = dest;
+
+				if (func->exitMode == ExitMode::AlwaysEnd) {
+					finished = true;
+					return;
+				}
+				break;
+			}
+
+			case ShaderOpcodes::END:
+				decompiledShader += "return true;\n";
+				finished = true;
+				return;
+
+			case ShaderOpcodes::NOP: break;
+
+			default:
+				Helpers::warn("GLSL recompiler: Unknown opcode: %02X. Falling back to CPU shaders", opcode);
+				compilationError = true;
+				break;
 		}
 	}

 	pc++;
 }

-
 bool ShaderDecompiler::usesCommonEncoding(u32 instruction) const {
 	const u32 opcode = instruction >> 26;
 	switch (opcode) {
@ -339,16 +776,57 @@ bool ShaderDecompiler::usesCommonEncoding(u32 instruction) const {
 		case ShaderOpcodes::SLT:
 		case ShaderOpcodes::SLTI:
 		case ShaderOpcodes::SGE:
-		case ShaderOpcodes::SGEI: return true;
+		case ShaderOpcodes::SGEI:
+		case ShaderOpcodes::LITP: return true;

 		default: return false;
 	}
 }

-void ShaderDecompiler::callFunction(const Function& function) { decompiledShader += function.getCallStatement() + ";\n"; }
+void ShaderDecompiler::callFunction(const Function& function) {
+	switch (function.exitMode) {
+		// This function always ends, so call it and return true to signal that we're gonna be ending the shader
+		case ExitMode::AlwaysEnd: decompiledShader += function.getCallStatement() + ";\nreturn true;\n"; break;
+		// This function will potentially end. Call it, see if it returns that it ended, and return that we're ending if it did
+		case ExitMode::Conditional: decompiledShader += fmt::format("if ({}) {{ return true; }}\n", function.getCallStatement()); break;
+		// This function will not end. Just call it like a normal function.
+		default: decompiledShader += function.getCallStatement() + ";\n"; break;
+	}
+}

 std::string ShaderGen::decompileShader(PICAShader& shader, EmulatorConfig& config, u32 entrypoint, API api, Language language) {
 	ShaderDecompiler decompiler(shader, config, entrypoint, api, language);

 	return decompiler.decompile();
 }
+
+const char* ShaderDecompiler::getCondition(u32 cond, u32 refX, u32 refY) {
+	static constexpr std::array<const char*, 16> conditions = {
+		// ref(Y, X) = (0, 0)
+		"!all(cmp_reg)",
+		"all(not(cmp_reg))",
+		"!cmp_reg.x",
+		"!cmp_reg.y",
+
+		// ref(Y, X) = (0, 1)
+		"cmp_reg.x || !cmp_reg.y",
+		"cmp_reg.x && !cmp_reg.y",
+		"cmp_reg.x",
+		"!cmp_reg.y",
+
+		// ref(Y, X) = (1, 0)
+		"!cmp_reg.x || cmp_reg.y",
+		"!cmp_reg.x && cmp_reg.y",
+		"!cmp_reg.x",
+		"cmp_reg.y",
+
+		// ref(Y, X) = (1, 1)
+		"any(cmp_reg)",
+		"all(cmp_reg)",
+		"cmp_reg.x",
+		"cmp_reg.y",
+	};
+	const u32 key = (cond & 0b11) | (refX << 2) | (refY << 3);
+
+	return conditions[key];
+}
--- a/src/core/PICA/shader_gen_glsl.cpp
+++ b/src/core/PICA/shader_gen_glsl.cpp
@ -1,3 +1,7 @@
+#include <fmt/format.h>
+
+#include <utility>
+
 #include "PICA/pica_frag_config.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen.hpp"
@ -702,6 +706,113 @@ void FragmentGenerator::compileFog(std::string& shader, const PICA::FragmentConf
 	shader += "combinerOutput.rgb = mix(fog_color, combinerOutput.rgb, fog_factor);";
 }

+std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader) {
+	// First, calculate output register -> Fixed function fragment semantics based on the VAO config
+	// This array contains the mappings for the 32 fixed function semantics (8 variables, with 4 lanes each).
+	// Each entry is a pair, containing the output reg to use for this semantic (first) and which lane of that register (second)
+	std::array<std::pair<int, int>, 32> outputMappings{};
+	// Output registers adjusted according to VS_OUTPUT_MASK, which handles enabling and disabling output attributes
+	std::array<u8, 16> vsOutputRegisters;
+
+	{
+		uint count = 0;
+		u16 outputMask = vertConfig.outputMask;
+
+		// See which registers are actually enabled and ignore the disabled ones
+		for (int i = 0; i < 16; i++) {
+			if (outputMask & 1) {
+				vsOutputRegisters[count++] = i;
+			}
+
+			outputMask >>= 1;
+		}
+
+		// For the others, map the index to a vs output directly (TODO: What does hw actually do?)
+		for (; count < 16; count++) {
+			vsOutputRegisters[count] = count;
+		}
+
+		for (int i = 0; i < vertConfig.outputCount; i++) {
+			const u32 config = vertConfig.outmaps[i];
+			for (int j = 0; j < 4; j++) {
+				const u32 mapping = (config >> (j * 8)) & 0x1F;
+				outputMappings[mapping] = std::make_pair(vsOutputRegisters[i], j);
+			}
+		}
+	}
+
+	auto getSemanticName = [&](u32 semanticIndex) {
+		auto [reg, lane] = outputMappings[semanticIndex];
+		return fmt::format("out_regs[{}][{}]", reg, lane);
+	};
+
+	std::string semantics = fmt::format(
+		R"(
+	vec4 a_coords = vec4({}, {}, {}, {});
+	vec4 a_quaternion = vec4({}, {}, {}, {});
+	vec4 a_vertexColour = vec4({}, {}, {}, {});
+	vec2 a_texcoord0 = vec2({}, {});
+	float a_texcoord0_w = {};
+	vec2 a_texcoord1 = vec2({}, {});
+	vec2 a_texcoord2 = vec2({}, {});
+	vec3 a_view = vec3({}, {}, {});
+)",
+		getSemanticName(0), getSemanticName(1), getSemanticName(2), getSemanticName(3), getSemanticName(4), getSemanticName(5), getSemanticName(6),
+		getSemanticName(7), getSemanticName(8), getSemanticName(9), getSemanticName(10), getSemanticName(11), getSemanticName(12),
+		getSemanticName(13), getSemanticName(16), getSemanticName(14), getSemanticName(15), getSemanticName(22), getSemanticName(23),
+		getSemanticName(18), getSemanticName(19), getSemanticName(20)
+	);
+
+	if (usingUbershader) {
+		Helpers::panic("Unimplemented: GetVertexShaderAccelerated for ubershader");
+		return picaSource;
+	} else {
+		// TODO: Uniforms and don't hardcode fixed-function semantic indices...
+		std::string ret = picaSource;
+		if (api == API::GLES) {
+			ret += "\n#define USING_GLES\n";
+		}
+
+		ret += uniformDefinition;
+
+		ret += R"(
+out vec4 v_quaternion;
+out vec4 v_colour;
+out vec3 v_texcoord0;
+out vec2 v_texcoord1;
+out vec3 v_view;
+out vec2 v_texcoord2;
+
+#ifndef USING_GLES
+	out float gl_ClipDistance[2];
+#endif
+
+void main() {
+	pica_shader_main();
+)";
+	// Transfer fixed function fragment registers from vertex shader output to the fragment shader
+	ret += semantics;
+	
+	ret += R"(
+	gl_Position = a_coords;
+	vec4 colourAbs = abs(a_vertexColour);
+	v_colour = min(colourAbs, vec4(1.f));
+
+	v_texcoord0 = vec3(a_texcoord0.x, 1.0 - a_texcoord0.y, a_texcoord0_w);
+	v_texcoord1 = vec2(a_texcoord1.x, 1.0 - a_texcoord1.y);
+	v_texcoord2 = vec2(a_texcoord2.x, 1.0 - a_texcoord2.y);
+	v_view = a_view;
+	v_quaternion = a_quaternion;
+
+#ifndef USING_GLES
+	gl_ClipDistance[0] = -a_coords.z;
+	gl_ClipDistance[1] = dot(clipCoords, a_coords);
+#endif
+})";
+		return ret;
+	}
+}
+
 void FragmentGenerator::compileLogicOps(std::string& shader, const PICA::FragmentConfig& config) {
 	if (api != API::GLES) [[unlikely]] {
 		Helpers::warn("Shadergen: Unsupported API for compileLogicOps");
--- a/src/core/PICA/shader_unit.cpp
+++ b/src/core/PICA/shader_unit.cpp
@ -34,4 +34,5 @@ void PICAShader::reset() {

 	codeHashDirty = true;
 	opdescHashDirty = true;
+	uniformsDirty = true;
 }
--- a/src/core/renderer_gl/gl_state.cpp
+++ b/src/core/renderer_gl/gl_state.cpp
@ -73,10 +73,7 @@ void GLStateManager::resetVAO() {
 }

 void GLStateManager::resetBuffers() {
-	boundVBO = 0;
 	boundUBO = 0;
-
-	glBindBuffer(GL_ARRAY_BUFFER, 0);
 	glBindBuffer(GL_UNIFORM_BUFFER, 0);
 }

--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@ -2,13 +2,15 @@

 #include <stb_image_write.h>

+#include <bit>
 #include <cmrc/cmrc.hpp>

-#include "config.hpp"
 #include "PICA/float_types.hpp"
-#include "PICA/pica_frag_uniforms.hpp"
 #include "PICA/gpu.hpp"
+#include "PICA/pica_frag_uniforms.hpp"
 #include "PICA/regs.hpp"
+#include "PICA/shader_decompiler.hpp"
+#include "config.hpp"
 #include "math_util.hpp"

 CMRC_DECLARE(RendererGL);
@ -24,7 +26,7 @@ void RendererGL::reset() {
 	colourBufferCache.reset();
 	textureCache.reset();

-	clearShaderCache();
+	shaderCache.clear();

 	// Init the colour/depth buffer settings to some random defaults on reset
 	colourBufferLoc = 0;
@ -77,40 +79,56 @@ void RendererGL::initGraphicsContextInternal() {
 	gl.useProgram(displayProgram);
 	glUniform1i(OpenGL::uniformLocation(displayProgram, "u_texture"), 0);  // Init sampler object

+	// Create stream buffers for vertex, index and uniform buffers
+	static constexpr usize hwIndexBufferSize = 2_MB;
+	static constexpr usize hwVertexBufferSize = 16_MB;
+
+	hwIndexBuffer = StreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, hwIndexBufferSize);
+	hwVertexBuffer = StreamBuffer::Create(GL_ARRAY_BUFFER, hwVertexBufferSize);
+
 	// Allocate memory for the shadergen fragment uniform UBO
 	glGenBuffers(1, &shadergenFragmentUBO);
 	gl.bindUBO(shadergenFragmentUBO);
 	glBufferData(GL_UNIFORM_BUFFER, sizeof(PICA::FragmentUniforms), nullptr, GL_DYNAMIC_DRAW);

-	vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW);
-	gl.bindVBO(vbo);
-	vao.create();
-	gl.bindVAO(vao);
+	// Allocate memory for the accelerated vertex shader uniform UBO
+	glGenBuffers(1, &hwShaderUniformUBO);
+	gl.bindUBO(hwShaderUniformUBO);
+	glBufferData(GL_UNIFORM_BUFFER, PICAShader::totalUniformSize(), nullptr, GL_DYNAMIC_DRAW);
+
+	vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize * 2, GL_STREAM_DRAW);
+	vbo.bind();
+	// Initialize the VAO used when not using hw shaders
+	defaultVAO.create();
+	gl.bindVAO(defaultVAO);

 	// Position (x, y, z, w) attributes
-	vao.setAttributeFloat<float>(0, 4, sizeof(Vertex), offsetof(Vertex, s.positions));
-	vao.enableAttribute(0);
+	defaultVAO.setAttributeFloat<float>(0, 4, sizeof(Vertex), offsetof(Vertex, s.positions));
+	defaultVAO.enableAttribute(0);
 	// Quaternion attribute
-	vao.setAttributeFloat<float>(1, 4, sizeof(Vertex), offsetof(Vertex, s.quaternion));
-	vao.enableAttribute(1);
+	defaultVAO.setAttributeFloat<float>(1, 4, sizeof(Vertex), offsetof(Vertex, s.quaternion));
+	defaultVAO.enableAttribute(1);
 	// Colour attribute
-	vao.setAttributeFloat<float>(2, 4, sizeof(Vertex), offsetof(Vertex, s.colour));
-	vao.enableAttribute(2);
+	defaultVAO.setAttributeFloat<float>(2, 4, sizeof(Vertex), offsetof(Vertex, s.colour));
+	defaultVAO.enableAttribute(2);
 	// UV 0 attribute
-	vao.setAttributeFloat<float>(3, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord0));
-	vao.enableAttribute(3);
+	defaultVAO.setAttributeFloat<float>(3, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord0));
+	defaultVAO.enableAttribute(3);
 	// UV 1 attribute
-	vao.setAttributeFloat<float>(4, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord1));
-	vao.enableAttribute(4);
+	defaultVAO.setAttributeFloat<float>(4, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord1));
+	defaultVAO.enableAttribute(4);
 	// UV 0 W-component attribute
-	vao.setAttributeFloat<float>(5, 1, sizeof(Vertex), offsetof(Vertex, s.texcoord0_w));
-	vao.enableAttribute(5);
+	defaultVAO.setAttributeFloat<float>(5, 1, sizeof(Vertex), offsetof(Vertex, s.texcoord0_w));
+	defaultVAO.enableAttribute(5);
 	// View
-	vao.setAttributeFloat<float>(6, 3, sizeof(Vertex), offsetof(Vertex, s.view));
-	vao.enableAttribute(6);
+	defaultVAO.setAttributeFloat<float>(6, 3, sizeof(Vertex), offsetof(Vertex, s.view));
+	defaultVAO.enableAttribute(6);
 	// UV 2 attribute
-	vao.setAttributeFloat<float>(7, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord2));
-	vao.enableAttribute(7);
+	defaultVAO.setAttributeFloat<float>(7, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord2));
+	defaultVAO.enableAttribute(7);
+
+	// Initialize the VAO used for hw shaders
+	hwShaderVAO.create();

 	dummyVBO.create();
 	dummyVAO.create();
@ -165,6 +183,12 @@ void RendererGL::initGraphicsContextInternal() {
 	OpenGL::clearColor();
 	OpenGL::setViewport(oldViewport[0], oldViewport[1], oldViewport[2], oldViewport[3]);

+	// Initialize fixed attributes
+	for (int i = 0; i < fixedAttrValues.size(); i++) {
+		fixedAttrValues[i] = {0.f, 0.f, 0.f, 0.f};
+		glVertexAttrib4f(i, 0.0, 0.0, 0.0, 0.0);
+	}
+
 	reset();

 	// Populate our driver info structure
@ -418,29 +442,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 		OpenGL::Triangle,
 	};

-	bool usingUbershader = enableUbershader;
-	if (usingUbershader) {
-		const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0;
-		const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1;
-
-		// Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using moret han N lights via shadergen
-		// This way we generate fewer shaders overall than with full shadergen, but don't tank performance 
-		if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) {
-			usingUbershader = false;
-		}
-	}
-		
-	if (usingUbershader) {
-		gl.useProgram(triangleProgram);
-	} else {
-		OpenGL::Program& program = getSpecializedShader();
-		gl.useProgram(program);
-	}
-
 	const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
 	gl.disableScissor();
-	gl.bindVBO(vbo);
-	gl.bindVAO(vao);
+
+	// If we're using accelerated shaders, the hw VAO, VBO and EBO objects will have already been bound in prepareForDraw
+	if (!usingAcceleratedShader) {
+		vbo.bind();
+		gl.bindVAO(defaultVAO);
+	}

 	gl.enableClipPlane(0);  // Clipping plane 0 is always enabled
 	if (regs[PICA::InternalRegs::ClipEnable] & 1) {
@ -458,38 +467,9 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 	const int depthFunc = getBits<4, 3>(depthControl);
 	const int colourMask = getBits<8, 4>(depthControl);
 	gl.setColourMask(colourMask & 1, colourMask & 2, colourMask & 4, colourMask & 8);
-
 	static constexpr std::array<GLenum, 8> depthModes = {GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL};

-	// Update ubershader uniforms
-	if (usingUbershader) {
-		const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
-		const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
-		const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1;
-
-		if (oldDepthScale != depthScale) {
-			oldDepthScale = depthScale;
-			glUniform1f(ubershaderData.depthScaleLoc, depthScale);
-		}
-
-		if (oldDepthOffset != depthOffset) {
-			oldDepthOffset = depthOffset;
-			glUniform1f(ubershaderData.depthOffsetLoc, depthOffset);
-		}
-
-		if (oldDepthmapEnable != depthMapEnable) {
-			oldDepthmapEnable = depthMapEnable;
-			glUniform1i(ubershaderData.depthmapEnableLoc, depthMapEnable);
-		}
-
-		// Upload PICA Registers as a single uniform. The shader needs access to the rasterizer registers (for depth, starting from index 0x48)
-		// The texturing and the fragment lighting registers. Therefore we upload them all in one go to avoid multiple slow uniform updates
-		glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]);
-		setupUbershaderTexEnv();
-	}
-
 	bindTexturesToSlots();
-
 	if (gpu.fogLUTDirty) {
 		updateFogLUT();
 	}
@ -532,8 +512,22 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v

 	setupStencilTest(stencilEnable);

-	vbo.bufferVertsSub(vertices);
-	OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
+	if (!usingAcceleratedShader) {
+		vbo.bufferVertsSub(vertices);
+		OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
+	} else {
+		if (performIndexedRender) {
+			// When doing indexed rendering, use glDrawRangeElementsBaseVertex to issue the indexed draw
+			hwIndexBuffer->Bind();
+			glDrawRangeElementsBaseVertex(
+				primitiveTopology, minimumIndex, maximumIndex, GLsizei(vertices.size()), usingShortIndices ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE,
+				hwIndexBufferOffset, -GLint(minimumIndex)
+			);
+		} else {
+			// When doing non-indexed rendering, just use glDrawArrays
+			OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
+		}
+	}
 }

 void RendererGL::display() {
@ -840,7 +834,8 @@ std::optional<ColourBuffer> RendererGL::getColourBuffer(u32 addr, PICA::ColorFmt
 }

 OpenGL::Program& RendererGL::getSpecializedShader() {
-	constexpr uint uboBlockBinding = 2;
+	constexpr uint vsUBOBlockBinding = 1;
+	constexpr uint fsUBOBlockBinding = 2;

 	PICA::FragmentConfig fsConfig(regs);
 	// If we're not on GLES, ignore the logic op configuration and don't generate redundant shaders for it, since we use hw logic ops
@ -848,30 +843,44 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
 	fsConfig.outConfig.logicOpMode = PICA::LogicOpMode(0);
 #endif

-	CachedProgram& programEntry = shaderCache[fsConfig];
+	OpenGL::Shader& fragShader = shaderCache.fragmentShaderCache[fsConfig];
+	if (!fragShader.exists()) {
+		std::string fs = fragShaderGen.generate(fsConfig);
+		fragShader.create({fs.c_str(), fs.size()}, OpenGL::Fragment);
+	}
+
+	// Get the handle of the current vertex shader
+	OpenGL::Shader& vertexShader = usingAcceleratedShader ? *generatedVertexShader : defaultShadergenVs;
+	// And form the key for looking up a shader program
+	const u64 programKey = (u64(vertexShader.handle()) << 32) | u64(fragShader.handle());
+
+	CachedProgram& programEntry = shaderCache.programCache[programKey];
 	OpenGL::Program& program = programEntry.program;

 	if (!program.exists()) {
-		std::string fs = fragShaderGen.generate(fsConfig, &driverInfo);
-
-		OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment);
-		program.create({defaultShadergenVs, fragShader});
+		program.create({vertexShader, fragShader});
 		gl.useProgram(program);

-		fragShader.free();
-
 		// Init sampler objects. Texture 0 goes in texture unit 0, texture 1 in TU 1, texture 2 in TU 2, and the light maps go in TU 3
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex0"), 0);
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1);
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2);
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3);

-		// Set up the binding for our UBO. Sadly we can't specify it in the shader like normal people,
+		// Set up the binding for our UBOs. Sadly we can't specify them in the shader like normal people,
 		// As it's an OpenGL 4.2 feature that MacOS doesn't support...
-		uint uboIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms");
-		glUniformBlockBinding(program.handle(), uboIndex, uboBlockBinding);
+		uint fsUBOIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms");
+		glUniformBlockBinding(program.handle(), fsUBOIndex, fsUBOBlockBinding);
+
+		if (usingAcceleratedShader) {
+			uint vertexUBOIndex = glGetUniformBlockIndex(program.handle(), "PICAShaderUniforms");
+			glUniformBlockBinding(program.handle(), vertexUBOIndex, vsUBOBlockBinding);
+		}
+	}
+	glBindBufferBase(GL_UNIFORM_BUFFER, fsUBOBlockBinding, shadergenFragmentUBO);
+	if (usingAcceleratedShader) {
+		glBindBufferBase(GL_UNIFORM_BUFFER, vsUBOBlockBinding, hwShaderUniformUBO);
 	}
-	glBindBufferBase(GL_UNIFORM_BUFFER, uboBlockBinding, shadergenFragmentUBO);

 	// Upload uniform data to our shader's UBO
 	PICA::FragmentUniforms uniforms;
@ -961,6 +970,101 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
 	return program;
 }

+bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) {  // Binds the program for the next draw; returns true if the accelerated vertex shader path is used
+	// First we figure out if we will be using an ubershader
+	bool usingUbershader = emulatorConfig->useUbershaders;
+	if (usingUbershader) {
+		const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0;
+		const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1;
+
+		// Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using N or more lights via shadergen
+		// This way we generate fewer shaders overall than with full shadergen, but don't tank performance
+		if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) {
+			usingUbershader = false;
+		}
+	}
+
+	// Then we figure out if we will use hw accelerated shaders, and try to fetch our shader
+	// TODO: Ubershader support for accelerated shaders
+	usingAcceleratedShader = emulatorConfig->accelerateShaders && !usingUbershader && accel != nullptr && accel->canBeAccelerated;
+
+	if (usingAcceleratedShader) {
+		PICA::VertConfig vertexConfig(shaderUnit.vs, regs, usingUbershader);
+
+		std::optional<OpenGL::Shader>& shader = shaderCache.vertexShaderCache[vertexConfig];
+		// If the optional is empty, we have never tried to recompile the shader before. Try to recompile it and see if it works.
+		if (!shader.has_value()) {
+			// Initialize shader to a "null" shader (handle == 0), so a failed compilation is remembered and not retried on later draws
+			shader = OpenGL::Shader();
+
+			std::string picaShaderSource = PICA::ShaderGen::decompileShader(
+				shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint,
+				Helpers::isAndroid() ? PICA::ShaderGen::API::GLES : PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL
+			);
+
+			// Empty source means compilation error, if the source is not empty then we convert the recompiled PICA code into a valid shader and upload
+			// it to the GPU
+			if (!picaShaderSource.empty()) {
+				std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, vertexConfig, usingUbershader);
+				shader->create({vertexShaderSource}, OpenGL::Vertex);
+			}
+		}
+
+		// Shader generation did not work out, so set usingAcceleratedShader to false and let this draw take the non-accelerated path
+		if (!shader->exists()) {
+			usingAcceleratedShader = false;
+		} else {
+			generatedVertexShader = &(*shader);
+			gl.bindUBO(hwShaderUniformUBO);
+
+			if (shaderUnit.vs.uniformsDirty) {  // Only re-upload the PICA uniforms when they were flagged dirty
+				shaderUnit.vs.uniformsDirty = false;
+				glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer());
+			}
+
+			performIndexedRender = accel->indexed;
+			minimumIndex = GLsizei(accel->minimumIndex);
+			maximumIndex = GLsizei(accel->maximumIndex);
+
+			// Upload vertex data and index buffer data to our GPU
+			accelerateVertexUpload(shaderUnit, accel);
+		}
+	}
+
+	if (!usingUbershader) {
+		OpenGL::Program& program = getSpecializedShader();
+		gl.useProgram(program);
+	} else { // Bind ubershader & load ubershader uniforms
+		gl.useProgram(triangleProgram);
+
+		const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
+		const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
+		const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1;
+
+		if (oldDepthScale != depthScale) {  // The depth uniforms are cached: skip the GL call when the value is unchanged
+			oldDepthScale = depthScale;
+			glUniform1f(ubershaderData.depthScaleLoc, depthScale);
+		}
+
+		if (oldDepthOffset != depthOffset) {
+			oldDepthOffset = depthOffset;
+			glUniform1f(ubershaderData.depthOffsetLoc, depthOffset);
+		}
+
+		if (oldDepthmapEnable != depthMapEnable) {
+			oldDepthmapEnable = depthMapEnable;
+			glUniform1i(ubershaderData.depthmapEnableLoc, depthMapEnable);
+		}
+
+		// Upload PICA Registers as a single uniform. The shader needs access to the rasterizer registers (for depth, starting from index 0x48),
+		// the texturing and the fragment lighting registers. Therefore we upload them all in one go to avoid multiple slow uniform updates
+		glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]);
+		setupUbershaderTexEnv();
+	}
+
+	return usingAcceleratedShader;
+}
+
 void RendererGL::screenshot(const std::string& name) {
 	constexpr uint width = 400;
 	constexpr uint height = 2 * 240;
@ -974,7 +1078,7 @@ void RendererGL::screenshot(const std::string& name) {

 	// Flip the image vertically
 	for (int y = 0; y < height; y++) {
-		memcpy(&flippedPixels[y * width * 4], &pixels[(height - y - 1) * width * 4], width * 4);
+		std::memcpy(&flippedPixels[y * width * 4], &pixels[(height - y - 1) * width * 4], width * 4);
 		// Swap R and B channels
 		for (int x = 0; x < width; x++) {
 			std::swap(flippedPixels[y * width * 4 + x * 4 + 0], flippedPixels[y * width * 4 + x * 4 + 2]);
@ -986,21 +1090,12 @@ void RendererGL::screenshot(const std::string& name) {
 	stbi_write_png(name.c_str(), width, height, 4, flippedPixels.data(), 0);
 }

-void RendererGL::clearShaderCache() {
-	for (auto& shader : shaderCache) {
-		CachedProgram& cachedProgram = shader.second;
-		cachedProgram.program.free();
-	}
-
-	shaderCache.clear();
-}
-
 void RendererGL::deinitGraphicsContext() {
 	// Invalidate all surface caches since they'll no longer be valid
 	textureCache.reset();
 	depthBufferCache.reset();
 	colourBufferCache.reset();
-	clearShaderCache();
+	shaderCache.clear();

 	// All other GL objects should be invalidated automatically and be recreated by the next call to initGraphicsContext
 	// TODO: Make it so that depth and colour buffers get written back to 3DS memory
@ -1049,3 +1144,92 @@ void RendererGL::initUbershader(OpenGL::Program& program) {
 	glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2);
 	glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3);
 }
+
+void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) {  // Copies this draw's index/vertex data into the hw buffers and sets up hwShaderVAO's attributes
+	u32 buffer = 0;  // Vertex buffer index for non-fixed attributes. NOTE(review): never read in this function
+	u32 attrCount = 0;  // NOTE(review): never read in this function
+
+	const u32 totalAttribCount = accel->totalAttribCount;  // NOTE(review): never read in this function
+
+	static constexpr GLenum attributeFormats[4] = {
+		GL_BYTE,           // 0: Signed byte
+		GL_UNSIGNED_BYTE,  // 1: Unsigned byte
+		GL_SHORT,          // 2: Short
+		GL_FLOAT,          // 3: Float
+	};
+
+	const u32 vertexCount = accel->maximumIndex - accel->minimumIndex + 1;  // NOTE(review): never read in this function
+
+	// Update index buffer if necessary: copy VertexCountReg indices (u16 or u8 each) from accel->indexBuffer into hwIndexBuffer
+	if (accel->indexed) {
+		usingShortIndices = accel->useShortIndices;
+		const usize indexBufferSize = regs[PICA::InternalRegs::VertexCountReg] * (usingShortIndices ? sizeof(u16) : sizeof(u8));
+
+		hwIndexBuffer->Bind();
+		auto indexBufferRes = hwIndexBuffer->Map(4, indexBufferSize);
+		hwIndexBufferOffset = reinterpret_cast<void*>(usize(indexBufferRes.buffer_offset));  // Offset kept as a pointer, presumably for the indexed draw call
+
+		std::memcpy(indexBufferRes.pointer, accel->indexBuffer, indexBufferSize);
+		hwIndexBuffer->Unmap(indexBufferSize);
+	}
+
+	hwVertexBuffer->Bind();
+	auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize);
+	u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer);
+	const u32 vertexBufferOffset = vertexBufferRes.buffer_offset;
+
+	gl.bindVAO(hwShaderVAO);
+
+	// Enable or disable vertex attributes as needed
+	const u32 currentAttributeMask = accel->enabledAttributeMask;
+	// Use bitwise xor to calculate which attributes changed since the mask recorded in previousAttributeMask
+	u32 attributeMaskDiff = currentAttributeMask ^ previousAttributeMask;
+	
+	while (attributeMaskDiff != 0) {
+		// Get the index of the highest set bit (the next changed attribute) and clear that bit from the diff mask
+		const u32 index = 31 - std::countl_zero<u32>(attributeMaskDiff);
+		const u32 mask = 1u << index;
+		attributeMaskDiff ^= mask;
+
+		if ((currentAttributeMask & mask) != 0) {
+			// Attribute was disabled and is now enabled
+			hwShaderVAO.enableAttribute(index);
+		} else {
+			// Attribute was enabled and is now disabled
+			hwShaderVAO.disableAttribute(index);
+		}
+	}
+
+	previousAttributeMask = currentAttributeMask;
+
+	// Upload the data for each (enabled) attribute loader into our vertex buffer, packed back-to-back in loader order
+	for (int i = 0; i < accel->totalLoaderCount; i++) {
+		auto& loader = accel->loaders[i];
+
+		std::memcpy(vertexData, loader.data, loader.size);
+		vertexData += loader.size;
+	}
+
+	hwVertexBuffer->Unmap(accel->vertexDataSize);
+
+	// Iterate over the 16 PICA input registers and configure how they should be fetched.
+	for (int i = 0; i < 16; i++) {
+		const auto& attrib = accel->attributeInfo[i];
+		const u32 attributeMask = 1u << i;
+
+		if (accel->fixedAttributes & attributeMask) {
+			auto& attrValue = fixedAttrValues[i];
+			// This is a fixed attribute, so set its fixed value, but only if it differs from the value cached in fixedAttrValues
+			if (attrValue[0] != attrib.fixedValue[0] || attrValue[1] != attrib.fixedValue[1] || attrValue[2] != attrib.fixedValue[2] ||
+				attrValue[3] != attrib.fixedValue[3]) {
+				std::memcpy(attrValue.data(), attrib.fixedValue.data(), sizeof(attrib.fixedValue));
+				glVertexAttrib4f(i, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]);
+			}
+		} else if (accel->enabledAttributeMask & attributeMask) {
+			glVertexAttribPointer(
+				i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride,
+				reinterpret_cast<GLvoid*>(vertexBufferOffset + attrib.offset)  // Buffer offset of the range mapped above plus attrib.offset
+			);
+		}
+	}
+}
--- a/src/libretro_core.cpp
+++ b/src/libretro_core.cpp
@ -163,8 +163,9 @@ static int fetchVariableRange(std::string key, int min, int max) {

 static void configInit() {
 	static const retro_variable values[] = {
-		{"panda3ds_use_shader_jit", EmulatorConfig::shaderJitDefault ? "Enable shader JIT; enabled|disabled"
-																	  : "Enable shader JIT; disabled|enabled"},
+		{"panda3ds_use_shader_jit", EmulatorConfig::shaderJitDefault ? "Enable shader JIT; enabled|disabled" : "Enable shader JIT; disabled|enabled"},
+		{"panda3ds_accelerate_shaders",
+		 EmulatorConfig::accelerateShadersDefault ? "Run 3DS shaders on the GPU; enabled|disabled" : "Run 3DS shaders on the GPU; disabled|enabled"},
 		{"panda3ds_accurate_shader_mul", "Enable accurate shader multiplication; disabled|enabled"},
 		{"panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault ? "Use ubershaders (No stutter, maybe slower); enabled|disabled"
 																	  : "Use ubershaders (No stutter, maybe slower); disabled|enabled"},
@ -197,6 +198,8 @@ static void configUpdate() {
 	config.sdWriteProtected = fetchVariableBool("panda3ds_write_protect_virtual_sd", false);
 	config.accurateShaderMul = fetchVariableBool("panda3ds_accurate_shader_mul", false);
 	config.useUbershaders = fetchVariableBool("panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault);
+	config.accelerateShaders = fetchVariableBool("panda3ds_accelerate_shaders", EmulatorConfig::accelerateShadersDefault);
+
 	config.forceShadergenForLights = fetchVariableBool("panda3ds_ubershader_lighting_override", true);
 	config.lightShadergenThreshold = fetchVariableRange("panda3ds_ubershader_lighting_override_threshold", 1, 8);
 	config.discordRpcEnabled = false;