Moar shader decompiler (#559)

* Renderer: Add prepareForDraw callback * Add fmt submodule and port shader decompiler instructions to it * Add shader acceleration setting * Hook up vertex shaders to shader cache * Shader decompiler: Fix redundant compilations * Shader Decompiler: Fix vertex attribute upload * Shader compiler: Simplify generated code for reading and faster compilation * Further simplify shader decompiler output * Shader decompiler: More smallen-ing * Shader decompiler: Get PICA uniforms uploaded to the GPU * Shader decompiler: Readd clipping * Shader decompiler: Actually `break` on control flow instructions * Shader decompiler: More control flow handling * Shader decompiler: Fix desitnation mask * Shader Decomp: Remove pair member capture in lambda (unsupported on NDK) * Disgusting changes to handle the fact that hw shader shaders are 2x as big * Shader decompiler: Implement proper output semantic mapping * Moar instructions * Shader decompiler: Add FLR/SLT/SLTI/SGE/SGEI * Shader decompiler: Add register indexing * Shader decompiler: Optimize mova with both x and y masked * Shader decompiler: Add DPH/DPHI * Fix shader caching being broken * PICA decompiler: Cache VS uniforms * Simply vertex cache code * Simplify vertex cache code * Shader decompiler: Add loops * Shader decompiler: Implement safe multiplication * Shader decompiler: Implement LG2/EX2 * Shader decompiler: More control flow * Shader decompiler: Fix JMPU condition * Shader decompiler: Convert main function to void * PICA: Start implementing GPU vertex fetch * More hw VAO work * More hw VAO work * More GPU vertex fetch code * Add GL Stream Buffer from Duckstation * GL: Actually upload data to stream buffers * GPU: Cleanup immediate mode handling * Get first renders working with accelerated draws * Shader decompiler: Fix control flow analysis bugs * HW shaders: Accelerate indexed draws * Shader decompiler: Add support for compilation errors * GLSL decompiler: Fall back for LITP * Add Renderdoc scope classes * Fix 
control flow analysis bug * HW shaders: Fix attribute fetch * Rewriting hw vertex fetch * Stream buffer: Fix copy-paste mistake * HW shaders: Fix indexed rendering * HW shaders: Add padding attributes * HW shaders: Avoid redundant glVertexAttrib4f calls * HW shaders: Fix loops * HW shaders: Make generated shaders slightly smaller * Fix libretro build * HW shaders: Fix android * Remove redundant ubershader checks * Set accelerate shader default to true * Shader decompiler: Don't declare VS input attributes as an array * Change ubuntu-latest to Ubuntu 24.04 because Microsoft screwed up their CI again * fix merge conflict bug
2025-07-18 21:11:44 +12:00 · 2024-10-19 16:53:51 +03:00 · 2024-10-19 16:53:51 +03:00 · 49a94a13c5
commit 49a94a13c5
parent afaf18f124
34 changed files with 1877 additions and 253 deletions
--- a/include/PICA/draw_acceleration.hpp
+++ b/include/PICA/draw_acceleration.hpp
@ -0,0 +1,45 @@
+#pragma once
+
+#include <array>
+
+#include "helpers.hpp"
+
+namespace PICA {
+	// Plain-data description of a single draw call for the hardware-accelerated shader path.
+	// GPU::getAcceleratedDrawInfo takes one by reference and Renderer::prepareForDraw takes one by pointer.
+	// None of the members have default initializers, so a default-constructed instance holds indeterminate
+	// values until every field is written.
+	struct DrawAcceleration {
+		// Capacities of the attributeInfo and loaders arrays below
+		static constexpr u32 maxAttribCount = 16;
+		static constexpr u32 maxLoaderCount = 12;
+
+		// Per-attribute layout information
+		struct AttributeInfo {
+			// NOTE(review): presumably byte offset/stride inside the uploaded vertex data - confirm against the code filling this in
+			u32 offset;
+			u32 stride;
+
+			u8 type;
+			u8 componentCount;
+
+			std::array<float, 4> fixedValue;  // For fixed attributes
+		};
+
+		struct Loader {
+			// Data to upload for this loader
+			u8* data;
+			usize size;
+		};
+
+		// Non-owning raw pointer; NOTE(review): presumably only meaningful when "indexed" is set - confirm
+		u8* indexBuffer;
+
+		// Minimum and maximum index in the index buffer for a draw call
+		u16 minimumIndex, maximumIndex;
+		u32 totalAttribCount;
+		u32 totalLoaderCount;
+		u32 enabledAttributeMask;
+		u32 fixedAttributes;
+		u32 vertexDataSize;
+
+		std::array<AttributeInfo, maxAttribCount> attributeInfo;
+		std::array<Loader, maxLoaderCount> loaders;
+
+		bool canBeAccelerated;
+		bool indexed;
+		// NOTE(review): presumably selects 16-bit over 8-bit indices - confirm
+		bool useShortIndices;
+	};
+}  // namespace PICA
--- a/include/PICA/gpu.hpp
+++ b/include/PICA/gpu.hpp
@ -1,6 +1,7 @@
 #pragma once
 #include <array>

+#include "PICA/draw_acceleration.hpp"
 #include "PICA/dynapica/shader_rec.hpp"
 #include "PICA/float_types.hpp"
 #include "PICA/pica_vertex.hpp"
@ -13,6 +14,12 @@
 #include "memory.hpp"
 #include "renderer.hpp"

+// How PICA shaders get executed. Used as a template parameter of GPU::drawArrays
+enum class ShaderExecMode {
+	Interpreter,  // Interpret shaders on the CPU
+	JIT,          // Recompile shaders to CPU machine code
+	Hardware,     // Recompile shaders to host shaders and run them on the GPU
+};
+
 class GPU {
 	static constexpr u32 regNum = 0x300;
 	static constexpr u32 extRegNum = 0x1000;
@ -45,7 +52,7 @@ class GPU {
 	uint immediateModeVertIndex;
 	uint immediateModeAttrIndex;  // Index of the immediate mode attribute we're uploading

-	template <bool indexed, bool useShaderJIT>
+	template <bool indexed, ShaderExecMode mode>
 	void drawArrays();

 	// Silly method of avoiding linking problems. TODO: Change to something less silly
@ -81,6 +88,7 @@ class GPU {
 	std::unique_ptr<Renderer> renderer;
 	PICA::Vertex getImmediateModeVertex();

+	void getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed);
  public:
 	// 256 entries per LUT with each LUT as its own row forming a 2D image 256 * LUT_COUNT
 	// Encoded in PICA native format
--- a/include/PICA/pica_vert_config.hpp
+++ b/include/PICA/pica_vert_config.hpp
@ -0,0 +1,57 @@
+#pragma once
+#include <array>
+#include <cassert>
+#include <cstring>
+#include <type_traits>
+#include <unordered_map>
+
+#include "PICA/pica_hash.hpp"
+#include "PICA/regs.hpp"
+#include "PICA/shader.hpp"
+#include "bitfield.hpp"
+#include "helpers.hpp"
+
+namespace PICA {
+	// Configuration struct identifying a vertex shader variant. It is used as the key of an std::unordered_map
+	// (see the std::hash specialization at the bottom of this file), and both hashing and equality operate on the
+	// raw bytes of the object. Every byte, including padding, must therefore hold a deterministic value.
+	struct VertConfig {
+		PICAHash::HashType shaderHash;
+		PICAHash::HashType opdescHash;
+		u32 entrypoint;
+
+		// PICA registers for configuring shader output->fragment semantic mapping
+		// Value-initialized, so entries at index >= outputCount stay 0
+		std::array<u32, 7> outmaps{};
+		u16 outputMask;
+		u8 outputCount;
+		bool usingUbershader;
+
+		// Pad to 56 bytes so that the compiler won't insert unnecessary padding, which in turn will affect our unordered_map lookup
+		// As the padding will get hashed and memcmp'd...
+		u32 pad{};
+
+		bool operator==(const VertConfig& config) const {
+			// Hash function and equality operator required by std::unordered_map
+			// Bytewise comparison over the whole object
+			return std::memcmp(this, &config, sizeof(VertConfig)) == 0;
+		}
+
+		// Builds the config from the shader's code/opdesc hashes and entrypoint, plus the output-related registers in regs
+		VertConfig(PICAShader& shader, const std::array<u32, 0x300>& regs, bool usingUbershader) : usingUbershader(usingUbershader) {
+			shaderHash = shader.getCodeHash();
+			opdescHash = shader.getOpdescHash();
+			entrypoint = shader.entrypoint;
+
+			// Only the low 3 bits are kept, so outputCount is at most 7 and the loop below stays inside outmaps
+			outputCount = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
+			// The 32-bit register value is implicitly truncated to 16 bits here
+			outputMask = regs[PICA::InternalRegs::VertexShaderOutputMask];
+			for (int i = 0; i < outputCount; i++) {
+				// Mask out unused bits
+				outmaps[i] = regs[PICA::InternalRegs::ShaderOutmap0 + i] & 0x1F1F1F1F;
+			}
+		}
+	};
+}  // namespace PICA
+
+static_assert(sizeof(PICA::VertConfig) == 56);
+
+// std::hash specialization for our vertex config class, hashing the raw bytes of the object
+template <>
+struct std::hash<PICA::VertConfig> {
+	std::size_t operator()(const PICA::VertConfig& config) const noexcept {
+		const char* rawBytes = reinterpret_cast<const char*>(&config);
+		return PICAHash::computeHash(rawBytes, sizeof(PICA::VertConfig));
+	}
+};
--- a/include/PICA/shader.hpp
+++ b/include/PICA/shader.hpp
@ -107,6 +107,11 @@ class PICAShader {
 	alignas(16) std::array<vec4f, 16> inputs;           // Attributes passed to the shader
 	alignas(16) std::array<vec4f, 16> outputs;
 	alignas(16) vec4f dummy = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()});  // Dummy register used by the JIT
+	
+	// We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
+	// We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
+	// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
+	using Hash = PICAHash::HashType;

  protected:
 	std::array<u32, 128> operandDescriptors;
@ -125,14 +130,13 @@ class PICAShader {
 	std::array<CallInfo, 4> callInfo;
 	ShaderType type;

-	// We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
-	// We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
-	// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
-	using Hash = PICAHash::HashType;
-
 	Hash lastCodeHash = 0;    // Last hash computed for the shader code (Used for the JIT caching mechanism)
 	Hash lastOpdescHash = 0;  // Last hash computed for the operand descriptors (Also used for the JIT)

+  public:
+	bool uniformsDirty = false;
+
+  protected:
 	bool codeHashDirty = false;
 	bool opdescHashDirty = false;

@ -284,6 +288,7 @@ class PICAShader {
 				uniform[2] = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16));
 				uniform[3] = f24::fromRaw(floatUniformBuffer[0] >> 8);
 			}
+			uniformsDirty = true;
 		}
 	}

@ -295,6 +300,12 @@ class PICAShader {
 		u[1] = getBits<8, 8>(word);
 		u[2] = getBits<16, 8>(word);
 		u[3] = getBits<24, 8>(word);
+		uniformsDirty = true;
+	}
+
+	void uploadBoolUniform(u32 value) {
+		boolUniform = value;
+		uniformsDirty = true;
 	}

 	void run();
@ -302,6 +313,10 @@ class PICAShader {

 	Hash getCodeHash();
 	Hash getOpdescHash();
+
+	// Returns how big the PICA uniforms are combined. Used for hw accelerated shaders where we upload the uniforms to our GPU.
+	static constexpr usize totalUniformSize() { return sizeof(floatUniforms) + sizeof(intUniforms) + sizeof(boolUniform); }
+	void* getUniformPointer() { return static_cast<void*>(&floatUniforms); }
 };

 static_assert(
--- a/include/PICA/shader_decompiler.hpp
+++ b/include/PICA/shader_decompiler.hpp
@ -1,8 +1,11 @@
 #pragma once
+#include <fmt/format.h>
+
+#include <map>
 #include <set>
 #include <string>
 #include <tuple>
-#include <map>
+#include <utility>
 #include <vector>

 #include "PICA/shader.hpp"
@ -41,9 +44,12 @@ namespace PICA::ShaderGen {
 			explicit Function(u32 start, u32 end) : start(start), end(end) {}
 			bool operator<(const Function& other) const { return AddressRange(start, end) < AddressRange(other.start, other.end); }

-			std::string getIdentifier() const { return "func_" + std::to_string(start) + "_to_" + std::to_string(end); }
-			std::string getForwardDecl() const { return "void " + getIdentifier() + "();\n"; }
-			std::string getCallStatement() const { return getIdentifier() + "()"; }
+			std::string getIdentifier() const { return fmt::format("fn_{}_{}", start, end); }
+			// To handle weird control flow, we have to return from each function a bool that indicates whether or not the shader reached an end
+			// instruction and should thus terminate. This is necessary for games like Rayman and Gravity Falls, which have "END" instructions called
+			// from within functions deep in the callstack
+			std::string getForwardDecl() const { return fmt::format("bool fn_{}_{}();\n", start, end); }
+			std::string getCallStatement() const { return fmt::format("fn_{}_{}()", start, end); }
 		};

 		std::set<Function> functions{};
@ -93,9 +99,11 @@ namespace PICA::ShaderGen {

 		API api;
 		Language language;
+		bool compilationError = false;

 		void compileInstruction(u32& pc, bool& finished);
-		void compileRange(const AddressRange& range);
+		// Compiles range "range" and returns a pair of the end PC and whether we're "finished" with the program (hit an END instruction)
+		std::pair<u32, bool> compileRange(const AddressRange& range);
 		void callFunction(const Function& function);
 		const Function* findFunction(const AddressRange& range);

@ -105,6 +113,7 @@ namespace PICA::ShaderGen {
 		std::string getDest(u32 dest) const;
 		std::string getSwizzlePattern(u32 swizzle) const;
 		std::string getDestSwizzle(u32 destinationMask) const;
+		const char* getCondition(u32 cond, u32 refX, u32 refY);

 		void setDest(u32 operandDescriptor, const std::string& dest, const std::string& value);
 		// Returns if the instruction uses the typical register encodings most instructions use
--- a/include/PICA/shader_gen.hpp
+++ b/include/PICA/shader_gen.hpp
@ -3,6 +3,7 @@

 #include "PICA/gpu.hpp"
 #include "PICA/pica_frag_config.hpp"
+#include "PICA/pica_vert_config.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen_types.hpp"
 #include "helpers.hpp"
@ -31,6 +32,8 @@ namespace PICA::ShaderGen {
 		FragmentGenerator(API api, Language language) : api(api), language(language) {}
 		std::string generate(const PICA::FragmentConfig& config, void* driverInfo = nullptr);
 		std::string getDefaultVertexShader();
+		// For when PICA shader acceleration is enabled. Turns the PICA shader source into a proper vertex shader
+		std::string getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader);

 		void setTarget(API api, Language language) {
 			this->api = api;
--- a/include/PICA/shader_unit.hpp
+++ b/include/PICA/shader_unit.hpp
@ -2,10 +2,9 @@
 #include "PICA/shader.hpp"

 class ShaderUnit {
-
-public:
-	PICAShader vs; // Vertex shader
-	PICAShader gs; // Geometry shader
+  public:
+	PICAShader vs;  // Vertex shader
+	PICAShader gs;  // Geometry shader

 	ShaderUnit() : vs(ShaderType::Vertex), gs(ShaderType::Geometry) {}
 	void reset();
--- a/include/align.hpp
+++ b/include/align.hpp
@ -0,0 +1,99 @@
+// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#pragma once
+
+#include <cstdlib>
+
+#include "helpers.hpp"
+
+#ifdef _MSC_VER
+#include <malloc.h>
+#endif
+
+namespace Common {
+	// Returns true when value is an exact multiple of alignment. alignment does not need to be a power of two
+	template <typename T>
+	constexpr bool isAligned(T value, unsigned int alignment) {
+		const auto remainder = value % static_cast<T>(alignment);
+		return remainder == 0;
+	}
+
+	// Rounds value up to the next multiple of alignment. alignment does not need to be a power of two
+	template <typename T>
+	constexpr T alignUp(T value, unsigned int alignment) {
+		const T step = static_cast<T>(alignment);
+		// Keep the intermediate in its promoted type so small integer types behave exactly like a single expression
+		const auto bumped = value + static_cast<T>(alignment - 1);
+		return bumped / step * step;
+	}
+
+	// Rounds value down to the previous multiple of alignment. alignment does not need to be a power of two
+	template <typename T>
+	constexpr T alignDown(T value, unsigned int alignment) {
+		const T step = static_cast<T>(alignment);
+		return value / step * step;
+	}
+    
+	// Mask-based alignment check: true when none of the bits selected by (alignment - 1) are set in value
+	template <typename T>
+	constexpr bool isAlignedPow2(T value, unsigned int alignment) {
+		const T lowBits = static_cast<T>(alignment - 1);
+		return (value & lowBits) == 0;
+	}
+
+	// Mask-based round-up: adds (alignment - 1) to value and clears the bits selected by (alignment - 1)
+	template <typename T>
+	constexpr T alignUpPow2(T value, unsigned int alignment) {
+		const T lowBits = static_cast<T>(alignment - 1);
+		return (value + lowBits) & static_cast<T>(~lowBits);
+	}
+
+	// Mask-based round-down: clears the bits selected by (alignment - 1) in value
+	template <typename T>
+	constexpr T alignDownPow2(T value, unsigned int alignment) {
+		const T keepMask = static_cast<T>(~static_cast<T>(alignment - 1));
+		return value & keepMask;
+	}
+
+	// Returns true when value has at most one bit set.
+	// Note that this means 0 is also reported as a power of two, since 0 & (0 - 1) == 0
+	template <typename T>
+	constexpr bool isPow2(T value) {
+		return (value & (value - 1)) == 0;
+	}
+
+	// Returns the largest power of two that is less than or equal to value, or 0 when value is 0.
+	// Written for unsigned integer types.
+	template <typename T>
+	constexpr T previousPow2(T value) {
+		if (value == static_cast<T>(0)) return 0;
+
+		// Smear the highest set bit into every bit position below it
+		value |= (value >> 1);
+		value |= (value >> 2);
+		value |= (value >> 4);
+		// sizeof counts bytes, not bits: the wider shifts are needed for types of at least 2, 4 and 8 bytes respectively
+		if constexpr (sizeof(T) >= 2) value |= (value >> 8);
+		if constexpr (sizeof(T) >= 4) value |= (value >> 16);
+		if constexpr (sizeof(T) >= 8) value |= (value >> 32);
+		// value is now all ones from the top bit downwards; subtracting the lower half leaves only the top bit
+		return value - (value >> 1);
+	}
+    
+	// Returns the smallest power of two that is greater than or equal to value, or 0 when value is 0.
+	// Written for unsigned integer types.
+	template <typename T>
+	constexpr T nextPow2(T value) {
+		// https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+		if (value == static_cast<T>(0)) return 0;
+
+		// Decrement first so that exact powers of two map to themselves
+		value--;
+		value |= (value >> 1);
+		value |= (value >> 2);
+		value |= (value >> 4);
+		// sizeof counts bytes, not bits: the wider shifts are needed for types of at least 2, 4 and 8 bytes respectively
+		if constexpr (sizeof(T) >= 2) value |= (value >> 8);
+		if constexpr (sizeof(T) >= 4) value |= (value >> 16);
+		if constexpr (sizeof(T) >= 8) value |= (value >> 32);
+		value++;
+		return value;
+	}
+
+	// Allocates size bytes aligned to alignment, using _aligned_malloc on MSVC and posix_memalign elsewhere.
+	// On the posix_memalign path nullptr is returned when the call fails.
+	// Release the result with alignedFree, which picks the matching deallocation call for each platform
+	ALWAYS_INLINE static void* alignedMalloc(size_t size, size_t alignment) {
+#ifdef _MSC_VER
+		return _aligned_malloc(size, alignment);
+#else
+		// Unaligned sizes are slow on macOS.
+#ifdef __APPLE__
+		// Round size up to a multiple of alignment. Note that isPow2 also accepts an alignment of 0
+		if (isPow2(alignment)) size = (size + alignment - 1) & ~(alignment - 1);
+#endif
+		void* ret = nullptr;
+		return (posix_memalign(&ret, alignment, size) == 0) ? ret : nullptr;
+#endif
+	}
+
+	// Releases memory obtained from alignedMalloc: _aligned_free on MSVC (pairs with _aligned_malloc), free elsewhere (pairs with
+	// posix_memalign)
+	ALWAYS_INLINE static void alignedFree(void* ptr) {
+#ifdef _MSC_VER
+		_aligned_free(ptr);
+#else
+		free(ptr);
+#endif
+	}
+}  // namespace Common
--- a/include/config.hpp
+++ b/include/config.hpp
@ -20,11 +20,13 @@ struct EmulatorConfig {
 #else
 	static constexpr bool ubershaderDefault = true;
 #endif
-
+	static constexpr bool accelerateShadersDefault = true;
+	
 	bool shaderJitEnabled = shaderJitDefault;
-	bool discordRpcEnabled = false;
 	bool useUbershaders = ubershaderDefault;
+	bool accelerateShaders = accelerateShadersDefault;
 	bool accurateShaderMul = false;
+	bool discordRpcEnabled = false;

 	// Toggles whether to force shadergen when there's more than N lights active and we're using the ubershader, for better performance
 	bool forceShadergenForLights = true;
--- a/include/renderdoc.hpp
+++ b/include/renderdoc.hpp
@ -35,4 +35,35 @@ namespace Renderdoc {
 	static void setOutputDir(const std::string& path, const std::string& prefix) {}
 	static constexpr bool isSupported() { return false; }
 }  // namespace Renderdoc
-#endif
+#endif
+
+namespace Renderdoc {
+	// RAII scope class that encloses a Renderdoc capture, as long as it's triggered by triggerCapture
+	// Calls Renderdoc::startCapture on construction and Renderdoc::endCapture on destruction
+	struct Scope {
+		Scope() { Renderdoc::startCapture(); }
+		~Scope() { Renderdoc::endCapture(); }
+
+		// Neither copyable nor movable, so every start is paired with exactly one end
+		Scope(const Scope&) = delete;
+		Scope& operator=(const Scope&) = delete;
+
+		Scope(Scope&&) = delete;
+		Scope& operator=(Scope&&) = delete;
+	};
+
+	// RAII scope class that encloses a Renderdoc capture. Unlike regular Scope it doesn't wait for a trigger, it will always issue the capture
+	// trigger on its own and take a capture
+	struct InstantScope {
+		// Issue the trigger first, then start the capture
+		InstantScope() {
+			Renderdoc::triggerCapture();
+			Renderdoc::startCapture();
+		}
+
+		~InstantScope() { Renderdoc::endCapture(); }
+
+		// Neither copyable nor movable, so every start is paired with exactly one end
+		InstantScope(const InstantScope&) = delete;
+		InstantScope& operator=(const InstantScope&) = delete;
+
+		InstantScope(InstantScope&&) = delete;
+		InstantScope& operator=(InstantScope&&) = delete;
+	};
+}  // namespace Renderdoc
--- a/include/renderer.hpp
+++ b/include/renderer.hpp
@ -1,9 +1,10 @@
 #pragma once
 #include <array>
+#include <optional>
 #include <span>
 #include <string>
-#include <optional>

+#include "PICA/draw_acceleration.hpp"
 #include "PICA/pica_vertex.hpp"
 #include "PICA/regs.hpp"
 #include "helpers.hpp"
@ -21,9 +22,11 @@ enum class RendererType : s8 {
 };

 struct EmulatorConfig;
-class GPU;
 struct SDL_Window;

+class GPU;
+class ShaderUnit;
+
 class Renderer {
  protected:
 	GPU& gpu;
@ -77,7 +80,11 @@ class Renderer {
 	virtual std::string getUbershader() { return ""; }
 	virtual void setUbershader(const std::string& shader) {}

-	virtual void setUbershaderSetting(bool value) {}
+	// This function is called on every draw call before parsing vertex data.
+	// It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between
+	// ubershaders and shadergen, and so on.
+	// Returns whether this draw is eligible for using hardware-accelerated shaders or if shaders should run on the CPU
+	virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) { return false; }

 	// Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window
 #ifdef PANDA3DS_FRONTEND_QT
--- a/include/renderer_gl/gl_state.hpp
+++ b/include/renderer_gl/gl_state.hpp
@ -38,7 +38,6 @@ struct GLStateManager {
 	
 	GLuint stencilMask;
 	GLuint boundVAO;
-	GLuint boundVBO;
 	GLuint currentProgram;
 	GLuint boundUBO;

@ -173,13 +172,6 @@ struct GLStateManager {
 		}
 	}

-	void bindVBO(GLuint handle) {
-		if (boundVBO != handle) {
-			boundVBO = handle;
-			glBindBuffer(GL_ARRAY_BUFFER, handle);
-		}
-	}
-
 	void useProgram(GLuint handle) {
 		if (currentProgram != handle) {
 			currentProgram = handle;
@ -195,7 +187,6 @@ struct GLStateManager {
 	}

 	void bindVAO(const OpenGL::VertexArray& vao) { bindVAO(vao.handle()); }
-	void bindVBO(const OpenGL::VertexBuffer& vbo) { bindVBO(vbo.handle()); }
 	void useProgram(const OpenGL::Program& program) { useProgram(program.handle()); }

 	void setColourMask(bool r, bool g, bool b, bool a) {
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@ -3,15 +3,20 @@
 #include <array>
 #include <cstring>
 #include <functional>
+#include <memory>
+#include <optional>
 #include <span>
 #include <unordered_map>
+#include <utility>

 #include "PICA/float_types.hpp"
 #include "PICA/pica_frag_config.hpp"
 #include "PICA/pica_hash.hpp"
+#include "PICA/pica_vert_config.hpp"
 #include "PICA/pica_vertex.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen.hpp"
+#include "gl/stream_buffer.h"
 #include "gl_driver.hpp"
 #include "gl_state.hpp"
 #include "helpers.hpp"
@ -29,9 +34,11 @@ class RendererGL final : public Renderer {
 	OpenGL::Program triangleProgram;
 	OpenGL::Program displayProgram;

-	OpenGL::VertexArray vao;
+	// VAO for when not using accelerated vertex shaders. Contains attribute declarations matching to the PICA fixed function fragment attributes
+	OpenGL::VertexArray defaultVAO;
+	// VAO for when using accelerated vertex shaders. The PICA vertex shader inputs are passed as attributes without CPU processing.
+	OpenGL::VertexArray hwShaderVAO;
 	OpenGL::VertexBuffer vbo;
-	bool enableUbershader = true;

 	// Data 
 	struct {
@ -54,6 +61,21 @@ class RendererGL final : public Renderer {
 	float oldDepthScale = -1.0;
 	float oldDepthOffset = 0.0;
 	bool oldDepthmapEnable = false;
+	// Set by prepareForDraw, tells us whether the current draw is using hw-accelerated shader
+	bool usingAcceleratedShader = false;
+	bool performIndexedRender = false;
+	bool usingShortIndices = false;
+
+	// Set by prepareForDraw, metadata for indexed renders
+	GLuint minimumIndex = 0;
+	GLuint maximumIndex = 0;
+	void* hwIndexBufferOffset = nullptr;
+
+	// When doing hw shaders, we cache which attributes are enabled in our VAO to avoid having to enable/disable all attributes on each draw
+	u32 previousAttributeMask = 0;
+
+	// Cached pointer to the current vertex shader when using HW accelerated shaders
+	OpenGL::Shader* generatedVertexShader = nullptr;

 	SurfaceCache<DepthBuffer, 16, true> depthBufferCache;
 	SurfaceCache<ColourBuffer, 16, true> colourBufferCache;
@ -71,12 +93,51 @@ class RendererGL final : public Renderer {
 	// We can compile this once and then link it with all other generated fragment shaders
 	OpenGL::Shader defaultShadergenVs;
 	GLuint shadergenFragmentUBO;
+	// UBO for uploading the PICA uniforms when using hw shaders
+	GLuint hwShaderUniformUBO;
+
+	using StreamBuffer = OpenGLStreamBuffer;
+	std::unique_ptr<StreamBuffer> hwVertexBuffer;
+	std::unique_ptr<StreamBuffer> hwIndexBuffer;
+
+	// Cache of fixed attribute values so that we don't do any duplicate updates
+	std::array<std::array<float, 4>, 16> fixedAttrValues;

 	// Cached recompiled fragment shader
 	struct CachedProgram {
 		OpenGL::Program program;
 	};
-	std::unordered_map<PICA::FragmentConfig, CachedProgram> shaderCache;
+
+	// Groups the vertex shader, fragment shader and linked program caches together
+	struct ShaderCache {
+		std::unordered_map<PICA::VertConfig, std::optional<OpenGL::Shader>> vertexShaderCache;
+		std::unordered_map<PICA::FragmentConfig, OpenGL::Shader> fragmentShaderCache;
+
+		// Program cache keyed by a u64 that packs both shader handles:
+		// The vertex shader GLuint goes in the top 32 bits, the fragment shader GLuint in the bottom 32 bits
+		std::unordered_map<u64, CachedProgram> programCache;
+
+		// Frees every cached program and shader, then empties all three maps
+		void clear() {
+			for (auto& [key, cached] : programCache) {
+				cached.program.free();
+			}
+
+			for (auto& [config, maybeShader] : vertexShaderCache) {
+				// An entry may hold no shader at all
+				if (maybeShader) {
+					maybeShader->free();
+				}
+			}
+
+			for (auto& [config, shader] : fragmentShaderCache) {
+				shader.free();
+			}
+
+			programCache.clear();
+			vertexShaderCache.clear();
+			fragmentShaderCache.clear();
+		}
+	};
+	ShaderCache shaderCache;

 	OpenGL::Framebuffer getColourFBO();
 	OpenGL::Texture getTexture(Texture& tex);
@ -95,6 +156,8 @@ class RendererGL final : public Renderer {
 	void updateFogLUT();
 	void initGraphicsContextInternal();

+	void accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel);
+
  public:
 	RendererGL(GPU& gpu, const std::array<u32, regNum>& internalRegs, const std::array<u32, extRegNum>& externalRegs)
 		: Renderer(gpu, internalRegs, externalRegs), fragShaderGen(PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL) {}
@ -112,15 +175,13 @@ class RendererGL final : public Renderer {
 	virtual bool supportsShaderReload() override { return true; }
 	virtual std::string getUbershader() override;
 	virtual void setUbershader(const std::string& shader) override;
-
-	virtual void setUbershaderSetting(bool value) override { enableUbershader = value; }
+	virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) override;
 	
 	std::optional<ColourBuffer> getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true);

 	// Note: The caller is responsible for deleting the currently bound FBO before calling this
 	void setFBO(uint handle) { screenFramebuffer.m_handle = handle; }
 	void resetStateManager() { gl.reset(); }
-	void clearShaderCache();
 	void initUbershader(OpenGL::Program& program);

 #ifdef PANDA3DS_FRONTEND_QT