Merge pull request #98 from Wunkolo/modular-gl

Allow conditional OpenGL rendering backend
2025-07-04 06:16:20 +12:00 · 2023-07-16 03:48:07 +03:00 · 2023-07-16 03:48:07 +03:00 · 786c3e8a5c
commit 786c3e8a5c
parent 8f91b99672 a601686cb1
18 changed files with 545 additions and 407 deletions
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@ -2,19 +2,28 @@

 #include <array>
 #include <bitset>
-#include <cstdio>
 #include <cstddef>
+#include <cstdio>

 #include "PICA/float_types.hpp"
 #include "PICA/regs.hpp"

+#ifdef PANDA3DS_ENABLE_OPENGL
+#include "renderer_gl/renderer_gl.hpp"
+#endif
+
 using namespace Floats;

 // Note: For when we have multiple backends, the GL state manager can stay here and have the constructor for the Vulkan-or-whatever renderer ignore it
 // Thus, our GLStateManager being here does not negatively impact renderer-agnosticness
-GPU::GPU(Memory& mem, GLStateManager& gl, EmulatorConfig& config) : mem(mem), renderer(*this, gl, regs), config(config) {
+GPU::GPU(Memory& mem, EmulatorConfig& config) : mem(mem), config(config) {
 	vram = new u8[vramSize];
-	mem.setVRAM(vram); // Give the bus a pointer to our VRAM
+	mem.setVRAM(vram);  // Give the bus a pointer to our VRAM
+
+	// TODO: Configurable backend
+#ifdef PANDA3DS_ENABLE_OPENGL
+	renderer.reset(new RendererGL(*this, regs));
+#endif
 }

 void GPU::reset() {
@ -41,7 +50,7 @@ void GPU::reset() {
 		e.config2 = 0;
 	}

-	renderer.reset();
+	renderer->reset();
 }

 // Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
@ -73,15 +82,14 @@ void GPU::drawArrays() {
 	// Base address for vertex attributes
 	// The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible
 	const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16;
-	const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg]; // Total # of vertices to transfer
+	const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];  // Total # of vertices to transfer

 	// Configures the type of primitive and the number of vertex shader outputs
 	const u32 primConfig = regs[PICA::InternalRegs::PrimitiveConfig];
 	const PICA::PrimType primType = static_cast<PICA::PrimType>(Helpers::getBits<8, 2>(primConfig));
 	if (vertexCount > Renderer::vertexBufferSize) Helpers::panic("[PICA] vertexCount > vertexBufferSize");

-	if ((primType == PICA::PrimType::TriangleList && vertexCount % 3) ||
-		(primType == PICA::PrimType::TriangleStrip && vertexCount < 3) ||
+	if ((primType == PICA::PrimType::TriangleList && vertexCount % 3) || (primType == PICA::PrimType::TriangleStrip && vertexCount < 3) ||
 		(primType == PICA::PrimType::TriangleFan && vertexCount < 3)) {
 		Helpers::panic("Invalid vertex count for primitive. Type: %d, vert count: %d\n", primType, vertexCount);
 	}
@ -89,10 +97,10 @@ void GPU::drawArrays() {
 	// Get the configuration for the index buffer, used only for indexed drawing
 	u32 indexBufferConfig = regs[PICA::InternalRegs::IndexBufferConfig];
 	u32 indexBufferPointer = vertexBase + (indexBufferConfig & 0xfffffff);
-	bool shortIndex = Helpers::getBit<31>(indexBufferConfig); // Indicates whether vert indices are 16-bit or 8-bit
+	bool shortIndex = Helpers::getBit<31>(indexBufferConfig);  // Indicates whether vert indices are 16-bit or 8-bit

 	// Stuff the global attribute config registers in one u64 to make attr parsing easier
-	// TODO: Cache this when the vertex attribute format registers are written to 
+	// TODO: Cache this when the vertex attribute format registers are written to
 	u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32);

 	if constexpr (!indexed) {
@ -111,24 +119,24 @@ void GPU::drawArrays() {
 	constexpr size_t vertexCacheSize = 64;

 	struct {
-		std::bitset<vertexCacheSize> validBits{0};           // Shows which tags are valid. If the corresponding bit is 1, then there's an entry
-		std::array<u32, vertexCacheSize> ids;                // IDs (ie indices of the cached vertices in the 3DS vertex buffer)
-		std::array<u32, vertexCacheSize> bufferPositions;    // Positions of the cached vertices in our own vertex buffer
+		std::bitset<vertexCacheSize> validBits{0};         // Shows which tags are valid. If the corresponding bit is 1, then there's an entry
+		std::array<u32, vertexCacheSize> ids;              // IDs (ie indices of the cached vertices in the 3DS vertex buffer)
+		std::array<u32, vertexCacheSize> bufferPositions;  // Positions of the cached vertices in our own vertex buffer
 	} vertexCache;
-		
+
 	for (u32 i = 0; i < vertexCount; i++) {
-		u32 vertexIndex; // Index of the vertex in the VBO for indexed rendering
+		u32 vertexIndex;  // Index of the vertex in the VBO for indexed rendering

 		if constexpr (!indexed) {
 			vertexIndex = i + regs[PICA::InternalRegs::VertexOffsetReg];
 		} else {
 			if (shortIndex) {
 				auto ptr = getPointerPhys<u16>(indexBufferPointer);
-				vertexIndex = *ptr; // TODO: This is very unsafe
+				vertexIndex = *ptr;  // TODO: This is very unsafe
 				indexBufferPointer += 2;
 			} else {
 				auto ptr = getPointerPhys<u8>(indexBufferPointer);
-				vertexIndex = *ptr; // TODO: This is also very unsafe
+				vertexIndex = *ptr;  // TODO: This is also very unsafe
 				indexBufferPointer += 1;
 			}
 		}
@ -152,22 +160,22 @@ void GPU::drawArrays() {
 		}

 		int attrCount = 0;
-		int buffer = 0; // Vertex buffer index for non-fixed attributes
+		int buffer = 0;  // Vertex buffer index for non-fixed attributes

 		while (attrCount < totalAttribCount) {
 			// Check if attribute is fixed or not
-			if (fixedAttribMask & (1 << attrCount)) { // Fixed attribute
-				vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount]; // TODO: Is this how it works?
+			if (fixedAttribMask & (1 << attrCount)) {                         // Fixed attribute
+				vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount];  // TODO: Is this how it works?
 				vec4f& inputAttr = currentAttributes[attrCount];
-				std::memcpy(&inputAttr, &fixedAttr, sizeof(vec4f)); // Copy fixed attr to input attr
+				std::memcpy(&inputAttr, &fixedAttr, sizeof(vec4f));  // Copy fixed attr to input attr
 				attrCount++;
-			} else { // Non-fixed attribute
-				auto& attr = attributeInfo[buffer]; // Get information for this attribute
-				u64 attrCfg = attr.getConfigFull(); // Get config1 | (config2 << 32)
+			} else {                                 // Non-fixed attribute
+				auto& attr = attributeInfo[buffer];  // Get information for this attribute
+				u64 attrCfg = attr.getConfigFull();  // Get config1 | (config2 << 32)
 				u32 attrAddress = vertexBase + attr.offset + (vertexIndex * attr.size);

 				for (int j = 0; j < attr.componentCount; j++) {
-					uint index = (attrCfg >> (j * 4)) & 0xf; // Get index of attribute in vertexCfg
+					uint index = (attrCfg >> (j * 4)) & 0xf;  // Get index of attribute in vertexCfg

 					// Vertex attributes used as padding
 					// 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively
@ -179,15 +187,15 @@ void GPU::drawArrays() {
 					}

 					u32 attribInfo = (vertexCfg >> (index * 4)) & 0xf;
-					u32 attribType = attribInfo & 0x3; //  Type of attribute(sbyte/ubyte/short/float)
-					u32 size = (attribInfo >> 2) + 1; // Total number of components
+					u32 attribType = attribInfo & 0x3;  //  Type of attribute(sbyte/ubyte/short/float)
+					u32 size = (attribInfo >> 2) + 1;   // Total number of components

-					//printf("vertex_attribute_strides[%d] = %d\n", attrCount, attr.size);
+					// printf("vertex_attribute_strides[%d] = %d\n", attrCount, attr.size);
 					vec4f& attribute = currentAttributes[attrCount];
-					uint component; // Current component
+					uint component;  // Current component

 					switch (attribType) {
-						case 0: { // Signed byte
+						case 0: {  // Signed byte
 							s8* ptr = getPointerPhys<s8>(attrAddress);
 							for (component = 0; component < size; component++) {
 								float val = static_cast<float>(*ptr++);
@ -197,7 +205,7 @@ void GPU::drawArrays() {
 							break;
 						}

-						case 1: { // Unsigned byte
+						case 1: {  // Unsigned byte
 							u8* ptr = getPointerPhys<u8>(attrAddress);
 							for (component = 0; component < size; component++) {
 								float val = static_cast<float>(*ptr++);
@ -207,7 +215,7 @@ void GPU::drawArrays() {
 							break;
 						}

-						case 2: { // Short
+						case 2: {  // Short
 							s16* ptr = getPointerPhys<s16>(attrAddress);
 							for (component = 0; component < size; component++) {
 								float val = static_cast<float>(*ptr++);
@ -217,7 +225,7 @@ void GPU::drawArrays() {
 							break;
 						}

-						case 3: { // Float
+						case 3: {  // Float
 							float* ptr = getPointerPhys<float>(attrAddress);
 							for (component = 0; component < size; component++) {
 								float val = *ptr++;
@ -251,8 +259,8 @@ void GPU::drawArrays() {
 			const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
 			std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
 		}
-		
-        if constexpr (useShaderJIT) {
+
+		if constexpr (useShaderJIT) {
 			shaderJIT.run(shaderUnit.vs);
 		} else {
 			shaderUnit.vs.run();
@ -264,14 +272,14 @@ void GPU::drawArrays() {
 		for (int i = 0; i < totalShaderOutputs; i++) {
 			const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i];

-			for (int j = 0; j < 4; j++) { // pls unroll
+			for (int j = 0; j < 4; j++) {  // pls unroll
 				const u32 mapping = (config >> (j * 8)) & 0x1F;
 				out.raw[mapping] = shaderUnit.vs.outputs[i][j];
 			}
 		}
 	}

-	renderer.drawVertices(primType, std::span(vertices).first(vertexCount));
+	renderer->drawVertices(primType, std::span(vertices).first(vertexCount));
 }

 PICA::Vertex GPU::getImmediateModeVertex() {
@ -289,7 +297,9 @@ PICA::Vertex GPU::getImmediateModeVertex() {
 	std::memcpy(&v.s.colour, &shaderUnit.vs.outputs[1], sizeof(vec4f));
 	std::memcpy(&v.s.texcoord0, &shaderUnit.vs.outputs[2], 2 * sizeof(f24));

-	printf("(x, y, z, w) = (%f, %f, %f, %f)\n", (double)v.s.positions[0], (double)v.s.positions[1], (double)v.s.positions[2], (double)v.s.positions[3]);
+	printf(
+		"(x, y, z, w) = (%f, %f, %f, %f)\n", (double)v.s.positions[0], (double)v.s.positions[1], (double)v.s.positions[2], (double)v.s.positions[3]
+	);
 	printf("(r, g, b, a) = (%f, %f, %f, %f)\n", (double)v.s.colour[0], (double)v.s.colour[1], (double)v.s.colour[2], (double)v.s.colour[3]);
 	printf("(u, v      ) = (%f, %f)\n", (double)v.s.texcoord0[0], (double)v.s.texcoord0[1]);

--- a/src/core/PICA/regs.cpp
+++ b/src/core/PICA/regs.cpp
@ -1,11 +1,12 @@
-#include "PICA/gpu.hpp"
 #include "PICA/regs.hpp"

+#include "PICA/gpu.hpp"
+
 using namespace Floats;
 using namespace Helpers;

 u32 GPU::readReg(u32 address) {
-	if (address >= 0x1EF01000 && address < 0x1EF01C00) { // Internal registers
+	if (address >= 0x1EF01000 && address < 0x1EF01C00) {  // Internal registers
 		const u32 index = (address - 0x1EF01000) / sizeof(u32);
 		return readInternalReg(index);
 	} else {
@ -15,7 +16,7 @@ u32 GPU::readReg(u32 address) {
 }

 void GPU::writeReg(u32 address, u32 value) {
-	if (address >= 0x1EF01000 && address < 0x1EF01C00) { // Internal registers
+	if (address >= 0x1EF01000 && address < 0x1EF01C00) {  // Internal registers
 		const u32 index = (address - 0x1EF01000) / sizeof(u32);
 		writeInternalReg(index, value, 0xffffffff);
 	} else {
@ -59,7 +60,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 	}

 	u32 currentValue = regs[index];
-	u32 newValue = (currentValue & ~mask) | (value & mask); // Only overwrite the bits specified by "mask"
+	u32 newValue = (currentValue & ~mask) | (value & mask);  // Only overwrite the bits specified by "mask"
 	regs[index] = newValue;

 	// TODO: Figure out if things like the shader index use the unmasked value or the masked one
@ -74,38 +75,38 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 			break;

 		case AttribFormatHigh:
-			totalAttribCount = (value >> 28) + 1; // Total number of vertex attributes
-			fixedAttribMask = getBits<16, 12>(value); // Determines which vertex attributes are fixed for all vertices
+			totalAttribCount = (value >> 28) + 1;      // Total number of vertex attributes
+			fixedAttribMask = getBits<16, 12>(value);  // Determines which vertex attributes are fixed for all vertices
 			break;

 		case ColourBufferLoc: {
 			u32 loc = (value & 0x0fffffff) << 3;
-			renderer.setColourBufferLoc(loc);
+			renderer->setColourBufferLoc(loc);
 			break;
 		};

 		case ColourBufferFormat: {
 			u32 format = getBits<16, 3>(value);
-			renderer.setColourFormat(static_cast<PICA::ColorFmt>(format));
+			renderer->setColourFormat(static_cast<PICA::ColorFmt>(format));
 			break;
 		}

 		case DepthBufferLoc: {
 			u32 loc = (value & 0x0fffffff) << 3;
-			renderer.setDepthBufferLoc(loc);
+			renderer->setDepthBufferLoc(loc);
 			break;
 		}

 		case DepthBufferFormat: {
 			u32 format = value & 0x3;
-			renderer.setDepthFormat(static_cast<PICA::DepthFmt>(format));
+			renderer->setDepthFormat(static_cast<PICA::DepthFmt>(format));
 			break;
 		}

 		case FramebufferSize: {
 			const u32 width = value & 0x7ff;
 			const u32 height = getBits<12, 10>(value) + 1;
-			renderer.setFBSize(width, height);
+			renderer->setFBSize(width, height);
 			break;
 		}

@ -116,7 +117,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 		case LightingLUTData4:
 		case LightingLUTData5:
 		case LightingLUTData6:
-		case LightingLUTData7:{
+		case LightingLUTData7: {
 			const uint32_t index = regs[LightingLUTIndex];  // Get full LUT index register
 			const uint32_t lutID = getBits<8, 5>(index);    // Get which LUT we're actually writing to
 			uint32_t lutIndex = getBits<0, 8>(index);       // And get the index inside the LUT we're writing to
@ -133,15 +134,22 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 			break;
 		}

-		case VertexFloatUniformIndex:
+		case VertexFloatUniformIndex: {
 			shaderUnit.vs.setFloatUniformIndex(value);
 			break;
+		}

-		case VertexFloatUniformData0: case VertexFloatUniformData1: case VertexFloatUniformData2:
-		case VertexFloatUniformData3: case VertexFloatUniformData4: case VertexFloatUniformData5:
-		case VertexFloatUniformData6: case VertexFloatUniformData7:
+		case VertexFloatUniformData0:
+		case VertexFloatUniformData1:
+		case VertexFloatUniformData2:
+		case VertexFloatUniformData3:
+		case VertexFloatUniformData4:
+		case VertexFloatUniformData5:
+		case VertexFloatUniformData6:
+		case VertexFloatUniformData7: {
 			shaderUnit.vs.uploadFloatUniform(value);
 			break;
+		}

 		case FixedAttribIndex:
 			fixedAttribCount = 0;
@ -162,7 +170,9 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 			}
 			break;

-		case FixedAttribData0: case FixedAttribData1: case FixedAttribData2:
+		case FixedAttribData0:
+		case FixedAttribData1:
+		case FixedAttribData2:
 			fixedAttrBuff[fixedAttribCount++] = value;

 			if (fixedAttribCount == 3) {
@ -170,15 +180,15 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {

 				vec4f attr;
 				// These are stored in the reverse order anyone would expect them to be in
-				attr.x() = f24::fromRaw(fixedAttrBuff[2] & 0xffffff);
-				attr.y() = f24::fromRaw(((fixedAttrBuff[1] & 0xffff) << 8) | (fixedAttrBuff[2] >> 24));
-				attr.z() = f24::fromRaw(((fixedAttrBuff[0] & 0xff) << 16) | (fixedAttrBuff[1] >> 16));
-				attr.w() = f24::fromRaw(fixedAttrBuff[0] >> 8);
+				attr[0] = f24::fromRaw(fixedAttrBuff[2] & 0xffffff);
+				attr[1] = f24::fromRaw(((fixedAttrBuff[1] & 0xffff) << 8) | (fixedAttrBuff[2] >> 24));
+				attr[2] = f24::fromRaw(((fixedAttrBuff[0] & 0xff) << 16) | (fixedAttrBuff[1] >> 16));
+				attr[3] = f24::fromRaw(fixedAttrBuff[0] >> 8);

 				// If the fixed attribute index is < 12, we're just writing to one of the fixed attributes
 				if (fixedAttribIndex < 12) [[likely]] {
 					shaderUnit.vs.fixedAttributes[fixedAttribIndex++] = attr;
-				} else if (fixedAttribIndex == 15) { // Otherwise if it's 15, we're submitting an immediate mode vertex
+				} else if (fixedAttribIndex == 15) {  // Otherwise if it's 15, we're submitting an immediate mode vertex
 					const uint totalAttrCount = (regs[PICA::InternalRegs::VertexShaderAttrNum] & 0xf) + 1;
 					if (totalAttrCount <= immediateModeAttrIndex) {
 						printf("Broken state in the immediate mode vertex submission pipeline. Failing silently\n");
@ -199,13 +209,15 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 						// If we've reached 3 verts, issue a draw call
 						// Handle rendering depending on the primitive type
 						if (immediateModeVertIndex == 3) {
-							renderer.drawVertices(PICA::PrimType::TriangleList, immediateModeVertices);
+							renderer->drawVertices(PICA::PrimType::TriangleList, immediateModeVertices);

 							switch (primType) {
 								// Triangle or geometry primitive. Draw a triangle and discard all vertices
-								case 0: case 3:
+								case 0:
+								case 3: {
 									immediateModeVertIndex = 0;
 									break;
+								}

 								// Triangle strip. Draw triangle, discard first vertex and keep the last 2
 								case 1:
@ -223,54 +235,72 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 							}
 						}
 					}
-				} else { // Writing to fixed attributes 13 and 14 probably does nothing, but we'll see
+				} else {  // Writing to fixed attributes 13 and 14 probably does nothing, but we'll see
 					log("Wrote to invalid fixed vertex attribute %d\n", fixedAttribIndex);
 				}
 			}

 			break;

-		case VertexShaderOpDescriptorIndex:
+		case VertexShaderOpDescriptorIndex: {
 			shaderUnit.vs.setOpDescriptorIndex(value);
 			break;
+		}

-		case VertexShaderOpDescriptorData0: case VertexShaderOpDescriptorData1: case VertexShaderOpDescriptorData2:
-		case VertexShaderOpDescriptorData3: case VertexShaderOpDescriptorData4: case VertexShaderOpDescriptorData5:
-		case VertexShaderOpDescriptorData6: case VertexShaderOpDescriptorData7:
+		case VertexShaderOpDescriptorData0:
+		case VertexShaderOpDescriptorData1:
+		case VertexShaderOpDescriptorData2:
+		case VertexShaderOpDescriptorData3:
+		case VertexShaderOpDescriptorData4:
+		case VertexShaderOpDescriptorData5:
+		case VertexShaderOpDescriptorData6:
+		case VertexShaderOpDescriptorData7: {
 			shaderUnit.vs.uploadDescriptor(value);
 			break;
+		}

-		case VertexBoolUniform:
+		case VertexBoolUniform: {
 			shaderUnit.vs.boolUniform = value & 0xffff;
 			break;
+		}

-		case VertexIntUniform0: case VertexIntUniform1: case VertexIntUniform2: case VertexIntUniform3:
+		case VertexIntUniform0:
+		case VertexIntUniform1:
+		case VertexIntUniform2:
+		case VertexIntUniform3: {
 			shaderUnit.vs.uploadIntUniform(index - VertexIntUniform0, value);
 			break;
+		}

-		case VertexShaderData0: case VertexShaderData1: case VertexShaderData2: case VertexShaderData3:
-		case VertexShaderData4: case VertexShaderData5: case VertexShaderData6: case VertexShaderData7:
+		case VertexShaderData0:
+		case VertexShaderData1:
+		case VertexShaderData2:
+		case VertexShaderData3:
+		case VertexShaderData4:
+		case VertexShaderData5:
+		case VertexShaderData6:
+		case VertexShaderData7: {
 			shaderUnit.vs.uploadWord(value);
 			break;
+		}

-		case VertexShaderEntrypoint:
+		case VertexShaderEntrypoint: {
 			shaderUnit.vs.entrypoint = value & 0xffff;
 			break;
+		}

 		case VertexShaderTransferEnd:
 			if (value != 0) shaderUnit.vs.finalize();
 			break;

-		case VertexShaderTransferIndex:
-			shaderUnit.vs.setBufferIndex(value);
-			break;
+		case VertexShaderTransferIndex: shaderUnit.vs.setBufferIndex(value); break;

 		// Command lists can write to the command processor registers and change the command list stream
 		// Several games are known to do this, including New Super Mario Bros 2 and Super Mario 3D Land
 		case CmdBufTrigger0:
 		case CmdBufTrigger1: {
-			if (value != 0) { // A non-zero value triggers command list processing
-				int bufferIndex = index - CmdBufTrigger0; // Index of the command buffer to execute (0 or 1)
+			if (value != 0) {                              // A non-zero value triggers command list processing
+				int bufferIndex = index - CmdBufTrigger0;  // Index of the command buffer to execute (0 or 1)
 				u32 addr = (regs[CmdBufAddr0 + bufferIndex] & 0xfffffff) << 3;
 				u32 size = (regs[CmdBufSize0 + bufferIndex] & 0xfffff) << 3;

@ -285,15 +315,13 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 		default:
 			// Vertex attribute registers
 			if (index >= AttribInfoStart && index <= AttribInfoEnd) {
-				uint attributeIndex = (index - AttribInfoStart) / 3; // Which attribute are we writing to
-				uint reg = (index - AttribInfoStart) % 3; // Which of this attribute's registers are we writing to?
+				uint attributeIndex = (index - AttribInfoStart) / 3;  // Which attribute are we writing to
+				uint reg = (index - AttribInfoStart) % 3;             // Which of this attribute's registers are we writing to?
 				auto& attr = attributeInfo[attributeIndex];

 				switch (reg) {
-					case 0: attr.offset = value & 0xfffffff; break; // Attribute offset
-					case 1: 
-						attr.config1 = value;
-						break;
+					case 0: attr.offset = value & 0xfffffff; break;  // Attribute offset
+					case 1: attr.config1 = value; break;
 					case 2:
 						attr.config2 = value;
 						attr.size = getBits<16, 8>(value);
@ -339,13 +367,13 @@ void GPU::startCommandList(u32 addr, u32 size) {

 		u32 id = header & 0xffff;
 		u32 paramMaskIndex = getBits<16, 4>(header);
-		u32 paramCount = getBits<20, 8>(header); // Number of additional parameters
+		u32 paramCount = getBits<20, 8>(header);  // Number of additional parameters
 		// Bit 31 tells us whether this command is going to write to multiple sequential registers (if the bit is 1)
 		// Or if all written values will go to the same register (If the bit is 0). It's essentially the value that
 		// gets added to the "id" field after each register write
 		bool consecutiveWritingMode = (header >> 31) != 0;

-		u32 mask = maskLUT[paramMaskIndex]; // Actual parameter mask
+		u32 mask = maskLUT[paramMaskIndex];  // Actual parameter mask
 		// Increment the ID by 1 after each write if we're in consecutive mode, or 0 otherwise
 		u32 idIncrement = (consecutiveWritingMode) ? 1 : 0;

--- a/src/core/PICA/shader_interpreter.cpp
+++ b/src/core/PICA/shader_interpreter.cpp
@ -1,6 +1,7 @@
-#include "PICA/shader.hpp"
 #include <cmath>

+#include "PICA/shader.hpp"
+
 using namespace Helpers;

 void PICAShader::run() {
@ -11,20 +12,23 @@ void PICAShader::run() {

 	while (true) {
 		const u32 instruction = loadedShader[pc++];
-		const u32 opcode = instruction >> 26; // Top 6 bits are the opcode
+		const u32 opcode = instruction >> 26;  // Top 6 bits are the opcode

 		switch (opcode) {
 			case ShaderOpcodes::ADD: add(instruction); break;
 			case ShaderOpcodes::CALL: call(instruction); break;
 			case ShaderOpcodes::CALLC: callc(instruction); break;
 			case ShaderOpcodes::CALLU: callu(instruction); break;
-			case ShaderOpcodes::CMP1: case ShaderOpcodes::CMP2: 
+			case ShaderOpcodes::CMP1:
+			case ShaderOpcodes::CMP2: {
 				cmp(instruction);
 				break;
+			}
+
 			case ShaderOpcodes::DP3: dp3(instruction); break;
 			case ShaderOpcodes::DP4: dp4(instruction); break;
 			case ShaderOpcodes::DPHI: dphi(instruction); break;
-			case ShaderOpcodes::END: return; // Stop running shader
+			case ShaderOpcodes::END: return;  // Stop running shader
 			case ShaderOpcodes::EX2: ex2(instruction); break;
 			case ShaderOpcodes::FLR: flr(instruction); break;
 			case ShaderOpcodes::IFC: ifc(instruction); break;
@ -38,31 +42,47 @@ void PICAShader::run() {
 			case ShaderOpcodes::MOV: mov(instruction); break;
 			case ShaderOpcodes::MOVA: mova(instruction); break;
 			case ShaderOpcodes::MUL: mul(instruction); break;
-			case ShaderOpcodes::NOP: break; // Do nothing
+			case ShaderOpcodes::NOP: break;  // Do nothing
 			case ShaderOpcodes::RCP: rcp(instruction); break;
 			case ShaderOpcodes::RSQ: rsq(instruction); break;
 			case ShaderOpcodes::SGEI: sgei(instruction); break;
 			case ShaderOpcodes::SLT: slt(instruction); break;
 			case ShaderOpcodes::SLTI: slti(instruction); break;

-			case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: case 0x35: case 0x36: case 0x37:
+			case 0x30:
+			case 0x31:
+			case 0x32:
+			case 0x33:
+			case 0x34:
+			case 0x35:
+			case 0x36:
+			case 0x37: {
 				madi(instruction);
 				break;
+			}

-			case 0x38: case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D: case 0x3E: case 0x3F:
+			case 0x38:
+			case 0x39:
+			case 0x3A:
+			case 0x3B:
+			case 0x3C:
+			case 0x3D:
+			case 0x3E:
+			case 0x3F: {
 				mad(instruction);
 				break;
+			}

-			default:Helpers::panic("Unimplemented PICA instruction %08X (Opcode = %02X)", instruction, opcode);
+			default: Helpers::panic("Unimplemented PICA instruction %08X (Opcode = %02X)", instruction, opcode);
 		}

 		// Handle control flow statements. The ordering is important as the priority goes: LOOP > IF > CALL
 		// Handle loop
 		if (loopIndex != 0) {
 			auto& loop = loopInfo[loopIndex - 1];
-			if (pc == loop.endingPC) { // Check if the loop needs to start over
+			if (pc == loop.endingPC) {  // Check if the loop needs to start over
 				loop.iterations -= 1;
-				if (loop.iterations == 0) // If the loop ended, go one level down on the loop stack
+				if (loop.iterations == 0)  // If the loop ended, go one level down on the loop stack
 					loopIndex -= 1;

 				loopCounter += loop.increment;
@ -73,7 +93,7 @@ void PICAShader::run() {
 		// Handle ifs
 		if (ifIndex != 0) {
 			auto& info = conditionalInfo[ifIndex - 1];
-			if (pc == info.endingPC) { // Check if the IF block ended
+			if (pc == info.endingPC) {  // Check if the IF block ended
 				pc = info.newPC;
 				ifIndex -= 1;
 			}
@ -82,7 +102,7 @@ void PICAShader::run() {
 		// Handle calls
 		if (callIndex != 0) {
 			auto& info = callInfo[callIndex - 1];
-			if (pc == info.endingPC) { // Check if the CALL block ended
+			if (pc == info.endingPC) {  // Check if the CALL block ended
 				pc = info.returnPC;
 				callIndex -= 1;
 			}
@ -92,15 +112,15 @@ void PICAShader::run() {

 // Calculate the actual source value using an instruction's source field and its respective index value
 // The index value is used to apply relative addressing when index != 0 by adding one of the 3 addr registers to the
-// source field, but only when the original source field is pointing at a vector uniform register 
+// source field, but only when the original source field is pointing at a vector uniform register
 u8 PICAShader::getIndexedSource(u32 source, u32 index) {
-	if (source < 0x20) // No offset is applied if the source isn't pointing to a vector uniform reg
+	if (source < 0x20)  // No offset is applied if the source isn't pointing to a vector uniform reg
 		return source;

 	switch (index) {
-		case 0: [[likely]] return u8(source); // No offset applied
-		case 1: return u8(source + addrRegister.x());
-		case 2: return u8(source + addrRegister.y());
+		case 0: [[likely]] return u8(source);  // No offset applied
+		case 1: return u8(source + addrRegister[0]);
+		case 2: return u8(source + addrRegister[1]);
 		case 3: return u8(source + loopCounter);
 	}

@ -117,7 +137,7 @@ PICAShader::vec4f PICAShader::getSource(u32 source) {
 		return floatUniforms[source - 0x20];
 	else {
 		Helpers::warn("[PICA] Unimplemented source value: %X\n", source);
-		return vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() });
+		return vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()});
 	}
 }

@ -136,13 +156,13 @@ bool PICAShader::isCondTrue(u32 instruction) {
 	bool refX = (getBit<25>(instruction)) != 0;

 	switch (condition) {
-		case 0: // Either cmp register matches 
+		case 0:  // Either cmp register matches
 			return cmpRegister[0] == refX || cmpRegister[1] == refY;
-		case 1: // Both cmp registers match
+		case 1:  // Both cmp registers match
 			return cmpRegister[0] == refX && cmpRegister[1] == refY;
-		case 2: // At least cmp.x matches
+		case 2:  // At least cmp.x matches
 			return cmpRegister[0] == refX;
-		default: // At least cmp.y matches
+		default:  // At least cmp.y matches
 			return cmpRegister[1] == refY;
 	}
 }
@ -150,7 +170,7 @@ bool PICAShader::isCondTrue(u32 instruction) {
 void PICAShader::add(u32 instruction) {
 	const u32 operandDescriptor = operandDescriptors[instruction & 0x7f];
 	u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);

@ -171,7 +191,7 @@ void PICAShader::add(u32 instruction) {
 void PICAShader::mul(u32 instruction) {
 	const u32 operandDescriptor = operandDescriptors[instruction & 0x7f];
 	u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);

@ -210,7 +230,7 @@ void PICAShader::flr(u32 instruction) {
 void PICAShader::max(u32 instruction) {
 	const u32 operandDescriptor = operandDescriptors[instruction & 0x7f];
 	const u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);

@ -232,7 +252,7 @@ void PICAShader::max(u32 instruction) {
 void PICAShader::min(u32 instruction) {
 	const u32 operandDescriptor = operandDescriptors[instruction & 0x7f];
 	const u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);

@ -278,16 +298,16 @@ void PICAShader::mova(u32 instruction) {
 	vec4f srcVector = getSourceSwizzled<1>(src, operandDescriptor);

 	u32 componentMask = operandDescriptor & 0xf;
-	if (componentMask & 0b1000) // x component
-		addrRegister.x() = static_cast<s32>(srcVector.x().toFloat32());
-	if (componentMask & 0b0100) // y component
-		addrRegister.y() = static_cast<s32>(srcVector.y().toFloat32());
+	if (componentMask & 0b1000)  // x component
+		addrRegister[0] = static_cast<s32>(srcVector[0].toFloat32());
+	if (componentMask & 0b0100)  // y component
+		addrRegister[1] = static_cast<s32>(srcVector[1].toFloat32());
 }

 void PICAShader::dp3(u32 instruction) {
 	const u32 operandDescriptor = operandDescriptors[instruction & 0x7f];
 	u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);

@ -309,7 +329,7 @@ void PICAShader::dp3(u32 instruction) {
 void PICAShader::dp4(u32 instruction) {
 	const u32 operandDescriptor = operandDescriptors[instruction & 0x7f];
 	u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);

@ -480,7 +500,7 @@ void PICAShader::madi(u32 instruction) {
 void PICAShader::slt(u32 instruction) {
 	const u32 operandDescriptor = operandDescriptors[instruction & 0x7f];
 	u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);

@ -542,11 +562,11 @@ void PICAShader::slti(u32 instruction) {
 void PICAShader::cmp(u32 instruction) {
 	const u32 operandDescriptor = operandDescriptors[instruction & 0x7f];
 	const u32 src1 = getBits<12, 7>(instruction);
-	const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 cmpY = getBits<21, 3>(instruction);
 	const u32 cmpX = getBits<24, 3>(instruction);
-	const u32 cmpOperations[2] = { cmpX, cmpY };
+	const u32 cmpOperations[2] = {cmpX, cmpY};

 	if (idx) Helpers::panic("[PICA] CMP: idx != 0");
 	vec4f srcVec1 = getSourceSwizzled<1>(src1, operandDescriptor);
@ -554,33 +574,34 @@ void PICAShader::cmp(u32 instruction) {

 	for (int i = 0; i < 2; i++) {
 		switch (cmpOperations[i]) {
-			case 0: // Equal
+			case 0:  // Equal
 				cmpRegister[i] = srcVec1[i] == srcVec2[i];
 				break;

-			case 1: // Not equal
+			case 1:  // Not equal
 				cmpRegister[i] = srcVec1[i] != srcVec2[i];
 				break;

-			case 2: // Less than
+			case 2:  // Less than
 				cmpRegister[i] = srcVec1[i] < srcVec2[i];
 				break;

-			case 3: // Less than or equal
+			case 3:  // Less than or equal
 				cmpRegister[i] = srcVec1[i] <= srcVec2[i];
 				break;

-			case 4: // Greater than
+			case 4:  // Greater than
 				cmpRegister[i] = srcVec1[i] > srcVec2[i];
 				break;

-			case 5: // Greater than or equal
+			case 5:  // Greater than or equal
 				cmpRegister[i] = srcVec1[i] >= srcVec2[i];
 				break;

-			default:
+			default: {
 				cmpRegister[i] = true;
 				break;
+			}
 		}
 	}
 }
@ -604,7 +625,7 @@ void PICAShader::ifc(u32 instruction) {

 void PICAShader::ifu(u32 instruction) {
 	const u32 dest = getBits<10, 12>(instruction);
-	const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check
+	const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check

 	if (boolUniform & (1 << bit)) {
 		if (ifIndex >= 8) [[unlikely]]
@ -615,8 +636,7 @@ void PICAShader::ifu(u32 instruction) {
 		auto& block = conditionalInfo[ifIndex++];
 		block.endingPC = dest;
 		block.newPC = dest + num;
-	}
-	else {
+	} else {
 		pc = dest;
 	}
 }
@ -637,12 +657,12 @@ void PICAShader::call(u32 instruction) {

 void PICAShader::callc(u32 instruction) {
 	if (isCondTrue(instruction)) {
-		call(instruction); // Pls inline
+		call(instruction);  // Pls inline
 	}
 }

 void PICAShader::callu(u32 instruction) {
-	const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check
+	const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check

 	if (boolUniform & (1 << bit)) {
 		if (callIndex >= 4) [[unlikely]]
@ -664,26 +684,27 @@ void PICAShader::loop(u32 instruction) {
 		Helpers::panic("[PICA] Overflowed loop stack");

 	u32 dest = getBits<10, 12>(instruction);
-	auto& uniform = intUniforms[getBits<22, 2>(instruction)]; // The uniform we'll get loop info from
-	loopCounter = uniform.y();
+	auto& uniform = intUniforms[getBits<22, 2>(instruction)];  // The uniform we'll get loop info from
+	loopCounter = uniform[1];
 	auto& loop = loopInfo[loopIndex++];

 	loop.startingPC = pc;
-	loop.endingPC = dest + 1; // Loop is inclusive so we need + 1 here
-	loop.iterations = uniform.x() + 1;
-	loop.increment = uniform.z();
+	loop.endingPC = dest + 1;  // Loop is inclusive so we need + 1 here
+	loop.iterations = uniform[0] + 1;
+	loop.increment = uniform[2];
 }

 void PICAShader::jmpc(u32 instruction) {
-	if (isCondTrue(instruction))
+	if (isCondTrue(instruction)) {
 		pc = getBits<10, 12>(instruction);
+	}
 }

 void PICAShader::jmpu(u32 instruction) {
-	const u32 test = (instruction & 1) ^ 1; // If the LSB is 0 we want to compare to true, otherwise compare to false
+	const u32 test = (instruction & 1) ^ 1;  // If the LSB is 0 we want to compare to true, otherwise compare to false
 	const u32 dest = getBits<10, 12>(instruction);
-	const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check
+	const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check

-	if (((boolUniform >> bit) & 1) == test) // Jump if the bool uniform is the value we want
+	if (((boolUniform >> bit) & 1) == test)  // Jump if the bool uniform is the value we want
 		pc = dest;
 }
--- a/src/core/PICA/shader_unit.cpp
+++ b/src/core/PICA/shader_unit.cpp
@ -1,4 +1,5 @@
 #include "PICA/shader_unit.hpp"
+
 #include "cityhash.hpp"

 void ShaderUnit::reset() {
@ -18,18 +19,18 @@ void PICAShader::reset() {
 	opDescriptorIndex = 0;
 	f32UniformTransfer = false;

-	const vec4f zero = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() });
+	const vec4f zero = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()});
 	inputs.fill(zero);
 	floatUniforms.fill(zero);
 	outputs.fill(zero);
 	tempRegisters.fill(zero);

 	for (auto& e : intUniforms) {
-		e.x() = e.y() = e.z() = e.w() = 0;
+		e[0] = e[1] = e[2] = e[3] = 0;
 	}

-	addrRegister.x() = 0;
-	addrRegister.y() = 0;
+	addrRegister[0] = 0;
+	addrRegister[1] = 0;
 	loopCounter = 0;

 	codeHashDirty = true;
--- a/src/core/renderer_gl/gl_state.cpp
+++ b/src/core/renderer_gl/gl_state.cpp
@ -0,0 +1,53 @@
+#include "renderer_gl/gl_state.hpp"
+
+void GLStateManager::resetBlend() {  // Put blending back to its reset state: disabled
+	blendEnabled = false;  // Tracked flag is updated together with the GL call below
+	OpenGL::disableBlend();
+}
+
+void GLStateManager::resetColourMask() {  // Set all four tracked colour mask flags to true and apply them
+	redMask = greenMask = blueMask = alphaMask = true;
+	OpenGL::setColourMask(redMask, greenMask, blueMask, alphaMask);  // Passes the just-updated tracked values (all true)
+}
+
+void GLStateManager::resetDepth() {  // Reset depth state: testing disabled, depth mask true, compare function "less"
+	depthEnabled = false;
+	depthMask = true;
+	depthFunc = GL_LESS;
+
+	OpenGL::disableDepth();  // The calls from here on apply the values tracked above
+	OpenGL::setDepthMask(true);
+	OpenGL::setDepthFunc(OpenGL::DepthFunc::Less);  // NOTE(review): presumably the wrapper's equivalent of the GL_LESS tracked above - confirm
+}
+
+void GLStateManager::resetScissor() {  // Disable scissoring and zero out the scissor arguments
+	scissorEnabled = false;  // Tracked flag is updated together with the disable call below
+	OpenGL::disableScissor();
+	OpenGL::setScissor(0, 0, 0, 0);  // All four scissor arguments are reset to 0
+}
+
+void GLStateManager::resetVAO() {  // Clear the tracked VAO handle and bind vertex array 0
+	boundVAO = 0;
+	glBindVertexArray(0);  // Binding 0 leaves no vertex array object bound
+}
+
+void GLStateManager::resetVBO() {  // Clear the tracked VBO handle and bind buffer 0 to GL_ARRAY_BUFFER
+	boundVBO = 0;
+	glBindBuffer(GL_ARRAY_BUFFER, 0);  // Binding 0 leaves no buffer bound to the array-buffer target
+}
+
+void GLStateManager::resetProgram() {  // Clear the tracked program handle and select program 0
+	currentProgram = 0;
+	glUseProgram(0);  // Program 0 means no program object is current
+}
+
+void GLStateManager::reset() {  // Reset every piece of state this manager tracks by calling each reset* helper once
+	resetBlend();  // Fixed-function state: blend, colour mask, depth
+	resetColourMask();
+	resetDepth();
+
+	resetVAO();  // Object bindings: VAO, VBO, program
+	resetVBO();
+	resetProgram();
+	resetScissor();  // Scissor state is reset last
+}
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@ -1,4 +1,7 @@
 #include "renderer_gl/renderer_gl.hpp"
+
+#include <stb_image_write.h>
+
 #include "PICA/float_types.hpp"
 #include "PICA/gpu.hpp"
 #include "PICA/regs.hpp"
@ -576,7 +579,7 @@ const char* displayFragmentShader = R"(
    }
 )";

-void Renderer::reset() {
+void RendererGL::reset() {
 	depthBufferCache.reset();
 	colourBufferCache.reset();
 	textureCache.reset();
@ -592,10 +595,10 @@ void Renderer::reset() {
 		const auto oldProgram = OpenGL::getProgram();

 		gl.useProgram(triangleProgram);
-		
-		oldDepthScale = -1.0; // Default depth scale to -1.0, which is what games typically use
-		oldDepthOffset = 0.0; // Default depth offset to 0
-		oldDepthmapEnable = false; // Enable w buffering
+
+		oldDepthScale = -1.0;       // Default depth scale to -1.0, which is what games typically use
+		oldDepthOffset = 0.0;       // Default depth offset to 0
+		oldDepthmapEnable = false;  // Enable w buffering

 		glUniform1f(depthScaleLoc, oldDepthScale);
 		glUniform1f(depthOffsetLoc, oldDepthOffset);
@ -605,10 +608,12 @@ void Renderer::reset() {
 	}
 }

-void Renderer::initGraphicsContext() {
+void RendererGL::initGraphicsContext() {
+	gl.reset();
+
 	OpenGL::Shader vert(vertexShader, OpenGL::Vertex);
 	OpenGL::Shader frag(fragmentShader, OpenGL::Fragment);
-	triangleProgram.create({ vert, frag });
+	triangleProgram.create({vert, frag});
 	gl.useProgram(triangleProgram);

 	textureEnvSourceLoc = OpenGL::uniformLocation(triangleProgram, "u_textureEnvSource");
@ -630,10 +635,10 @@ void Renderer::initGraphicsContext() {

 	OpenGL::Shader vertDisplay(displayVertexShader, OpenGL::Vertex);
 	OpenGL::Shader fragDisplay(displayFragmentShader, OpenGL::Fragment);
-	displayProgram.create({ vertDisplay, fragDisplay });
+	displayProgram.create({vertDisplay, fragDisplay});

 	gl.useProgram(displayProgram);
-	glUniform1i(OpenGL::uniformLocation(displayProgram, "u_texture"), 0); // Init sampler object
+	glUniform1i(OpenGL::uniformLocation(displayProgram, "u_texture"), 0);  // Init sampler object

 	vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW);
 	gl.bindVBO(vbo);
@ -669,10 +674,10 @@ void Renderer::initGraphicsContext() {
 	dummyVAO.create();

 	// Create texture and framebuffer for the 3DS screen
-	const u32 screenTextureWidth = 400; // Top screen is 400 pixels wide, bottom is 320
-	const u32 screenTextureHeight = 2 * 240; // Both screens are 240 pixels tall
-	
-	glGenTextures(1,&lightLUTTextureArray);
+	const u32 screenTextureWidth = 400;       // Top screen is 400 pixels wide, bottom is 320
+	const u32 screenTextureHeight = 2 * 240;  // Both screens are 240 pixels tall
+
+	glGenTextures(1, &lightLUTTextureArray);

 	auto prevTexture = OpenGL::getTex2D();
 	screenTexture.create(screenTextureWidth, screenTextureHeight, GL_RGBA8);
@ -684,8 +689,7 @@ void Renderer::initGraphicsContext() {
 	screenFramebuffer.createWithDrawTexture(screenTexture);
 	screenFramebuffer.bind(OpenGL::DrawAndReadFramebuffer);

-	if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE)
-		Helpers::panic("Incomplete framebuffer");
+	if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) Helpers::panic("Incomplete framebuffer");

 	// TODO: This should not clear the framebuffer contents. It should load them from VRAM.
 	GLint oldViewport[4];
@ -699,19 +703,32 @@ void Renderer::initGraphicsContext() {
 }

 // Set up the OpenGL blending context to match the emulated PICA
-void Renderer::setupBlending() {
+void RendererGL::setupBlending() {
 	const bool blendingEnabled = (regs[PICA::InternalRegs::ColourOperation] & (1 << 8)) != 0;
-	
+
 	// Map of PICA blending equations to OpenGL blending equations. The unused blending equations are equivalent to equation 0 (add)
 	static constexpr std::array<GLenum, 8> blendingEquations = {
-		GL_FUNC_ADD, GL_FUNC_SUBTRACT, GL_FUNC_REVERSE_SUBTRACT, GL_MIN, GL_MAX, GL_FUNC_ADD, GL_FUNC_ADD, GL_FUNC_ADD
+		GL_FUNC_ADD, GL_FUNC_SUBTRACT, GL_FUNC_REVERSE_SUBTRACT, GL_MIN, GL_MAX, GL_FUNC_ADD, GL_FUNC_ADD, GL_FUNC_ADD,
 	};
-	
+
 	// Map of PICA blending funcs to OpenGL blending funcs. Func = 15 is undocumented and stubbed to GL_ONE for now
 	static constexpr std::array<GLenum, 16> blendingFuncs = {
-		GL_ZERO, GL_ONE, GL_SRC_COLOR, GL_ONE_MINUS_SRC_COLOR, GL_DST_COLOR, GL_ONE_MINUS_DST_COLOR, GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA,
-		GL_DST_ALPHA, GL_ONE_MINUS_DST_ALPHA, GL_CONSTANT_COLOR, GL_ONE_MINUS_CONSTANT_COLOR, GL_CONSTANT_ALPHA, GL_ONE_MINUS_CONSTANT_ALPHA,
-		GL_SRC_ALPHA_SATURATE, GL_ONE
+		GL_ZERO,
+		GL_ONE,
+		GL_SRC_COLOR,
+		GL_ONE_MINUS_SRC_COLOR,
+		GL_DST_COLOR,
+		GL_ONE_MINUS_DST_COLOR,
+		GL_SRC_ALPHA,
+		GL_ONE_MINUS_SRC_ALPHA,
+		GL_DST_ALPHA,
+		GL_ONE_MINUS_DST_ALPHA,
+		GL_CONSTANT_COLOR,
+		GL_ONE_MINUS_CONSTANT_COLOR,
+		GL_CONSTANT_ALPHA,
+		GL_ONE_MINUS_CONSTANT_ALPHA,
+		GL_SRC_ALPHA_SATURATE,
+		GL_ONE,
 	};

 	if (!blendingEnabled) {
@ -743,13 +760,12 @@ void Renderer::setupBlending() {
 	}
 }

-void Renderer::setupTextureEnvState() {
+void RendererGL::setupTextureEnvState() {
 	// TODO: Only update uniforms when the TEV config changed. Use an UBO potentially.

 	static constexpr std::array<u32, 6> ioBases = {
-	  PICA::InternalRegs::TexEnv0Source, PICA::InternalRegs::TexEnv1Source,
-	  PICA::InternalRegs::TexEnv2Source, PICA::InternalRegs::TexEnv3Source,
-	  PICA::InternalRegs::TexEnv4Source, PICA::InternalRegs::TexEnv5Source
+		PICA::InternalRegs::TexEnv0Source, PICA::InternalRegs::TexEnv1Source, PICA::InternalRegs::TexEnv2Source,
+		PICA::InternalRegs::TexEnv3Source, PICA::InternalRegs::TexEnv4Source, PICA::InternalRegs::TexEnv5Source,
 	};

 	u32 textureEnvSourceRegs[6];
@ -775,9 +791,11 @@ void Renderer::setupTextureEnvState() {
 	glUniform1uiv(textureEnvScaleLoc, 6, textureEnvScaleRegs);
 }

-void Renderer::bindTexturesToSlots() {
+void RendererGL::bindTexturesToSlots() {
 	static constexpr std::array<u32, 3> ioBases = {
-	  PICA::InternalRegs::Tex0BorderColor, PICA::InternalRegs::Tex1BorderColor, PICA::InternalRegs::Tex2BorderColor
+		PICA::InternalRegs::Tex0BorderColor,
+		PICA::InternalRegs::Tex1BorderColor,
+		PICA::InternalRegs::Tex2BorderColor,
 	};

 	for (int i = 0; i < 3; i++) {
@ -805,13 +823,13 @@ void Renderer::bindTexturesToSlots() {
 	glActiveTexture(GL_TEXTURE0);
 }

-void Renderer::updateLightingLUT() {
+void RendererGL::updateLightingLUT() {
 	gpu.lightingLUTDirty = false;
-	std::array<u16, GPU::LightingLutSize> u16_lightinglut; 
-	
+	std::array<u16, GPU::LightingLutSize> u16_lightinglut;
+
 	for (int i = 0; i < gpu.lightingLUT.size(); i++) {
-		uint64_t value =  gpu.lightingLUT[i] & ((1 << 12) - 1);
-		u16_lightinglut[i] = value * 65535 / 4095; 
+		uint64_t value = gpu.lightingLUT[i] & ((1 << 12) - 1);
+		u16_lightinglut[i] = value * 65535 / 4095;
 	}

 	glActiveTexture(GL_TEXTURE0 + 3);
@ -824,19 +842,22 @@ void Renderer::updateLightingLUT() {
 	glActiveTexture(GL_TEXTURE0);
 }

-void Renderer::drawVertices(PICA::PrimType primType, std::span<const Vertex> vertices) {
+void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> vertices) {
 	// The fourth type is meant to be "Geometry primitive". TODO: Find out what that is
 	static constexpr std::array<OpenGL::Primitives, 4> primTypes = {
-	  OpenGL::Triangle, OpenGL::TriangleStrip, OpenGL::TriangleFan, OpenGL::Triangle
+		OpenGL::Triangle,
+		OpenGL::TriangleStrip,
+		OpenGL::TriangleFan,
+		OpenGL::Triangle,
 	};
-	const auto primitiveTopology = primTypes[static_cast<usize>(primType)];

+	const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
 	gl.disableScissor();
 	gl.bindVBO(vbo);
 	gl.bindVAO(vao);
 	gl.useProgram(triangleProgram);

-	OpenGL::enableClipPlane(0); // Clipping plane 0 is always enabled
+	OpenGL::enableClipPlane(0);  // Clipping plane 0 is always enabled
 	if (regs[PICA::InternalRegs::ClipEnable] & 1) {
 		OpenGL::enableClipPlane(1);
 	}
@ -852,9 +873,7 @@ void Renderer::drawVertices(PICA::PrimType primType, std::span<const Vertex> ver
 	const int colourMask = getBits<8, 4>(depthControl);
 	gl.setColourMask(colourMask & 1, colourMask & 2, colourMask & 4, colourMask & 8);

-	static constexpr std::array<GLenum, 8> depthModes = {
-		GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL
-	};
+	static constexpr std::array<GLenum, 8> depthModes = {GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL};

 	const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
 	const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
@ -865,7 +884,7 @@ void Renderer::drawVertices(PICA::PrimType primType, std::span<const Vertex> ver
 		oldDepthScale = depthScale;
 		glUniform1f(depthScaleLoc, depthScale);
 	}
-	
+
 	if (oldDepthOffset != depthOffset) {
 		oldDepthOffset = depthOffset;
 		glUniform1f(depthOffsetLoc, depthOffset);
@ -917,7 +936,7 @@ void Renderer::drawVertices(PICA::PrimType primType, std::span<const Vertex> ver
 constexpr u32 topScreenBuffer = 0x1f000000;
 constexpr u32 bottomScreenBuffer = 0x1f05dc00;

-void Renderer::display() {
+void RendererGL::display() {
 	gl.disableScissor();

 	glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
@ -925,7 +944,7 @@ void Renderer::display() {
 	glBlitFramebuffer(0, 0, 400, 480, 0, 0, 400, 480, GL_COLOR_BUFFER_BIT, GL_LINEAR);
 }

-void Renderer::clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) {
+void RendererGL::clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) {
 	return;
 	log("GPU: Clear buffer\nStart: %08X End: %08X\nValue: %08X Control: %08X\n", startAddress, endAddress, value, control);

@ -947,10 +966,10 @@ void Renderer::clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 cont
 	OpenGL::clearColor();
 }

-OpenGL::Framebuffer Renderer::getColourFBO() {
-	//We construct a colour buffer object and see if our cache has any matching colour buffers in it
-	// If not, we allocate a texture & FBO for our framebuffer and store it in the cache 
-	ColourBuffer sampleBuffer(colourBufferLoc, colourBufferFormat, fbSize.x(), fbSize.y());
+OpenGL::Framebuffer RendererGL::getColourFBO() {
+	// We construct a colour buffer object and see if our cache has any matching colour buffers in it
+	//  If not, we allocate a texture & FBO for our framebuffer and store it in the cache
+	ColourBuffer sampleBuffer(colourBufferLoc, colourBufferFormat, fbSize[0], fbSize[1]);
 	auto buffer = colourBufferCache.find(sampleBuffer);

 	if (buffer.has_value()) {
@ -960,9 +979,9 @@ OpenGL::Framebuffer Renderer::getColourFBO() {
 	}
 }

-void Renderer::bindDepthBuffer() {
+void RendererGL::bindDepthBuffer() {
 	// Similar logic as the getColourFBO function
-	DepthBuffer sampleBuffer(depthBufferLoc, depthBufferFormat, fbSize.x(), fbSize.y());
+	DepthBuffer sampleBuffer(depthBufferLoc, depthBufferFormat, fbSize[0], fbSize[1]);
 	auto buffer = depthBufferCache.find(sampleBuffer);
 	GLuint tex;

@ -979,14 +998,14 @@ void Renderer::bindDepthBuffer() {
 	glFramebufferTexture2D(GL_FRAMEBUFFER, attachment, GL_TEXTURE_2D, tex, 0);
 }

-OpenGL::Texture Renderer::getTexture(Texture& tex) {
+OpenGL::Texture RendererGL::getTexture(Texture& tex) {
 	// Similar logic as the getColourFBO/bindDepthBuffer functions
 	auto buffer = textureCache.find(tex);

 	if (buffer.has_value()) {
 		return buffer.value().get().texture;
 	} else {
-		const void* textureData = gpu.getPointerPhys<void*>(tex.location); // Get pointer to the texture data in 3DS memory
+		const void* textureData = gpu.getPointerPhys<void*>(tex.location);  // Get pointer to the texture data in 3DS memory
 		Texture& newTex = textureCache.add(tex);
 		newTex.decodeTexture(textureData);

@ -994,7 +1013,7 @@ OpenGL::Texture Renderer::getTexture(Texture& tex) {
 	}
 }

-void Renderer::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) {
+void RendererGL::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) {
 	const u32 inputWidth = inputSize & 0xffff;
 	const u32 inputGap = inputSize >> 16;

@ -1022,21 +1041,21 @@ void Renderer::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32
 	// Hack: Detect whether we are writing to the top or bottom screen by checking output gap and drawing to the proper part of the output texture
 	// We consider output gap == 320 to mean bottom, and anything else to mean top
 	if (outputGap == 320) {
-		OpenGL::setViewport(40, 0, 320, 240); // Bottom screen viewport
+		OpenGL::setViewport(40, 0, 320, 240);  // Bottom screen viewport
 	} else {
-		OpenGL::setViewport(0, 240, 400, 240); // Top screen viewport
+		OpenGL::setViewport(0, 240, 400, 240);  // Top screen viewport
 	}

-	OpenGL::draw(OpenGL::TriangleStrip, 4); // Actually draw our 3DS screen
+	OpenGL::draw(OpenGL::TriangleStrip, 4);  // Actually draw our 3DS screen
 }

-void Renderer::screenshot(const std::string& name) {
+void RendererGL::screenshot(const std::string& name) {
 	constexpr uint width = 400;
 	constexpr uint height = 2 * 240;

 	std::vector<uint8_t> pixels, flippedPixels;
-	pixels.resize(width *  height * 4);
-	flippedPixels.resize(pixels.size());;
+	pixels.resize(width * height * 4);
+	flippedPixels.resize(pixels.size());

 	OpenGL::bindScreenFramebuffer();
 	glReadPixels(0, 0, width, height, GL_BGRA, GL_UNSIGNED_BYTE, pixels.data());