Merge remote-tracking branch 'upstream/master' into memory_rework

2025-05-26 00:19:12 +12:00 · 2024-11-09 18:29:40 +02:00 · 2024-11-09 18:29:40 +02:00 · b89a21444e
commit b89a21444e
parent 3d66239851 b214782a15
233 changed files with 17350 additions and 1024 deletions
--- a/.github/gles.patch
+++ b/.github/gles.patch
@ -1,52 +1,3 @@
-diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
-index a11a6ffa..77486a09 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
-+++ b/src/core/renderer_gl/renderer_gl.cpp
-@@ -357,27 +357,27 @@ void RendererGL::bindTexturesToSlots() {
- 	}
- 
- 	glActiveTexture(GL_TEXTURE0 + 3);
-	glBindTexture(GL_TEXTURE_1D_ARRAY, lightLUTTextureArray);
-+	// glBindTexture(GL_TEXTURE_1D_ARRAY, lightLUTTextureArray);
- 	glActiveTexture(GL_TEXTURE0);
- }
- 
- void RendererGL::updateLightingLUT() {
-	gpu.lightingLUTDirty = false;
-	std::array<u16, GPU::LightingLutSize> u16_lightinglut;
-
-	for (int i = 0; i < gpu.lightingLUT.size(); i++) {
-		uint64_t value = gpu.lightingLUT[i] & ((1 << 12) - 1);
-		u16_lightinglut[i] = value * 65535 / 4095;
-	}
-
-	glActiveTexture(GL_TEXTURE0 + 3);
-	glBindTexture(GL_TEXTURE_1D_ARRAY, lightLUTTextureArray);
-	glTexImage2D(GL_TEXTURE_1D_ARRAY, 0, GL_R16, 256, Lights::LUT_Count, 0, GL_RED, GL_UNSIGNED_SHORT, u16_lightinglut.data());
-	glTexParameteri(GL_TEXTURE_1D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
-	glTexParameteri(GL_TEXTURE_1D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
-	glTexParameteri(GL_TEXTURE_1D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
-	glTexParameteri(GL_TEXTURE_1D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
-	glActiveTexture(GL_TEXTURE0);
-+	// gpu.lightingLUTDirty = false;
-+	// std::array<u16, GPU::LightingLutSize> u16_lightinglut;
-+
-+	// for (int i = 0; i < gpu.lightingLUT.size(); i++) {
-+	// 	uint64_t value = gpu.lightingLUT[i] & ((1 << 12) - 1);
-+	// 	u16_lightinglut[i] = value * 65535 / 4095;
-+	// }
-+
-+	// glActiveTexture(GL_TEXTURE0 + 3);
-+	// glBindTexture(GL_TEXTURE_1D_ARRAY, lightLUTTextureArray);
-+	// glTexImage2D(GL_TEXTURE_1D_ARRAY, 0, GL_R16, 256, Lights::LUT_Count, 0, GL_RED, GL_UNSIGNED_SHORT, u16_lightinglut.data());
-+	// glTexParameteri(GL_TEXTURE_1D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
-+	// glTexParameteri(GL_TEXTURE_1D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
-+	// glTexParameteri(GL_TEXTURE_1D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
-+	// glTexParameteri(GL_TEXTURE_1D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
-+	// glActiveTexture(GL_TEXTURE0);
- }
- 
- void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> vertices) {
 diff --git a/src/host_shaders/opengl_display.frag b/src/host_shaders/opengl_display.frag
 index 612671c8..1937f711 100644
 --- a/src/host_shaders/opengl_display.frag
@ -70,7 +21,7 @@ index 990e2f80..2e7842ac 100644
 
 void main() {
 diff --git a/src/host_shaders/opengl_fragment_shader.frag b/src/host_shaders/opengl_fragment_shader.frag
-index f6fa6c55..bb88e278 100644
+index 9f07df0b..96a35afa 100644
 --- a/src/host_shaders/opengl_fragment_shader.frag
 +++ b/src/host_shaders/opengl_fragment_shader.frag
@@ -1,4 +1,5 @@
@ -78,36 +29,29 @@ index f6fa6c55..bb88e278 100644
 +#version 300 es
 +precision mediump float;
 
- in vec3 v_tangent;
- in vec3 v_normal;
-@@ -27,7 +28,7 @@ uniform bool u_depthmapEnable;
- uniform sampler2D u_tex0;
- uniform sampler2D u_tex1;
- uniform sampler2D u_tex2;
-uniform sampler1DArray u_tex_lighting_lut;
-+// uniform sampler1DArray u_tex_lighting_lut;
+ in vec4 v_quaternion;
+ in vec4 v_colour;
+@@ -41,8 +42,8 @@ vec3 normal;
+ const uint samplerEnabledBitfields[2] = uint[2](0x7170e645u, 0x7f013fefu);
 
- uniform uint u_picaRegs[0x200 - 0x48];
+ bool isSamplerEnabled(uint environment_id, uint lut_id) {
+-	uint index = 7 * environment_id + lut_id;
+-	uint arrayIndex = (index >> 5);
+	uint index = 7u * environment_id + lut_id;
+	uint arrayIndex = (index >> 5u);
+ 	return (samplerEnabledBitfields[arrayIndex] & (1u << (index & 31u))) != 0u;
+ }
 
-@@ -145,16 +146,23 @@ vec4 tevCalculateCombiner(int tev_id) {
- #define RR_LUT 6u
+@@ -166,11 +167,17 @@ float lutLookup(uint lut, int index) {
+ 	return texelFetch(u_tex_luts, ivec2(index, int(lut)), 0).r;
+ }
 
- float lutLookup(uint lut, uint light, float value) {
-	if (lut >= FR_LUT && lut <= RR_LUT) lut -= 1;
-	if (lut == SP_LUT) lut = light + 8;
-	return texture(u_tex_lighting_lut, vec2(value, lut)).r;
-+	// if (lut >= FR_LUT && lut <= RR_LUT) lut -= 1;
-+	// if (lut == SP_LUT) lut = light + 8;
-+	// return texture(u_tex_lighting_lut, vec2(value, lut)).r;
-+	return 0.0;
-+}
-+
-+// some gles versions have bitfieldExtract and complain if you redefine it, some don't and compile error, using this instead
+// some gles versions have bitfieldExtractCompat and complain if you redefine it, some don't and compile error, using this instead
 +uint bitfieldExtractCompat(uint val, int off, int size) {
 +    uint mask = uint((1 << size) - 1);
 +    return uint(val >> off) & mask;
- }
- 
+}
+
 vec3 regToColor(uint reg) {
 	// Normalization scale to convert from [0...255] to [0.0...1.0]
 	const float scale = 1.0 / 255.0;
@ -117,89 +61,115 @@ index f6fa6c55..bb88e278 100644
 }
 
 // Convert an arbitrary-width floating point literal to an f32
-@@ -189,7 +197,7 @@ void calcLighting(out vec4 primary_color, out vec4 secondary_color) {
- 	vec3 view = normalize(v_view);
+@@ -201,7 +208,7 @@ float lightLutLookup(uint environment_id, uint lut_id, uint light_id, vec3 light
+ 		// These are the spotlight attenuation LUTs
+ 		bit_in_config1 = 8 + int(light_id & 7u);
+ 		lut_index = 8u + light_id;
+-	} else if (lut_id <= 6) {
+	} else if (lut_id <= 6u) {
+ 		bit_in_config1 = 16 + int(lut_id);
+ 		lut_index = lut_id;
+ 	} else {
+@@ -210,16 +217,16 @@ float lightLutLookup(uint environment_id, uint lut_id, uint light_id, vec3 light
 
+ 	bool current_sampler_enabled = isSamplerEnabled(environment_id, lut_id); // 7 luts per environment
+ 
+-	if (!current_sampler_enabled || (bitfieldExtract(GPUREG_LIGHTING_CONFIG1, bit_in_config1, 1) != 0u)) {
+	if (!current_sampler_enabled || (bitfieldExtractCompat(GPUREG_LIGHTING_CONFIG1, bit_in_config1, 1) != 0u)) {
+ 		return 1.0;
+ 	}
+ 
+-	uint scale_id = bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_SCALE, int(lut_id) << 2, 3);
+	uint scale_id = bitfieldExtractCompat(GPUREG_LIGHTING_LUTINPUT_SCALE, int(lut_id) << 2, 3);
+ 	float scale = float(1u << scale_id);
+ 	if (scale_id >= 6u) scale /= 256.0;
+ 
+ 	float delta = 1.0;
+-	uint input_id = bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_SELECT, int(lut_id) << 2, 3);
+	uint input_id = bitfieldExtractCompat(GPUREG_LIGHTING_LUTINPUT_SELECT, int(lut_id) << 2, 3);
+ 	switch (input_id) {
+ 		case 0u: {
+ 			delta = dot(normal, normalize(half_vector));
+@@ -243,9 +250,9 @@ float lightLutLookup(uint environment_id, uint lut_id, uint light_id, vec3 light
+ 
+ 			// Sign extend them. Normally bitfieldExtract would do that but it's missing on some versions
+ 			// of GLSL so we do it manually
+-			int se_x = bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_LOW, 0, 13);
+-			int se_y = bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_LOW, 16, 13);
+-			int se_z = bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_HIGH, 0, 13);
+			int se_x = int(bitfieldExtractCompat(uint(GPUREG_LIGHTi_SPOTDIR_LOW), 0, 13));
+			int se_y = int(bitfieldExtractCompat(uint(GPUREG_LIGHTi_SPOTDIR_LOW), 16, 13));
+			int se_z = int(bitfieldExtractCompat(uint(GPUREG_LIGHTi_SPOTDIR_HIGH), 0, 13));
+ 
+ 			if ((se_x & 0x1000) == 0x1000) se_x |= 0xffffe000;
+ 			if ((se_y & 0x1000) == 0x1000) se_y |= 0xffffe000;
+@@ -272,9 +279,9 @@ float lightLutLookup(uint environment_id, uint lut_id, uint light_id, vec3 light
+ 	}
+ 
+ 	// 0 = enabled
+-	if (bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_ABS, 1 + (int(lut_id) << 2), 1) == 0u) {
+	if (bitfieldExtractCompat(GPUREG_LIGHTING_LUTINPUT_ABS, 1 + (int(lut_id) << 2), 1) == 0u) {
+ 		// Two sided diffuse
+-		if (bitfieldExtract(GPUREG_LIGHTi_CONFIG, 1, 1) == 0u) {
+		if (bitfieldExtractCompat(GPUREG_LIGHTi_CONFIG, 1, 1) == 0u) {
+ 			delta = max(delta, 0.0);
+ 		} else {
+ 			delta = abs(delta);
+@@ -298,7 +305,7 @@ vec3 rotateVec3ByQuaternion(vec3 v, vec4 q) {
+ // Implements the following algorthm: https://mathb.in/26766
+ void calcLighting(out vec4 primary_color, out vec4 secondary_color) {
 	uint GPUREG_LIGHTING_ENABLE = readPicaReg(0x008Fu);
 -	if (bitfieldExtract(GPUREG_LIGHTING_ENABLE, 0, 1) == 0u) {
 +	if (bitfieldExtractCompat(GPUREG_LIGHTING_ENABLE, 0, 1) == 0u) {
- 		primary_color = secondary_color = vec4(1.0);
+ 		primary_color = secondary_color = vec4(0.0);
 		return;
 	}
-@@ -213,7 +221,7 @@ void calcLighting(out vec4 primary_color, out vec4 secondary_color) {
- 	bool error_unimpl = false;
+@@ -315,7 +322,7 @@ void calcLighting(out vec4 primary_color, out vec4 secondary_color) {
+ 	GPUREG_LIGHTING_LUTINPUT_ABS = readPicaReg(0x01D0u);
+ 	GPUREG_LIGHTING_LUTINPUT_SELECT = readPicaReg(0x01D1u);
+ 
+-	uint bump_mode = bitfieldExtract(GPUREG_LIGHTING_CONFIG0, 28, 2);
+	uint bump_mode = bitfieldExtractCompat(GPUREG_LIGHTING_CONFIG0, 28, 2);
+ 
+ 	// Bump mode is ignored for now because it breaks some games ie. Toad Treasure Tracker
+ 	switch (bump_mode) {
+@@ -328,15 +335,15 @@ void calcLighting(out vec4 primary_color, out vec4 secondary_color) {
+ 	vec4 diffuse_sum = vec4(0.0, 0.0, 0.0, 1.0);
+ 	vec4 specular_sum = vec4(0.0, 0.0, 0.0, 1.0);
+ 
+-	uint environment_id = bitfieldExtract(GPUREG_LIGHTING_CONFIG0, 4, 4);
+-	bool clamp_highlights = bitfieldExtract(GPUREG_LIGHTING_CONFIG0, 27, 1) == 1u;
+	uint environment_id = bitfieldExtractCompat(GPUREG_LIGHTING_CONFIG0, 4, 4);
+	bool clamp_highlights = bitfieldExtractCompat(GPUREG_LIGHTING_CONFIG0, 27, 1) == 1u;
+ 
+ 	uint light_id;
+ 	vec3 light_vector;
+ 	vec3 half_vector;
 
 	for (uint i = 0u; i < GPUREG_LIGHTING_NUM_LIGHTS; i++) {
-		uint light_id = bitfieldExtract(GPUREG_LIGHTING_LIGHT_PERMUTATION, int(i * 3u), 3);
-+		uint light_id = bitfieldExtractCompat(GPUREG_LIGHTING_LIGHT_PERMUTATION, int(i * 3u), 3);
+-		light_id = bitfieldExtract(GPUREG_LIGHTING_LIGHT_PERMUTATION, int(i) << 2, 3);
+		light_id = bitfieldExtractCompat(GPUREG_LIGHTING_LIGHT_PERMUTATION, int(i) << 2, 3);
 
- 		uint GPUREG_LIGHTi_SPECULAR0 = readPicaReg(0x0140u + 0x10u * light_id);
- 		uint GPUREG_LIGHTi_SPECULAR1 = readPicaReg(0x0141u + 0x10u * light_id);
-@@ -224,14 +232,14 @@ void calcLighting(out vec4 primary_color, out vec4 secondary_color) {
- 		uint GPUREG_LIGHTi_CONFIG = readPicaReg(0x0149u + 0x10u * light_id);
+ 		uint GPUREG_LIGHTi_SPECULAR0 = readPicaReg(0x0140u + (light_id << 4u));
+ 		uint GPUREG_LIGHTi_SPECULAR1 = readPicaReg(0x0141u + (light_id << 4u));
+@@ -348,12 +355,12 @@ void calcLighting(out vec4 primary_color, out vec4 secondary_color) {
 
- 		vec3 light_vector = normalize(vec3(
+ 		float light_distance;
+ 		vec3 light_position = vec3(
 -			decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_LOW, 0, 16), 5u, 10u), decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_LOW, 16, 16), 5u, 10u),
 -			decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_HIGH, 0, 16), 5u, 10u)
 +			decodeFP(bitfieldExtractCompat(GPUREG_LIGHTi_VECTOR_LOW, 0, 16), 5u, 10u), decodeFP(bitfieldExtractCompat(GPUREG_LIGHTi_VECTOR_LOW, 16, 16), 5u, 10u),
 +			decodeFP(bitfieldExtractCompat(GPUREG_LIGHTi_VECTOR_HIGH, 0, 16), 5u, 10u)
- 		));
- 
- 		vec3 half_vector;
+ 		);
 
 		// Positional Light
 -		if (bitfieldExtract(GPUREG_LIGHTi_CONFIG, 0, 1) == 0u) {
 +		if (bitfieldExtractCompat(GPUREG_LIGHTi_CONFIG, 0, 1) == 0u) {
- 			// error_unimpl = true;
- 			half_vector = normalize(normalize(light_vector + v_view) + view);
- 		}
-@@ -242,12 +250,12 @@ void calcLighting(out vec4 primary_color, out vec4 secondary_color) {
+ 			light_vector = light_position + v_view;
 		}
 
- 		for (int c = 0; c < 7; c++) {
-			if (bitfieldExtract(GPUREG_LIGHTING_CONFIG1, 16 + c, 1) == 0u) {
-				uint scale_id = bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_SCALE, c * 4, 3);
-+			if (bitfieldExtractCompat(GPUREG_LIGHTING_CONFIG1, 16 + c, 1) == 0u) {
-+				uint scale_id = bitfieldExtractCompat(GPUREG_LIGHTING_LUTINPUT_SCALE, c * 4, 3);
- 				float scale = float(1u << scale_id);
- 				if (scale_id >= 6u) scale /= 256.0;
- 
-				uint input_id = bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_SELECT, c * 4, 3);
-+				uint input_id = bitfieldExtractCompat(GPUREG_LIGHTING_LUTINPUT_SELECT, c * 4, 3);
- 				if (input_id == 0u)
- 					d[c] = dot(normal, half_vector);
- 				else if (input_id == 1u)
-@@ -260,9 +268,9 @@ void calcLighting(out vec4 primary_color, out vec4 secondary_color) {
- 					uint GPUREG_LIGHTi_SPOTDIR_LOW = readPicaReg(0x0146u + 0x10u * light_id);
- 					uint GPUREG_LIGHTi_SPOTDIR_HIGH = readPicaReg(0x0147u + 0x10u * light_id);
- 					vec3 spot_light_vector = normalize(vec3(
-						decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_LOW, 0, 16), 1u, 11u),
-						decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_LOW, 16, 16), 1u, 11u),
-						decodeFP(bitfieldExtract(GPUREG_LIGHTi_SPOTDIR_HIGH, 0, 16), 1u, 11u)
-+						decodeFP(bitfieldExtractCompat(GPUREG_LIGHTi_SPOTDIR_LOW, 0, 16), 1u, 11u),
-+						decodeFP(bitfieldExtractCompat(GPUREG_LIGHTi_SPOTDIR_LOW, 16, 16), 1u, 11u),
-+						decodeFP(bitfieldExtractCompat(GPUREG_LIGHTi_SPOTDIR_HIGH, 0, 16), 1u, 11u)
- 					));
- 					d[c] = dot(-light_vector, spot_light_vector);  // -L dot P (aka Spotlight aka SP);
- 				} else if (input_id == 5u) {
-@@ -273,13 +281,13 @@ void calcLighting(out vec4 primary_color, out vec4 secondary_color) {
- 				}
- 
- 				d[c] = lutLookup(uint(c), light_id, d[c] * 0.5 + 0.5) * scale;
-				if (bitfieldExtract(GPUREG_LIGHTING_LUTINPUT_ABS, 2 * c, 1) != 0u) d[c] = abs(d[c]);
-+				if (bitfieldExtractCompat(GPUREG_LIGHTING_LUTINPUT_ABS, 2 * c, 1) != 0u) d[c] = abs(d[c]);
- 			} else {
- 				d[c] = 1.0;
- 			}
- 		}
- 
-		uint lookup_config = bitfieldExtract(GPUREG_LIGHTi_CONFIG, 4, 4);
-+		uint lookup_config = bitfieldExtractCompat(GPUREG_LIGHTi_CONFIG, 4, 4);
- 		if (lookup_config == 0u) {
- 			d[D1_LUT] = 0.0;
- 			d[FR_LUT] = 0.0;
-@@ -310,7 +318,7 @@ void calcLighting(out vec4 primary_color, out vec4 secondary_color) {
- 		float NdotL = dot(normal, light_vector);  // Li dot N
+@@ -369,23 +376,23 @@ void calcLighting(out vec4 primary_color, out vec4 secondary_color) {
+ 		float NdotL = dot(normal, light_vector);  // N dot Li
 
 		// Two sided diffuse
 -		if (bitfieldExtract(GPUREG_LIGHTi_CONFIG, 1, 1) == 0u)
@ -207,19 +177,40 @@ index f6fa6c55..bb88e278 100644
 			NdotL = max(0.0, NdotL);
 		else
 			NdotL = abs(NdotL);
-@@ -321,8 +329,8 @@ void calcLighting(out vec4 primary_color, out vec4 secondary_color) {
- 		secondary_color.rgb += light_factor * (regToColor(GPUREG_LIGHTi_SPECULAR0) * d[D0_LUT] +
- 											   regToColor(GPUREG_LIGHTi_SPECULAR1) * d[D1_LUT] * vec3(d[RR_LUT], d[RG_LUT], d[RB_LUT]));
+ 
+ 		float geometric_factor;
+-		bool use_geo_0 = bitfieldExtract(GPUREG_LIGHTi_CONFIG, 2, 1) == 1u;
+-		bool use_geo_1 = bitfieldExtract(GPUREG_LIGHTi_CONFIG, 3, 1) == 1u;
+		bool use_geo_0 = bitfieldExtractCompat(GPUREG_LIGHTi_CONFIG, 2, 1) == 1u;
+		bool use_geo_1 = bitfieldExtractCompat(GPUREG_LIGHTi_CONFIG, 3, 1) == 1u;
+ 		if (use_geo_0 || use_geo_1) {
+ 			geometric_factor = dot(half_vector, half_vector);
+ 			geometric_factor = geometric_factor == 0.0 ? 0.0 : min(NdotL / geometric_factor, 1.0);
+ 		}
+ 
+ 		float distance_attenuation = 1.0;
+-		if (bitfieldExtract(GPUREG_LIGHTING_CONFIG1, 24 + int(light_id), 1) == 0u) {
+-			uint GPUREG_LIGHTi_ATTENUATION_BIAS = bitfieldExtract(readPicaReg(0x014Au + (light_id << 4u)), 0, 20);
+-			uint GPUREG_LIGHTi_ATTENUATION_SCALE = bitfieldExtract(readPicaReg(0x014Bu + (light_id << 4u)), 0, 20);
+		if (bitfieldExtractCompat(GPUREG_LIGHTING_CONFIG1, 24 + int(light_id), 1) == 0u) {
+			uint GPUREG_LIGHTi_ATTENUATION_BIAS = bitfieldExtractCompat(readPicaReg(0x014Au + (light_id << 4u)), 0, 20);
+			uint GPUREG_LIGHTi_ATTENUATION_SCALE = bitfieldExtractCompat(readPicaReg(0x014Bu + (light_id << 4u)), 0, 20);
+ 
+ 			float distance_attenuation_bias = decodeFP(GPUREG_LIGHTi_ATTENUATION_BIAS, 7u, 12u);
+ 			float distance_attenuation_scale = decodeFP(GPUREG_LIGHTi_ATTENUATION_SCALE, 7u, 12u);
+@@ -430,8 +437,8 @@ void calcLighting(out vec4 primary_color, out vec4 secondary_color) {
+ 		specular_sum.rgb += light_factor * clamp_factor * (specular0 + specular1);
 	}
+ 
 -	uint fresnel_output1 = bitfieldExtract(GPUREG_LIGHTING_CONFIG0, 2, 1);
 -	uint fresnel_output2 = bitfieldExtract(GPUREG_LIGHTING_CONFIG0, 3, 1);
 +	uint fresnel_output1 = bitfieldExtractCompat(GPUREG_LIGHTING_CONFIG0, 2, 1);
 +	uint fresnel_output2 = bitfieldExtractCompat(GPUREG_LIGHTING_CONFIG0, 3, 1);
- 
- 	if (fresnel_output1 == 1u) primary_color.a = d[FR_LUT];
- 	if (fresnel_output2 == 1u) secondary_color.a = d[FR_LUT];
+ 	// Uses parameters from the last light as Fresnel is only applied to the last light
+ 	float fresnel_factor;
+ 	
 diff --git a/src/host_shaders/opengl_vertex_shader.vert b/src/host_shaders/opengl_vertex_shader.vert
-index a25d7a6d..7cf40398 100644
+index 057f9a88..dc735ced 100644
 --- a/src/host_shaders/opengl_vertex_shader.vert
 +++ b/src/host_shaders/opengl_vertex_shader.vert
@@ -1,4 +1,6 @@
@ -230,7 +221,7 @@ index a25d7a6d..7cf40398 100644
 
 layout(location = 0) in vec4 a_coords;
 layout(location = 1) in vec4 a_quaternion;
-@@ -20,7 +22,7 @@ out vec2 v_texcoord2;
+@@ -18,7 +20,7 @@ out vec2 v_texcoord2;
 flat out vec4 v_textureEnvColor[6];
 flat out vec4 v_textureEnvBufferColor;
 
@ -239,7 +230,7 @@ index a25d7a6d..7cf40398 100644
 
 // TEV uniforms
 uniform uint u_textureEnvColor[6];
-@@ -93,6 +95,6 @@ void main() {
+@@ -81,8 +83,8 @@ void main() {
 	);
 
 	// There's also another, always-on clipping plane based on vertex z
@ -247,16 +238,20 @@ index a25d7a6d..7cf40398 100644
 -	gl_ClipDistance[1] = dot(clipData, a_coords);
 +	// gl_ClipDistance[0] = -a_coords.z;
 +	// gl_ClipDistance[1] = dot(clipData, a_coords);
+ 
+ 	v_quaternion = a_quaternion;
 }
 diff --git a/third_party/opengl/opengl.hpp b/third_party/opengl/opengl.hpp
-index f368f573..5ead7f63 100644
+index 607815fa..cbfcc096 100644
 --- a/third_party/opengl/opengl.hpp
 +++ b/third_party/opengl/opengl.hpp
-@@ -520,21 +520,21 @@ namespace OpenGL {
+@@ -602,22 +602,22 @@ namespace OpenGL {
+ 	static void disableScissor() { glDisable(GL_SCISSOR_TEST); }
 	static void enableBlend() { glEnable(GL_BLEND); }
 	static void disableBlend() { glDisable(GL_BLEND); }
- 	static void enableLogicOp() { glEnable(GL_COLOR_LOGIC_OP); }
+-	static void enableLogicOp() { glEnable(GL_COLOR_LOGIC_OP); }
 -	static void disableLogicOp() { glDisable(GL_COLOR_LOGIC_OP); }
+	static void enableLogicOp() { /* glEnable(GL_COLOR_LOGIC_OP); */ }
 +	static void disableLogicOp() { /* glDisable(GL_COLOR_LOGIC_OP); */ }
 	static void enableDepth() { glEnable(GL_DEPTH_TEST); }
 	static void disableDepth() { glDisable(GL_DEPTH_TEST); }
--- a/.github/mac-bundle-qt.sh
+++ b/.github/mac-bundle-qt.sh
@ -5,16 +5,16 @@ PATH="$PATH:/usr/libexec"

 # Construct the app iconset.
 mkdir alber.iconset
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 16x16 alber.iconset/icon_16x16.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 32x32 alber.iconset/icon_16x16@2x.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 32x32 alber.iconset/icon_32x32.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 64x64 alber.iconset/icon_32x32@2x.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 128x128 alber.iconset/icon_128x128.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 256x256 alber.iconset/icon_128x128@2x.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 256x256 alber.iconset/icon_256x256.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 512x512 alber.iconset/icon_256x256@2x.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 512x512 alber.iconset/icon_512x512.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 1024x1024 alber.iconset/icon_512x512@2x.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 16x16 alber.iconset/icon_16x16.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 32x32 alber.iconset/icon_16x16@2x.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 32x32 alber.iconset/icon_32x32.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 64x64 alber.iconset/icon_32x32@2x.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 128x128 alber.iconset/icon_128x128.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 256x256 alber.iconset/icon_128x128@2x.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 256x256 alber.iconset/icon_256x256.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 512x512 alber.iconset/icon_256x256@2x.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 512x512 alber.iconset/icon_512x512.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 1024x1024 alber.iconset/icon_512x512@2x.png
 iconutil --convert icns alber.iconset

 # Set up the .app directory
--- a/.github/mac-bundle.sh
+++ b/.github/mac-bundle.sh
@ -5,16 +5,16 @@ PATH="$PATH:/usr/libexec"

 # Construct the app iconset.
 mkdir alber.iconset
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 16x16 alber.iconset/icon_16x16.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 32x32 alber.iconset/icon_16x16@2x.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 32x32 alber.iconset/icon_32x32.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 64x64 alber.iconset/icon_32x32@2x.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 128x128 alber.iconset/icon_128x128.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 256x256 alber.iconset/icon_128x128@2x.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 256x256 alber.iconset/icon_256x256.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 512x512 alber.iconset/icon_256x256@2x.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 512x512 alber.iconset/icon_512x512.png
-convert docs/img/alber-icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 1024x1024 alber.iconset/icon_512x512@2x.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 16x16 alber.iconset/icon_16x16.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 32x32 alber.iconset/icon_16x16@2x.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 32x32 alber.iconset/icon_32x32.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 64x64 alber.iconset/icon_32x32@2x.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 128x128 alber.iconset/icon_128x128.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 256x256 alber.iconset/icon_128x128@2x.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 256x256 alber.iconset/icon_256x256.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 512x512 alber.iconset/icon_256x256@2x.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 72 -resize 512x512 alber.iconset/icon_512x512.png
+convert docs/img/mac_icon.ico -alpha on -background none -units PixelsPerInch -density 144 -resize 1024x1024 alber.iconset/icon_512x512@2x.png
 iconutil --convert icns alber.iconset

 # Set up the .app directory
--- a/.github/workflows/Android_Build.yml
+++ b/.github/workflows/Android_Build.yml
@ -8,7 +8,7 @@ on:

 jobs:
  x64:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

    strategy:
      matrix:
@ -73,7 +73,7 @@ jobs:
          ./src/pandroid/app/build/outputs/apk/${{ env.BUILD_TYPE }}/app-${{ env.BUILD_TYPE }}.apk

  arm64:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

    strategy:
      matrix:
--- a/.github/workflows/HTTP_Build.yml
+++ b/.github/workflows/HTTP_Build.yml
@ -16,10 +16,10 @@ jobs:
    # well on Windows or Mac.  You can convert this to a matrix build if you need
    # cross-platform coverage.
    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
    - name: Fetch submodules
      run: git submodule update --init --recursive
      
--- a/.github/workflows/Hydra_Build.yml
+++ b/.github/workflows/Hydra_Build.yml
@ -15,7 +15,7 @@ jobs:
    runs-on: windows-latest

    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
    - name: Fetch submodules
      run: git submodule update --init --recursive

@ -32,18 +32,33 @@ jobs:
    - name: Build
      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
    
-    - name: Upload core
-      uses: actions/upload-artifact@v2
+    - name: Upload Hydra core
+      uses: actions/upload-artifact@v4
      with:
-        name: Windows core
+        name: Windows Hydra core
        path: '${{github.workspace}}/build/${{ env.BUILD_TYPE }}/Alber.dll'

+    - name: Configure CMake (Again)
+      run: |
+        rm -r -fo ${{github.workspace}}/build 
+        cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DENABLE_USER_BUILD=ON -DBUILD_LIBRETRO_CORE=ON
+
+    - name: Build (Again)
+      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
+
+    - name: Upload Libretro core
+      uses: actions/upload-artifact@v4
+      with:
+        name: Windows Libretro core
+        path: |
+          ${{github.workspace}}/build/${{ env.BUILD_TYPE }}/panda3ds_libretro.dll
+          ${{github.workspace}}/docs/libretro/panda3ds_libretro.info

  MacOS:
    runs-on: macos-13

    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
    - name: Fetch submodules
      run: git submodule update --init --recursive

@ -61,22 +76,38 @@ jobs:
      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
    
    - name: Upload core
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v4
      with:
-        name: MacOS core
+        name: MacOS Hydra core
        path: '${{github.workspace}}/build/libAlber.dylib'

+    - name: Configure CMake (Again)
+      run: |
+        rm -rf ${{github.workspace}}/build
+        cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DENABLE_USER_BUILD=ON -DBUILD_LIBRETRO_CORE=ON
+
+    - name: Build (Again)
+      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} && ls -R ${{github.workspace}}/build
+
+    - name: Upload Libretro core
+      uses: actions/upload-artifact@v4
+      with:
+        name: MacOS Libretro core
+        path: |
+          ${{github.workspace}}/build/panda3ds_libretro.dylib
+          ${{github.workspace}}/docs/libretro/panda3ds_libretro.info
+
  Linux:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
    - name: Fetch submodules
      run: git submodule update --init --recursive

    - name: Install misc packages
      run: |
-       sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev libfuse2 libwayland-dev
+       sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev libfuse2 libwayland-dev
      
    - name: Install newer Clang
      run: |
@ -98,22 +129,38 @@ jobs:
      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
    
    - name: Upload core
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v4
      with:
-        name: Linux core
+        name: Linux Hydra core
        path: '${{github.workspace}}/build/libAlber.so'

+    - name: Configure CMake (Again)
+      run: |
+        rm -rf ${{github.workspace}}/build
+        cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DCMAKE_C_COMPILER=clang-17 -DCMAKE_CXX_COMPILER=clang++-17 -DENABLE_USER_BUILD=ON -DBUILD_LIBRETRO_CORE=ON
+
+    - name: Build (Again)
+      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
+
+    - name: Upload Libretro core
+      uses: actions/upload-artifact@v4
+      with:
+        name: Linux Libretro core
+        path: |
+          ${{github.workspace}}/build/panda3ds_libretro.so
+          ${{github.workspace}}/docs/libretro/panda3ds_libretro.info
+
  Android-x64:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
    - name: Fetch submodules
      run: git submodule update --init --recursive

    - name: Install misc packages
      run: |
-       sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev libfuse2 libwayland-dev
+       sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev libfuse2 libwayland-dev
       
    - name: Setup Vulkan SDK
      uses: humbletim/setup-vulkan-sdk@v1.2.0
@ -129,7 +176,7 @@ jobs:
      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
    
    - name: Upload core
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v4
      with:
-        name: Android core
+        name: Android Hydra core
        path: '${{github.workspace}}/build/libAlber.so'
--- a/.github/workflows/Linux_AppImage_Build.yml
+++ b/.github/workflows/Linux_AppImage_Build.yml
@ -16,15 +16,15 @@ jobs:
    # well on Windows or Mac.  You can convert this to a matrix build if you need
    # cross-platform coverage.
    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-24.04

    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
    - name: Fetch submodules
      run: git submodule update --init --recursive

    - name: Install misc packages
-      run: sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev libfuse2
+      run: sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev libfuse2 libwayland-dev

    - name: Install newer Clang
      run: |
@ -33,11 +33,11 @@ jobs:
       sudo ./llvm.sh 17
       
    - name: Setup Vulkan SDK
-      run: |
-       wget -qO - http://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
-       sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-focal.list http://packages.lunarg.com/vulkan/lunarg-vulkan-focal.list
-       sudo apt update
-       sudo apt install vulkan-sdk
+      uses: humbletim/setup-vulkan-sdk@v1.2.0
+      with:
+        vulkan-query-version: latest
+        vulkan-use-cache: true
+        vulkan-components: Vulkan-Headers, Vulkan-Loader, SPIRV-Tools, Glslang

    - name: Configure CMake
      # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
@ -52,7 +52,7 @@ jobs:
      run:  ./.github/linux-appimage.sh

    - name: Upload executable
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v4
      with:
        name: Linux executable
        path: './Alber-x86_64.AppImage' 
--- a/.github/workflows/Linux_Build.yml
+++ b/.github/workflows/Linux_Build.yml
@ -16,15 +16,15 @@ jobs:
    # well on Windows or Mac.  You can convert this to a matrix build if you need
    # cross-platform coverage.
    # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
    - name: Fetch submodules
      run: git submodule update --init --recursive

    - name: Install misc packages
-      run: sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev
+      run: sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev libwayland-dev

    - name: Install newer Clang
      run: |
@ -49,7 +49,7 @@ jobs:
      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}

    - name: Upload executable
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v4
      with:
        name: Linux executable
        path: './build/Alber'
--- a/.github/workflows/MacOS_Build.yml
+++ b/.github/workflows/MacOS_Build.yml
@ -19,7 +19,7 @@ jobs:
    runs-on: macos-13

    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
    - name: Fetch submodules
      run: git submodule update --init --recursive

@ -52,7 +52,7 @@ jobs:
      run: zip -r Alber Alber.app

    - name: Upload MacOS App
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v4
      with:
        name: MacOS Alber App Bundle
        path: 'Alber.zip'
--- a/.github/workflows/Qt_Build.yml
+++ b/.github/workflows/Qt_Build.yml
@ -15,7 +15,7 @@ jobs:
    runs-on: windows-latest

    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
    - name: Fetch submodules
      run: git submodule update --init --recursive

@ -45,7 +45,7 @@ jobs:
        windeployqt --dir upload upload/Alber.exe

    - name: Upload executable
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v4
      with:
        name: Windows executable
        path: upload
@ -54,7 +54,7 @@ jobs:
    runs-on: macos-13

    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
    - name: Fetch submodules
      run: git submodule update --init --recursive

@ -90,23 +90,22 @@ jobs:
      run: zip -r Alber Alber.app

    - name: Upload MacOS App
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v4
      with:
        name: MacOS Alber App Bundle
        path: 'Alber.zip'

  Linux:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-24.04

    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
    - name: Fetch submodules
      run: git submodule update --init --recursive

    - name: Install misc packages
      run: |
-       sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev libfuse2 libwayland-dev
-       sudo add-apt-repository -y ppa:savoury1/qt-6-2
+       sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev libfuse2 libwayland-dev libgl1-mesa-dev
       sudo apt update
       sudo apt install qt6-base-dev qt6-base-private-dev

@ -117,11 +116,11 @@ jobs:
       sudo ./llvm.sh 17
       
    - name: Setup Vulkan SDK
-      run: |
-       wget -qO - http://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
-       sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-focal.list http://packages.lunarg.com/vulkan/lunarg-vulkan-focal.list
-       sudo apt update
-       sudo apt install vulkan-sdk
+      uses: humbletim/setup-vulkan-sdk@v1.2.0
+      with:
+        vulkan-query-version: latest
+        vulkan-use-cache: true
+        vulkan-components: Vulkan-Headers, Vulkan-Loader, SPIRV-Tools, Glslang

    - name: Configure CMake
      run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DCMAKE_C_COMPILER=clang-17 -DCMAKE_CXX_COMPILER=clang++-17 -DENABLE_USER_BUILD=ON -DENABLE_QT_GUI=ON
@ -135,7 +134,7 @@ jobs:
        ./.github/linux-appimage-qt.sh

    - name: Upload executable
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v4
      with:
        name: Linux executable
        path: './Alber-x86_64.AppImage' 
--- a/.github/workflows/Windows_Build.yml
+++ b/.github/workflows/Windows_Build.yml
@ -19,7 +19,7 @@ jobs:
    runs-on: windows-latest

    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
    - name: Fetch submodules
      run: git submodule update --init --recursive

@ -40,7 +40,7 @@ jobs:
      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}

    - name: Upload executable
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v4
      with:
        name: Windows executable
        path: './build/${{ env.BUILD_TYPE }}/Alber.exe'
--- a/.gitignore
+++ b/.gitignore
@ -64,5 +64,9 @@ fb.bat
 *.elf
 *.smdh

+# Compiled Metal shader files
+*.ir
+*.metallib
+
 config.toml
 CMakeSettings.json
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@ -0,0 +1,130 @@
+# DESCRIPTION: GitLab CI/CD for libRetro (NOT FOR GitLab-proper)
+
+##############################################################################
+################################# BOILERPLATE ################################
+##############################################################################
+
+# Core definitions
+.core-defs:
+  variables:
+    GIT_SUBMODULE_STRATEGY: recursive
+    CORENAME: panda3ds
+    CORE_ARGS: -DBUILD_LIBRETRO_CORE=ON -DENABLE_USER_BUILD=ON -DENABLE_VULKAN=OFF -DENABLE_LUAJIT=OFF -DENABLE_DISCORD_RPC=OFF
+
+# Inclusion templates, required for the build to work
+
+include:
+  ################################## DESKTOPS ################################
+  # Linux
+  - project: 'libretro-infrastructure/ci-templates'
+    file: '/linux-cmake.yml'
+
+  # Windows
+  - project: 'libretro-infrastructure/ci-templates'
+    file: '/windows-cmake-mingw.yml'
+
+  # MacOS
+  - project: 'libretro-infrastructure/ci-templates'
+    file: 'osx-cmake-x86.yml'
+
+  # MacOS
+  - project: 'libretro-infrastructure/ci-templates'
+    file: 'osx-cmake-arm64.yml'
+
+  ################################## CELLULAR ################################
+  # Android
+  - project: 'libretro-infrastructure/ci-templates'
+    file: '/android-cmake.yml'
+
+  # iOS
+  - project: 'libretro-infrastructure/ci-templates'
+    file: '/ios-cmake.yml'
+
+# Stages for building
+stages:
+  - build-prepare
+  - build-static
+  - build-shared
+
+##############################################################################
+#################################### STAGES ##################################
+##############################################################################
+#
+################################### DESKTOPS #################################
+# Linux 64-bit
+libretro-build-linux-x64:
+  image: $CI_SERVER_HOST:5050/libretro-infrastructure/libretro-build-amd64-ubuntu:latest
+  before_script:
+    - export NUMPROC=$(($(nproc)/5))
+    - sudo apt-get update -qy
+    - sudo apt-get install -qy software-properties-common
+    - sudo add-apt-repository -y ppa:savoury1/build-tools
+    - sudo add-apt-repository -y ppa:savoury1/gcc-defaults-12
+    - sudo apt-get update -qy
+    - sudo apt-get install -qy cmake gcc-12 g++-12
+  variables:
+    CC: /usr/bin/gcc-12
+    CXX: /usr/bin/g++-12
+  extends:
+    - .libretro-linux-cmake-x86_64
+    - .core-defs
+
+# Windows 64-bit
+libretro-build-windows-x64:
+  extends:
+    - .libretro-windows-cmake-x86_64
+    - .core-defs
+
+# MacOS 64-bit
+libretro-build-osx-x64:
+  tags:
+    - mac-apple-silicon
+  variables:
+    CORE_ARGS: -DBUILD_LIBRETRO_CORE=ON -DENABLE_USER_BUILD=ON -DENABLE_VULKAN=OFF -DENABLE_LUAJIT=OFF -DENABLE_DISCORD_RPC=OFF -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCRYPTOPP_AMD64=1
+  extends:
+    - .libretro-osx-cmake-x86
+    - .core-defs
+
+# MacOS arm 64-bit
+libretro-build-osx-arm64:
+  tags:
+    - mac-apple-silicon
+  extends:
+    - .libretro-osx-cmake-arm64
+    - .core-defs
+
+################################### CELLULAR #################################
+# Android ARMv7a
+#android-armeabi-v7a:
+#  extends:
+#    - .libretro-android-cmake-armeabi-v7a
+#    - .core-defs
+
+# Android ARMv8a
+# android-arm64-v8a:
+#   extends:
+#     - .libretro-android-cmake-arm64-v8a
+#     - .core-defs
+
+# Android 64-bit x86
+# android-x86_64:
+#   extends:
+#     - .libretro-android-cmake-x86_64
+#     - .core-defs
+
+# Android 32-bit x86
+# android-x86:
+#   extends:
+#     - .libretro-android-cmake-x86
+#     - .core-defs
+
+# iOS
+# libretro-build-ios-arm64:
+#   extends:
+#     - .libretro-ios-cmake-arm64
+#     - .core-defs
+#   variables:
+#     CORE_ARGS: -DBUILD_LIBRETRO_CORE=ON -DBUILD_PLAY=OFF -DENABLE_AMAZON_S3=off -DBUILD_TESTS=OFF -DCMAKE_TOOLCHAIN_FILE=deps/Dependencies/cmake-ios/ios.cmake -DTARGET_IOS=ON
+#     LIBNAME: ${CORENAME}_libretro_ios.dylib
+
+################################### CONSOLES #################################
--- a/.gitmodules
+++ b/.gitmodules
@ -70,3 +70,15 @@
 [submodule "third_party/capstone"]
 	path = third_party/capstone
 	url = https://github.com/capstone-engine/capstone
+[submodule "third_party/hips"]
+	path = third_party/hips
+	url = https://github.com/wheremyfoodat/Hips
+[submodule "third_party/metal-cpp"]
+	path = third_party/metal-cpp
+	url = https://github.com/Panda3DS-emu/metal-cpp
+[submodule "third_party/fmt"]
+	path = third_party/fmt
+	url = https://github.com/fmtlib/fmt
+[submodule "third_party/fdk-aac"]
+	path = third_party/fdk-aac
+	url = https://github.com/Panda3DS-emu/fdk-aac/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -19,19 +19,37 @@ endif()

 project(Alber)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")

 if(APPLE)
    enable_language(OBJC)
 endif()

+# Enable RC support in order to use resource files for application icons
+if(WIN32)
+    enable_language(RC)
+    set(APP_RESOURCES docs/img/windows_icon.rc)
+endif()
+
 if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format-nonliteral -Wno-format-security")
-endif() 
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format-nonliteral -Wno-format-security -Wno-invalid-offsetof")
+endif()
+
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-interference-size")
+endif()
+
+if(ANDROID)
+    set(DEFAULT_OPENGL_PROFILE OpenGLES)
+else()
+    set(DEFAULT_OPENGL_PROFILE OpenGL)
+endif()

 option(DISABLE_PANIC_DEV "Make a build with fewer and less intrusive asserts" ON)
 option(GPU_DEBUG_INFO "Enable additional GPU debugging info" OFF)
 option(ENABLE_OPENGL "Enable OpenGL rendering backend" ON)
 option(ENABLE_VULKAN "Enable Vulkan rendering backend" ON)
+option(ENABLE_METAL "Enable Metal rendering backend (if available)" ON)
 option(ENABLE_LTO "Enable link-time optimization" OFF)
 option(ENABLE_TESTS "Compile unit-tests" OFF)
 option(ENABLE_USER_BUILD "Make a user-facing build. These builds have various assertions disabled, LTO, and more" OFF)
@ -39,12 +57,65 @@ option(ENABLE_HTTP_SERVER "Enable HTTP server. Used for Discord bot support" OFF
 option(ENABLE_DISCORD_RPC "Compile with Discord RPC support (disabled by default)" ON)
 option(ENABLE_LUAJIT "Enable scripting with the Lua programming language" ON)
 option(ENABLE_QT_GUI "Enable the Qt GUI. If not selected then the emulator uses a minimal SDL-based UI instead" OFF)
+option(ENABLE_GIT_VERSIONING "Enables querying git for the emulator version" ON)
 option(BUILD_HYDRA_CORE "Build a Hydra core" OFF)
+option(BUILD_LIBRETRO_CORE "Build a Libretro core" OFF)
+option(ENABLE_RENDERDOC_API "Build with support for Renderdoc's capture API for graphics debugging" ON)
+option(DISABLE_SSE4 "Build with SSE4 instructions disabled, may reduce performance" OFF)
+
+set(OPENGL_PROFILE ${DEFAULT_OPENGL_PROFILE} CACHE STRING "OpenGL profile to use if OpenGL is enabled. Valid values are 'OpenGL' and 'OpenGLES'.")
+set_property(CACHE OPENGL_PROFILE PROPERTY STRINGS OpenGL OpenGLES)
+
+if(ENABLE_OPENGL AND (OPENGL_PROFILE STREQUAL "OpenGLES"))
+    message(STATUS "Building with OpenGLES support")
+    add_compile_definitions(USING_GLES)
+endif()

 if(BUILD_HYDRA_CORE)
    set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 endif()

+if(BUILD_LIBRETRO_CORE)
+    set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+    add_compile_definitions(__LIBRETRO__)
+endif()
+
+if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND ENABLE_USER_BUILD)
+    # Disable stack buffer overflow checks in user builds
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /GS-")
+endif()
+
+# Generate versioning files
+find_package(Git)
+set(PANDA3DS_VERSION "0.8")
+
+if(NOT EXISTS ${CMAKE_BINARY_DIR}/include/version.hpp.in)
+    file(WRITE ${CMAKE_BINARY_DIR}/include/version.hpp.in "#define PANDA3DS_VERSION \"\${PANDA3DS_VERSION}\"")
+endif()
+
+if(GIT_FOUND AND ENABLE_GIT_VERSIONING)
+    execute_process(
+        WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0
+        OUTPUT_VARIABLE PANDA3DS_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    execute_process(
+        WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} COMMAND ${GIT_EXECUTABLE} describe --tags
+        OUTPUT_VARIABLE git_version_tag OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    if(NOT PANDA3DS_VERSION STREQUAL git_version_tag)
+      execute_process(
+          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} COMMAND ${GIT_EXECUTABLE} describe --always --abbrev=7
+          OUTPUT_VARIABLE git_version_rev OUTPUT_STRIP_TRAILING_WHITESPACE
+      )
+      set(PANDA3DS_VERSION "${PANDA3DS_VERSION}.${git_version_rev}")
+      unset(git_version_rev)
+    endif()
+    string(REGEX REPLACE "^v" "" PANDA3DS_VERSION "${PANDA3DS_VERSION}")
+    unset(git_version_tag)
+endif()
+configure_file(${CMAKE_BINARY_DIR}/include/version.hpp.in ${CMAKE_BINARY_DIR}/include/version.hpp)
+include_directories(${CMAKE_BINARY_DIR}/include/)
+
 add_library(AlberCore STATIC)

 include_directories(${PROJECT_SOURCE_DIR}/include/)
@ -52,6 +123,7 @@ include_directories(${PROJECT_SOURCE_DIR}/include/kernel)
 include_directories(${FMT_INCLUDE_DIR})
 include_directories(third_party/boost/)
 include_directories(third_party/elfio/)
+include_directories(third_party/hips/include/)
 include_directories(third_party/imgui/)
 include_directories(third_party/dynarmic/src)
 include_directories(third_party/cryptopp/)
@ -82,10 +154,13 @@ if (NOT ANDROID)
    target_link_libraries(AlberCore PUBLIC SDL2-static)
 endif()

+add_subdirectory(third_party/fmt)
 add_subdirectory(third_party/toml11)
 include_directories(${SDL2_INCLUDE_DIR})
 include_directories(third_party/toml11)
 include_directories(third_party/glm)
+include_directories(third_party/renderdoc)
+include_directories(third_party/duckstation)

 add_subdirectory(third_party/cmrc)

@ -144,6 +219,18 @@ else()
    set(HOST_ARM64 FALSE)
 endif()

+# Enable SSE4.1 if it's not explicitly disabled
+# Annoyingly, we can't easily do this if we're using MSVC cause there's no SSE4.1 flag, only SSE4.1
+if(NOT MSVC OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND NOT DISABLE_SSE4 AND HOST_X64)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1")
+endif()
+
+if(ENABLE_RENDERDOC_API)
+    find_package(RenderDoc 1.6.0 MODULE REQUIRED)
+    add_compile_definitions(PANDA3DS_ENABLE_RENDERDOC)
+endif()
+
 if(HOST_X64 OR HOST_ARM64)
    set(DYNARMIC_TESTS OFF)
    #set(DYNARMIC_NO_BUNDLED_FMT ON)
@ -155,6 +242,7 @@ else()
 endif()

 add_subdirectory(third_party/teakra EXCLUDE_FROM_ALL)
+add_subdirectory(third_party/fdk-aac)

 set(CAPSTONE_ARCHITECTURE_DEFAULT OFF)
 set(CAPSTONE_ARM_SUPPORT ON)
@ -166,7 +254,7 @@ set(SOURCE_FILES src/emulator.cpp src/io_file.cpp src/config.cpp
                 src/core/CPU/cpu_dynarmic.cpp src/core/CPU/dynarmic_cycles.cpp
                 src/core/memory.cpp src/renderer.cpp src/core/renderer_null/renderer_null.cpp
                 src/http_server.cpp src/stb_image_write.c src/core/cheats.cpp src/core/action_replay.cpp
-                 src/discord_rpc.cpp src/lua.cpp src/memory_mapped_file.cpp src/miniaudio.cpp
+                 src/discord_rpc.cpp src/lua.cpp src/memory_mapped_file.cpp src/miniaudio.cpp src/renderdoc.cpp
 )
 set(CRYPTO_SOURCE_FILES src/core/crypto/aes_engine.cpp)
 set(KERNEL_SOURCE_FILES src/core/kernel/kernel.cpp src/core/kernel/resource_limits.cpp
@ -187,12 +275,13 @@ set(SERVICE_SOURCE_FILES src/core/services/service_manager.cpp src/core/services
                         src/core/services/act.cpp src/core/services/nfc.cpp src/core/services/dlp_srvr.cpp
                         src/core/services/ir_user.cpp src/core/services/http.cpp src/core/services/soc.cpp
                         src/core/services/ssl.cpp src/core/services/news_u.cpp src/core/services/amiibo_device.cpp
-                         src/core/services/csnd.cpp src/core/services/nwm_uds.cpp
+                         src/core/services/csnd.cpp src/core/services/nwm_uds.cpp src/core/services/fonts.cpp
 )
 set(PICA_SOURCE_FILES src/core/PICA/gpu.cpp src/core/PICA/regs.cpp src/core/PICA/shader_unit.cpp
                      src/core/PICA/shader_interpreter.cpp src/core/PICA/dynapica/shader_rec.cpp
                      src/core/PICA/dynapica/shader_rec_emitter_x64.cpp src/core/PICA/pica_hash.cpp
-                      src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp
+                      src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp src/core/PICA/shader_gen_glsl.cpp
+                      src/core/PICA/shader_decompiler.cpp src/core/PICA/draw_acceleration.cpp
 )

 set(LOADER_SOURCE_FILES src/core/loader/elf.cpp src/core/loader/ncsd.cpp src/core/loader/ncch.cpp src/core/loader/3dsx.cpp src/core/loader/lz77.cpp)
@ -205,7 +294,7 @@ set(APPLET_SOURCE_FILES src/core/applets/applet.cpp src/core/applets/mii_selecto
                        src/core/applets/error_applet.cpp
 )
 set(AUDIO_SOURCE_FILES src/core/audio/dsp_core.cpp src/core/audio/null_core.cpp src/core/audio/teakra_core.cpp
-                       src/core/audio/miniaudio_device.cpp src/core/audio/hle_core.cpp
+                       src/core/audio/miniaudio_device.cpp src/core/audio/hle_core.cpp src/core/audio/aac_decoder.cpp
 )
 set(RENDERER_SW_SOURCE_FILES src/core/renderer_sw/renderer_sw.cpp)

@ -224,7 +313,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
                 include/services/mic.hpp include/services/cecd.hpp include/services/ac.hpp
                 include/services/am.hpp include/services/boss.hpp include/services/frd.hpp include/services/nim.hpp
                 include/fs/archive_ext_save_data.hpp include/fs/archive_ncch.hpp include/services/mcu/mcu_hwc.hpp
-                 include/colour.hpp include/services/y2r.hpp include/services/cam.hpp include/services/ssl.hpp 
+                 include/colour.hpp include/services/y2r.hpp include/services/cam.hpp include/services/ssl.hpp
                 include/services/ldr_ro.hpp include/ipc.hpp include/services/act.hpp include/services/nfc.hpp
                 include/system_models.hpp include/services/dlp_srvr.hpp include/PICA/dynapica/pica_recs.hpp
                 include/PICA/dynapica/x64_regs.hpp include/PICA/dynapica/vertex_loader_rec.hpp include/PICA/dynapica/shader_rec.hpp
@ -235,21 +324,24 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
                 include/config.hpp include/services/ir_user.hpp include/http_server.hpp include/cheats.hpp
                 include/action_replay.hpp include/renderer_sw/renderer_sw.hpp include/compiler_builtins.hpp
                 include/fs/romfs.hpp include/fs/ivfc.hpp include/discord_rpc.hpp include/services/http.hpp include/result/result_cfg.hpp
-                 include/applets/applet.hpp include/applets/mii_selector.hpp include/math_util.hpp include/services/soc.hpp 
+                 include/applets/applet.hpp include/applets/mii_selector.hpp include/math_util.hpp include/services/soc.hpp
                 include/services/news_u.hpp include/applets/software_keyboard.hpp include/applets/applet_manager.hpp include/fs/archive_user_save_data.hpp
                 include/services/amiibo_device.hpp include/services/nfc_types.hpp include/swap.hpp include/services/csnd.hpp include/services/nwm_uds.hpp
                 include/fs/archive_system_save_data.hpp include/lua_manager.hpp include/memory_mapped_file.hpp include/hydra_icon.hpp
-                 include/PICA/dynapica/shader_rec_emitter_arm64.hpp include/scheduler.hpp include/applets/error_applet.hpp
+                 include/PICA/dynapica/shader_rec_emitter_arm64.hpp include/scheduler.hpp include/applets/error_applet.hpp include/PICA/shader_gen.hpp
                 include/audio/dsp_core.hpp include/audio/null_core.hpp include/audio/teakra_core.hpp
                 include/audio/miniaudio_device.hpp include/ring_buffer.hpp include/bitfield.hpp include/audio/dsp_shared_mem.hpp
-                 include/audio/hle_core.hpp include/capstone.hpp
+                 include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp
+                 include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
+                 include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp include/renderdoc.hpp
+                 include/align.hpp include/audio/aac_decoder.hpp include/PICA/pica_simd.hpp include/services/fonts.hpp
 )

 cmrc_add_resource_library(
    resources_console_fonts
    NAMESPACE ConsoleFonts
    WHENCE "src/core/services/fonts/"
-    "src/core/services/fonts/CitraSharedFontUSRelocated.bin"
+    "src/core/services/fonts/SharedFontReplacement.bin"
 )

 set(THIRD_PARTY_SOURCE_FILES third_party/imgui/imgui.cpp
@ -275,7 +367,6 @@ if(ENABLE_LUAJIT AND NOT ANDROID)
 endif()

 if(ENABLE_QT_GUI)
-    include_directories(third_party/duckstation)
    set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/window_info.cpp third_party/duckstation/gl/context.cpp)

    if(APPLE)
@ -308,7 +399,7 @@ if(ENABLE_OPENGL)
    set(RENDERER_GL_INCLUDE_FILES third_party/opengl/opengl.hpp
        include/renderer_gl/renderer_gl.hpp include/renderer_gl/textures.hpp
        include/renderer_gl/surfaces.hpp include/renderer_gl/surface_cache.hpp
-        include/renderer_gl/gl_state.hpp
+        include/renderer_gl/gl_state.hpp include/renderer_gl/gl_driver.hpp
    )

    set(RENDERER_GL_SOURCE_FILES src/core/renderer_gl/renderer_gl.cpp
@ -318,6 +409,8 @@ if(ENABLE_OPENGL)
        src/host_shaders/opengl_fragment_shader.frag
    )

+    set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/gl/stream_buffer.cpp)
+
    set(HEADER_FILES ${HEADER_FILES} ${RENDERER_GL_INCLUDE_FILES})
    source_group("Source Files\\Core\\OpenGL Renderer" FILES ${RENDERER_GL_SOURCE_FILES})

@ -338,8 +431,8 @@ endif()

 if(ENABLE_VULKAN)
    find_package(
-        Vulkan 1.3.206 REQUIRED
-        COMPONENTS glslangValidator
+        Vulkan REQUIRED
+        COMPONENTS glslang
    )

    set(RENDERER_VK_INCLUDE_FILES include/renderer_vk/renderer_vk.hpp
@ -382,7 +475,7 @@ if(ENABLE_VULKAN)
        add_custom_command(
            OUTPUT ${HOST_SHADER_SPIRV}
            COMMAND ${CMAKE_COMMAND} -E make_directory "${PROJECT_BINARY_DIR}/host_shaders/"
-            COMMAND Vulkan::glslangValidator ${RENDERER_VK_HOST_SHADERS_FLAGS} -V "${PROJECT_SOURCE_DIR}/${HOST_SHADER_SOURCE}" -o ${HOST_SHADER_SPIRV}
+            COMMAND glslang ${RENDERER_VK_HOST_SHADERS_FLAGS} -V "${PROJECT_SOURCE_DIR}/${HOST_SHADER_SOURCE}" -o ${HOST_SHADER_SPIRV}
            DEPENDS ${HOST_SHADER_SOURCE}
        )
        list( APPEND RENDERER_VK_HOST_SHADERS_SPIRV ${HOST_SHADER_SPIRV} )
@ -400,14 +493,88 @@ if(ENABLE_VULKAN)
    target_link_libraries(AlberCore PRIVATE Vulkan::Vulkan resources_renderer_vk)
 endif()

+if(ENABLE_METAL AND APPLE)
+    set(RENDERER_MTL_INCLUDE_FILES include/renderer_mtl/renderer_mtl.hpp
+        include/renderer_mtl/mtl_depth_stencil_cache.hpp
+        include/renderer_mtl/mtl_blit_pipeline_cache.hpp
+        include/renderer_mtl/mtl_draw_pipeline_cache.hpp
+        include/renderer_mtl/mtl_render_target.hpp
+        include/renderer_mtl/mtl_texture.hpp
+        include/renderer_mtl/mtl_vertex_buffer_cache.hpp
+        include/renderer_mtl/mtl_lut_texture.hpp
+        include/renderer_mtl/mtl_command_encoder.hpp
+        include/renderer_mtl/mtl_common.hpp
+        include/renderer_mtl/pica_to_mtl.hpp
+        include/renderer_mtl/objc_helper.hpp
+    )
+
+    set(RENDERER_MTL_SOURCE_FILES src/core/renderer_mtl/metal_cpp_impl.cpp
+        src/core/renderer_mtl/renderer_mtl.cpp
+        src/core/renderer_mtl/mtl_texture.cpp
+        src/core/renderer_mtl/mtl_etc1.cpp
+        src/core/renderer_mtl/mtl_lut_texture.cpp
+        src/core/renderer_mtl/objc_helper.mm
+        src/host_shaders/metal_shaders.metal
+        src/host_shaders/metal_blit.metal
+        #src/host_shaders/metal_copy_to_lut_texture.metal
+    )
+
+    set(HEADER_FILES ${HEADER_FILES} ${RENDERER_MTL_INCLUDE_FILES})
+    source_group("Source Files\\Core\\Metal Renderer" FILES ${RENDERER_MTL_SOURCE_FILES})
+
+    set(RENDERER_MTL_HOST_SHADERS_SOURCES)
+    function (add_metal_shader SHADER)
+        set(SHADER_SOURCE "${CMAKE_SOURCE_DIR}/src/host_shaders/${SHADER}.metal")
+        set(SHADER_IR "${CMAKE_SOURCE_DIR}/src/host_shaders/${SHADER}.ir")
+        set(SHADER_METALLIB "${CMAKE_SOURCE_DIR}/src/host_shaders/${SHADER}.metallib")
+        # TODO: only include sources in debug builds
+        add_custom_command(
+            OUTPUT ${SHADER_IR}
+            COMMAND xcrun -sdk macosx metal -gline-tables-only -frecord-sources -o ${SHADER_IR} -c ${SHADER_SOURCE}
+            DEPENDS ${SHADER_SOURCE}
+            VERBATIM)
+        add_custom_command(
+            OUTPUT ${SHADER_METALLIB}
+            COMMAND xcrun -sdk macosx metallib -o ${SHADER_METALLIB} ${SHADER_IR}
+            DEPENDS ${SHADER_IR}
+            VERBATIM)
+        set(RENDERER_MTL_HOST_SHADERS_SOURCES ${RENDERER_MTL_HOST_SHADERS_SOURCES} ${SHADER_METALLIB})
+    endfunction()
+
+    add_metal_shader(metal_shaders)
+    add_metal_shader(metal_blit)
+    #add_metal_shader(metal_copy_to_lut_texture)
+
+    add_custom_target(
+        compile_msl_shaders
+        DEPENDS ${RENDERER_MTL_HOST_SHADERS_SOURCES}
+    )
+
+    cmrc_add_resource_library(
+        resources_renderer_mtl
+        NAMESPACE RendererMTL
+        WHENCE "src/host_shaders/"
+        "src/host_shaders/metal_shaders.metallib"
+        "src/host_shaders/metal_blit.metallib"
+        #"src/host_shaders/metal_copy_to_lut_texture.metallib"
+    )
+    add_dependencies(resources_renderer_mtl compile_msl_shaders)
+
+    target_sources(AlberCore PRIVATE ${RENDERER_MTL_SOURCE_FILES})
+    target_compile_definitions(AlberCore PUBLIC "PANDA3DS_ENABLE_METAL=1")
+    target_include_directories(AlberCore PRIVATE third_party/metal-cpp)
+    # TODO: check if all of them are needed
+    target_link_libraries(AlberCore PRIVATE "-framework Metal" "-framework Foundation" "-framework QuartzCore" resources_renderer_mtl)
+endif()
+
 source_group("Header Files\\Core" FILES ${HEADER_FILES})
-set(ALL_SOURCES ${SOURCE_FILES} ${FS_SOURCE_FILES} ${CRYPTO_SOURCE_FILES} ${KERNEL_SOURCE_FILES} 
+set(ALL_SOURCES ${SOURCE_FILES} ${FS_SOURCE_FILES} ${CRYPTO_SOURCE_FILES} ${KERNEL_SOURCE_FILES}
    ${LOADER_SOURCE_FILES} ${SERVICE_SOURCE_FILES} ${APPLET_SOURCE_FILES} ${RENDERER_SW_SOURCE_FILES} ${PICA_SOURCE_FILES} ${THIRD_PARTY_SOURCE_FILES}
    ${AUDIO_SOURCE_FILES} ${HEADER_FILES} ${FRONTEND_HEADER_FILES})
 target_sources(AlberCore PRIVATE ${ALL_SOURCES})

-target_link_libraries(AlberCore PRIVATE dynarmic cryptopp glad resources_console_fonts teakra)
-target_link_libraries(AlberCore PUBLIC glad capstone)
+target_link_libraries(AlberCore PRIVATE dynarmic cryptopp glad resources_console_fonts teakra fdk-aac)
+target_link_libraries(AlberCore PUBLIC glad capstone fmt::fmt)

 if(ENABLE_DISCORD_RPC AND NOT ANDROID)
    target_compile_definitions(AlberCore PUBLIC "PANDA3DS_ENABLE_DISCORD_RPC=1")
@ -438,7 +605,7 @@ else()
    target_compile_definitions(AlberCore PUBLIC "PANDA3DS_FRONTEND_SDL=1")
 endif()

-if(NOT BUILD_HYDRA_CORE)
+if(NOT BUILD_HYDRA_CORE AND NOT BUILD_LIBRETRO_CORE)
    add_executable(Alber)

    if(ENABLE_QT_GUI)
@ -447,11 +614,16 @@ if(NOT BUILD_HYDRA_CORE)
            message(FATAL_ERROR "Qt frontend requires OpenGL")
        endif()

+        option(GENERATE_QT_TRANSLATION "Generate Qt translation file" OFF)
+        set(QT_LANGUAGES docs/translations)
+
        set(FRONTEND_SOURCE_FILES src/panda_qt/main.cpp src/panda_qt/screen.cpp src/panda_qt/main_window.cpp src/panda_qt/about_window.cpp
            src/panda_qt/config_window.cpp src/panda_qt/zep.cpp src/panda_qt/text_editor.cpp src/panda_qt/cheats_window.cpp src/panda_qt/mappings.cpp
-        )   
+            src/panda_qt/patch_window.cpp src/panda_qt/elided_label.cpp src/panda_qt/shader_editor.cpp
+        )
        set(FRONTEND_HEADER_FILES include/panda_qt/screen.hpp include/panda_qt/main_window.hpp include/panda_qt/about_window.hpp
            include/panda_qt/config_window.hpp include/panda_qt/text_editor.hpp include/panda_qt/cheats_window.hpp
+            include/panda_qt/patch_window.hpp include/panda_qt/elided_label.hpp include/panda_qt/shader_editor.hpp
        )

        source_group("Source Files\\Qt" FILES ${FRONTEND_SOURCE_FILES})
@ -481,27 +653,47 @@ if(NOT BUILD_HYDRA_CORE)
            endif()
        endif()

+        # Generates an en.ts file for translations
+        # To update the file, use cmake --build --target Alber_lupdate
+        if(GENERATE_QT_TRANSLATION)
+            find_package(Qt6 REQUIRED COMPONENTS LinguistTools)
+            qt_add_lupdate(Alber TS_FILES ${QT_LANGUAGES}/en.ts
+                SOURCES ${FRONTEND_SOURCE_FILES}
+                INCLUDE_DIRECTORIES ${FRONTEND_HEADER_FILES}
+                NO_GLOBAL_TARGET
+            )
+        endif()
+
        qt_add_resources(AlberCore "app_images"
            PREFIX "/"
            FILES
-                docs/img/rsob_icon.png docs/img/rstarstruck_icon.png
+                docs/img/rsob_icon.png docs/img/rstarstruck_icon.png docs/img/rpog_icon.png docs/img/rsyn_icon.png
        )
    else()
        set(FRONTEND_SOURCE_FILES src/panda_sdl/main.cpp src/panda_sdl/frontend_sdl.cpp src/panda_sdl/mappings.cpp)
-        set(FRONTEND_HEADER_FILES "")
+        set(FRONTEND_HEADER_FILES "include/panda_sdl/frontend_sdl.hpp")
    endif()

    target_link_libraries(Alber PRIVATE AlberCore)
-    target_sources(Alber PRIVATE ${FRONTEND_SOURCE_FILES} ${FRONTEND_HEADER_FILES})
+    target_sources(Alber PRIVATE ${FRONTEND_SOURCE_FILES} ${FRONTEND_HEADER_FILES} ${APP_RESOURCES})
 elseif(BUILD_HYDRA_CORE)
    target_compile_definitions(AlberCore PRIVATE PANDA3DS_HYDRA_CORE=1)
    include_directories(third_party/hydra_core/include)
    add_library(Alber SHARED src/hydra_core.cpp)
    target_link_libraries(Alber PUBLIC AlberCore)
+elseif(BUILD_LIBRETRO_CORE)
+    include_directories(third_party/libretro/include)
+    add_library(panda3ds_libretro SHARED src/libretro_core.cpp)
+    target_link_libraries(panda3ds_libretro PUBLIC AlberCore)
+    set_target_properties(panda3ds_libretro PROPERTIES PREFIX "")
 endif()

 if(ENABLE_LTO OR ENABLE_USER_BUILD)
-    set_target_properties(Alber PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE)
+    if (NOT BUILD_LIBRETRO_CORE)
+        set_target_properties(Alber PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE)
+    else()
+        set_target_properties(panda3ds_libretro PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE)
+    endif()
 endif()

 if(ENABLE_TESTS)
--- a/cmake/FindRenderDoc.cmake
+++ b/cmake/FindRenderDoc.cmake
@ -0,0 +1,25 @@
+# SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+set(RENDERDOC_INCLUDE_DIR third_party/renderdoc)
+
+if (RENDERDOC_INCLUDE_DIR AND EXISTS "${RENDERDOC_INCLUDE_DIR}/renderdoc_app.h")
+    file(STRINGS "${RENDERDOC_INCLUDE_DIR}/renderdoc_app.h" RENDERDOC_VERSION_LINE REGEX "typedef struct RENDERDOC_API")
+    string(REGEX REPLACE ".*typedef struct RENDERDOC_API_([0-9]+)_([0-9]+)_([0-9]+).*" "\\1.\\2.\\3" RENDERDOC_VERSION "${RENDERDOC_VERSION_LINE}")
+    unset(RENDERDOC_VERSION_LINE)
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(RenderDoc
+    REQUIRED_VARS RENDERDOC_INCLUDE_DIR
+    VERSION_VAR RENDERDOC_VERSION
+)
+
+if (RenderDoc_FOUND AND NOT TARGET RenderDoc::API)
+    add_library(RenderDoc::API INTERFACE IMPORTED)
+    set_target_properties(RenderDoc::API PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES "${RENDERDOC_INCLUDE_DIR}"
+    )
+endif()
+
+mark_as_advanced(RENDERDOC_INCLUDE_DIR)
--- a/docs/3ds/accelerometer_readings/readings_flat_1.png
+++ b/docs/3ds/accelerometer_readings/readings_flat_1.png
--- a/docs/3ds/accelerometer_readings/readings_flat_2.png
+++ b/docs/3ds/accelerometer_readings/readings_flat_2.png
--- a/docs/3ds/accelerometer_readings/readings_shaking_1.png
+++ b/docs/3ds/accelerometer_readings/readings_shaking_1.png
--- a/docs/3ds/accelerometer_readings/readings_shaking_2.png
+++ b/docs/3ds/accelerometer_readings/readings_shaking_2.png
--- a/docs/3ds/lighting.md
+++ b/docs/3ds/lighting.md
@ -0,0 +1,79 @@
+## Info on the lighting implementation
+
+### Missing shadow attenuation
+Shadow attenuation samples a texture unit, and that likely needs render to texture for most games so that they can construct
+their shadow map. As such the colors are not multiplied by the shadow attenuation value, so there's no shadows.
+
+### Missing bump mapping
+Bump mapping also samples a texture unit, most likely doesn't need render to texture however may need better texture sampling
+implementation (such as GPUREG_TEXUNITi_BORDER_COLOR, GPUREG_TEXUNITi_BORDER_PARAM). Bump mapping would work for some things,
+namely the 3ds-examples bump mapping demo, but would break others such as Toad Treasure Tracker with a naive `texture` implementation.
+
+Also the CP configuration is missing, because it needs a tangent map implementation. It is currently marked with error_unimpl.
+
+### samplerEnabledBitfields
+Holds the enabled state of the lighting samples for various PICA configurations
+As explained in https://www.3dbrew.org/wiki/GPU/Internal_Registers#GPUREG_LIGHTING_CONFIG0
+
+```c
+const bool samplerEnabled[9 * 7] = bool[9 * 7](
+	// D0     D1     SP     FR     RB     RG     RR
+	true,  false, true,  false, false, false, true,  // Configuration 0: D0, SP, RR
+	false, false, true,  true,  false, false, true,  // Configuration 1: FR, SP, RR
+	true,  true,  false, false, false, false, true,  // Configuration 2: D0, D1, RR
+	true,  true,  false, true,  false, false, false, // Configuration 3: D0, D1, FR
+	true,  true,  true,  false, true,  true,  true,  // Configuration 4: All except for FR
+	true,  false, true,  true,  true,  true,  true,  // Configuration 5: All except for D1
+	true,  true,  true,  true,  false, false, true,  // Configuration 6: All except for RB and RG
+	false, false, false, false, false, false, false, // Configuration 7: Unused
+	true,  true,  true,  true,  true,  true,  true   // Configuration 8: All
+);
+```
+
+The above has been condensed to two uints for performance reasons.
+You can confirm they are the same by running the following:
+```c
+const uint samplerEnabledBitfields[2] = { 0x7170e645u, 0x7f013fefu };
+for (int i = 0; i < 9 * 7; i++) {
+	unsigned arrayIndex = (i >> 5);
+	bool b = (samplerEnabledBitfields[arrayIndex] & (1u << (i & 31))) != 0u;
+	if (samplerEnabled[i] == b) {
+		printf("%d: happy\n", i);
+	} else {
+		printf("%d: unhappy\n", i);
+	}
+}
+```
+
+### lightLutLookup
+lut_id is one of these values
+0 	D0
+1 	D1
+2 	SP
+3 	FR
+4 	RB
+5 	RG
+6 	RR 
+
+lut_index on the other hand represents the actual index of the LUT in the texture
+u_tex_luts has 24 LUTs for lighting and they are used like so:
+0 		D0
+1 		D1
+2 		is missing because SP uses LUTs 8-15
+3 		FR
+4 		RB
+5 		RG
+6 		RR
+8-15 	SP0-7
+16-23 	DA0-7, but this is not handled in this function as the lookup is a bit different
+
+The light environment configuration controls which LUTs are available for use
+If a LUT is not available in the selected configuration, its value will always read a constant 1.0 regardless of the enable state in GPUREG_LIGHTING_CONFIG1
+If RR is enabled but not RG or RB, the output of RR is used for the three components; Red, Green and Blue.
+
+### Distance attenuation
+Distance attenuation is computed differently from the other factors, for example
+it doesn't store its scale in GPUREG_LIGHTING_LUTINPUT_SCALE and it doesn't use 
+GPUREG_LIGHTING_LUTINPUT_SELECT. Instead, it uses the distance from the light to the
+fragment and the distance attenuation scale and bias to calculate where in the LUT to look up.
+See: https://www.3dbrew.org/wiki/GPU/Internal_Registers#GPUREG_LIGHTi_ATTENUATION_SCALE
--- a/docs/img/KirbyAndroid.png
+++ b/docs/img/KirbyAndroid.png
--- a/docs/img/alber-icon.ico
+++ b/docs/img/alber-icon.ico
--- a/docs/img/rpog_icon.png
+++ b/docs/img/rpog_icon.png
--- a/docs/img/rsyn_icon.png
+++ b/docs/img/rsyn_icon.png
--- a/docs/img/windows_alt_icon.ico
+++ b/docs/img/windows_alt_icon.ico
--- a/docs/img/windows_icon.ico
+++ b/docs/img/windows_icon.ico
--- a/docs/img/windows_icon.rc
+++ b/docs/img/windows_icon.rc
@ -0,0 +1 @@
+AlberIcon ICON "windows_icon.ico"
--- a/docs/libretro/panda3ds_libretro.info
+++ b/docs/libretro/panda3ds_libretro.info
@ -0,0 +1,34 @@
+# Software Information
+display_name = "Nintendo - 3DS (Panda3DS)"
+authors = "Panda3DS Authors (tm)"
+supported_extensions = "3ds|3dsx|elf|axf|cci|cxi|app"
+corename = "Panda3DS"
+categories = "Emulator"
+license = "GPLv3"
+permissions = ""
+display_version = "Git"
+
+# Hardware Information
+manufacturer = "Nintendo"
+systemname = "3DS"
+systemid = "3ds"
+
+# Libretro Information
+database = "Nintendo - Nintendo 3DS"
+supports_no_game = "false"
+savestate = "true"
+savestate_features = "basic"
+cheats = "false"
+input_descriptors = "true"
+memory_descriptors = "false"
+libretro_saves = "true"
+core_options = "true"
+core_options_version = "1.0"
+load_subsystem = "false"
+hw_render = "true"
+required_hw_api = "OpenGL Core >= 4.1"
+needs_fullpath = "true"
+disk_control = "false"
+is_experimental = "true"
+
+description = "Panda3DS !"
--- a/include/PICA/draw_acceleration.hpp
+++ b/include/PICA/draw_acceleration.hpp
@ -0,0 +1,45 @@
+#pragma once
+
+#include <array>
+
+#include "helpers.hpp"
+
+namespace PICA {
+	struct DrawAcceleration {
+		static constexpr u32 maxAttribCount = 16;
+		static constexpr u32 maxLoaderCount = 12;
+
+		struct AttributeInfo {
+			u32 offset;
+			u32 stride;
+
+			u8 type;
+			u8 componentCount;
+
+			std::array<float, 4> fixedValue;  // For fixed attributes
+		};
+
+		struct Loader {
+			// Data to upload for this loader
+			u8* data;
+			usize size;
+		};
+
+		u8* indexBuffer;
+
+		// Minimum and maximum index in the index buffer for a draw call
+		u16 minimumIndex, maximumIndex;
+		u32 totalAttribCount;
+		u32 totalLoaderCount;
+		u32 enabledAttributeMask;
+		u32 fixedAttributes;
+		u32 vertexDataSize;
+
+		std::array<AttributeInfo, maxAttribCount> attributeInfo;
+		std::array<Loader, maxLoaderCount> loaders;
+
+		bool canBeAccelerated;
+		bool indexed;
+		bool useShortIndices;
+	};
+}  // namespace PICA
--- a/include/PICA/dynapica/pica_recs.hpp
+++ b/include/PICA/dynapica/pica_recs.hpp
@ -2,7 +2,7 @@
 #include "helpers.hpp"
 #include "vertex_loader_rec.hpp"

-// Common file for our PICA JITs (From vertex config -> CPU assembly and from PICA shader -> CPU assembly)
+// Common file for our PICA JITs (From PICA shader -> CPU assembly)

 namespace Dynapica {
 #ifdef PANDA3DS_DYNAPICA_SUPPORTED
--- a/include/PICA/dynapica/shader_rec.hpp
+++ b/include/PICA/dynapica/shader_rec.hpp
@ -22,8 +22,11 @@ class ShaderJIT {

 	ShaderCache cache;
 #endif
+	bool accurateMul = false;

  public:
+	void setAccurateMul(bool value) { accurateMul = value; }
+
 #ifdef PANDA3DS_SHADER_JIT_SUPPORTED
 	// Call this before starting to process a batch of vertices
 	// This will read the PICA config (uploaded shader and shader operand descriptors) and search if we've already compiled this shader
@ -36,11 +39,11 @@ class ShaderJIT {
 	static constexpr bool isAvailable() { return true; }
 #else
 	void prepare(PICAShader& shaderUnit) {
-		Helpers::panic("Vertex Loader JIT: Tried to run ShaderJIT::Prepare on platform that does not support shader jit");
+		Helpers::panic("Shader JIT: Tried to run ShaderJIT::Prepare on platform that does not support shader jit");
 	}

 	void run(PICAShader& shaderUnit) {
-		Helpers::panic("Vertex Loader JIT: Tried to run ShaderJIT::Run on platform that does not support shader jit");
+		Helpers::panic("Shader JIT: Tried to run ShaderJIT::Run on platform that does not support shader jit");
 	}

 	// Define dummy callback. This should never be called if the shader JIT is not supported
--- a/include/PICA/dynapica/shader_rec_emitter_arm64.hpp
+++ b/include/PICA/dynapica/shader_rec_emitter_arm64.hpp
@ -37,6 +37,8 @@ class ShaderEmitter : private oaknut::CodeBlock, public oaknut::CodeGenerator {
 	// Shows whether the loaded shader has any log2 and exp2 instructions
 	bool codeHasLog2 = false;
 	bool codeHasExp2 = false;
+	// Whether to compile this shader using accurate, safe, non-IEEE multiplication (slow) or faster but less accurate mul
+	bool useSafeMUL = false;

 	oaknut::Label log2Func, exp2Func;
 	oaknut::Label emitLog2Func();
@ -123,7 +125,7 @@ class ShaderEmitter : private oaknut::CodeBlock, public oaknut::CodeGenerator {
 	PrologueCallback prologueCb = nullptr;

 	// Initialize our emitter with "allocSize" bytes of memory allocated for the code buffer
-	ShaderEmitter() : oaknut::CodeBlock(allocSize), oaknut::CodeGenerator(oaknut::CodeBlock::ptr()) {}
+	ShaderEmitter(bool useSafeMUL) : oaknut::CodeBlock(allocSize), oaknut::CodeGenerator(oaknut::CodeBlock::ptr()), useSafeMUL(useSafeMUL) {}

 	// PC must be a valid entrypoint here. It doesn't have that much overhead in this case, so we use std::array<>::at() to assert it does
 	InstructionCallback getInstructionCallback(u32 pc) { return getLabelPointer<InstructionCallback>(instructionLabels.at(pc)); }
--- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp
+++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp
@ -32,6 +32,8 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 	Label negateVector;
 	// Vector value of (1.0, 1.0, 1.0, 1.0) for SLT(i)/SGE(i)
 	Label onesVector;
+	// Vector value of (0xFF, 0xFF, 0xFF, 0) for setting the w component to 0 in DP3
+	Label dp3Vector;

 	u32 recompilerPC = 0;  // PC the recompiler is currently recompiling @
 	u32 loopLevel = 0;     // The current loop nesting level (0 = not in a loop)
@ -43,12 +45,17 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 	// Shows whether the loaded shader has any log2 and exp2 instructions
 	bool codeHasLog2 = false;
 	bool codeHasExp2 = false;
+	// Whether to compile this shader using accurate, safe, non-IEEE multiplication (slow) or faster but less accurate mul
+	bool useSafeMUL = false;
 	
 	Xbyak::Label log2Func, exp2Func;
 	Xbyak::Label emitLog2Func();
 	Xbyak::Label emitExp2Func();
 	Xbyak::util::Cpu cpuCaps;

+	// Emit a PICA200-compliant multiplication that handles "0 * inf = 0"
+	void emitSafeMUL(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch);
+
 	// Compile all instructions from [current recompiler PC, end)
 	void compileUntil(const PICAShader& shaderUnit, u32 endPC);
 	// Compile instruction "instr"
@ -125,7 +132,7 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 	PrologueCallback prologueCb = nullptr;

 	// Initialize our emitter with "allocSize" bytes of RWX memory
-	ShaderEmitter() : Xbyak::CodeGenerator(allocSize) {
+	ShaderEmitter(bool useSafeMUL) : Xbyak::CodeGenerator(allocSize), useSafeMUL(useSafeMUL) {
 		cpuCaps = Xbyak::util::Cpu();

 		haveSSE4_1 = cpuCaps.has(Xbyak::util::Cpu::tSSE41);
--- a/include/PICA/gpu.hpp
+++ b/include/PICA/gpu.hpp
@ -1,6 +1,7 @@
 #pragma once
 #include <array>

+#include "PICA/draw_acceleration.hpp"
 #include "PICA/dynapica/shader_rec.hpp"
 #include "PICA/float_types.hpp"
 #include "PICA/pica_vertex.hpp"
@ -13,6 +14,12 @@
 #include "memory.hpp"
 #include "renderer.hpp"

+enum class ShaderExecMode {
+	Interpreter,  // Interpret shaders on the CPU
+	JIT,          // Recompile shaders to CPU machine code
+	Hardware,     // Recompiler shaders to host shaders and run them on the GPU
+};
+
 class GPU {
 	static constexpr u32 regNum = 0x300;
 	static constexpr u32 extRegNum = 0x1000;
@ -45,7 +52,7 @@ class GPU {
 	uint immediateModeVertIndex;
 	uint immediateModeAttrIndex;  // Index of the immediate mode attribute we're uploading

-	template <bool indexed, bool useShaderJIT>
+	template <bool indexed, ShaderExecMode mode>
 	void drawArrays();

 	// Silly method of avoiding linking problems. TODO: Change to something less silly
@ -81,6 +88,7 @@ class GPU {
 	std::unique_ptr<Renderer> renderer;
 	PICA::Vertex getImmediateModeVertex();

+	void getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed);
  public:
 	// 256 entries per LUT with each LUT as its own row forming a 2D image 256 * LUT_COUNT
 	// Encoded in PICA native format
@ -92,6 +100,9 @@ class GPU {
 	// Set to false by the renderer when the lighting_lut is uploaded ot the GPU
 	bool lightingLUTDirty = false;

+	bool fogLUTDirty = false;
+	std::array<uint32_t, 128> fogLUT;
+
 	GPU(Memory& mem, EmulatorConfig& config);
 	void display() { renderer->display(); }
 	void screenshot(const std::string& name) { renderer->screenshot(name); }
@ -164,7 +175,8 @@ class GPU {
 			u32 index = paddr - PhysicalAddrs::VRAM;
 			return (T*)&vram[index];
 		} else [[unlikely]] {
-			Helpers::panic("[GPU] Tried to access unknown physical address: %08X", paddr);
+			Helpers::warn("[GPU] Tried to access unknown physical address: %08X", paddr);
+			return nullptr;
 		}
 	}

--- a/include/PICA/pica_frag_config.hpp
+++ b/include/PICA/pica_frag_config.hpp
@ -0,0 +1,257 @@
+#pragma once
+#include <array>
+#include <cstring>
+#include <type_traits>
+#include <unordered_map>
+
+#include "PICA/pica_hash.hpp"
+#include "PICA/regs.hpp"
+#include "bitfield.hpp"
+#include "helpers.hpp"
+
+namespace PICA {
+	struct OutputConfig {
+		union {
+			u32 raw{};
+			// Merge the enable + compare function into 1 field to avoid duplicate shaders
+			// enable == off means a CompareFunction of Always
+			BitField<0, 3, CompareFunction> alphaTestFunction;
+			BitField<3, 1, u32> depthMapEnable;
+			BitField<4, 4, LogicOpMode> logicOpMode;
+		};
+	};
+
+	struct TextureConfig {
+		u32 texUnitConfig;
+		u32 texEnvUpdateBuffer;
+
+		// There's 6 TEV stages, and each one is configured via 4 word-sized registers
+		// (+ the constant color register, which we don't include here, otherwise we'd generate too many shaders)
+		std::array<u32, 4 * 6> tevConfigs;
+	};
+
+	struct FogConfig {
+		union {
+			u32 raw{};
+
+			BitField<0, 3, FogMode> mode;
+			BitField<3, 1, u32> flipDepth;
+		};
+	};
+
+	struct Light {
+		union {
+			u16 raw;
+			BitField<0, 3, u16> num;
+			BitField<3, 1, u16> directional;
+			BitField<4, 1, u16> twoSidedDiffuse;
+			BitField<5, 1, u16> distanceAttenuationEnable;
+			BitField<6, 1, u16> spotAttenuationEnable;
+			BitField<7, 1, u16> geometricFactor0;
+			BitField<8, 1, u16> geometricFactor1;
+			BitField<9, 1, u16> shadowEnable;
+		};
+	};
+
+	struct LightingLUTConfig {
+		union {
+			u32 raw;
+			BitField<0, 1, u32> enable;
+			BitField<1, 1, u32> absInput;
+			BitField<2, 3, u32> type;
+			BitField<5, 3, u32> scale;
+		};
+	};
+
+	struct LightingConfig {
+		union {
+			u32 raw{};
+			BitField<0, 1, u32> enable;
+			BitField<1, 4, u32> lightNum;
+			BitField<5, 2, u32> bumpMode;
+			BitField<7, 2, u32> bumpSelector;
+			BitField<9, 1, u32> bumpRenorm;
+			BitField<10, 1, u32> clampHighlights;
+			BitField<11, 4, u32> config;
+			BitField<15, 1, u32> enablePrimaryAlpha;
+			BitField<16, 1, u32> enableSecondaryAlpha;
+			BitField<17, 1, u32> enableShadow;
+			BitField<18, 1, u32> shadowPrimary;
+			BitField<19, 1, u32> shadowSecondary;
+			BitField<20, 1, u32> shadowInvert;
+			BitField<21, 1, u32> shadowAlpha;
+			BitField<22, 2, u32> shadowSelector;
+		};
+
+		std::array<LightingLUTConfig, 7> luts{};
+
+		std::array<Light, 8> lights{};
+
+		LightingConfig(const std::array<u32, 0x300>& regs) {
+			// Ignore lighting registers if it's disabled
+			if ((regs[InternalRegs::LightingEnable] & 1) == 0) {
+				return;
+			}
+
+			const u32 config0 = regs[InternalRegs::LightConfig0];
+			const u32 config1 = regs[InternalRegs::LightConfig1];
+			const u32 totalLightCount = Helpers::getBits<0, 3>(regs[InternalRegs::LightNumber]) + 1;
+
+			enable = 1;
+			lightNum = totalLightCount;
+
+			enableShadow = Helpers::getBit<0>(config0);
+			if (enableShadow) [[unlikely]] {
+				shadowPrimary = Helpers::getBit<16>(config0);
+				shadowSecondary = Helpers::getBit<17>(config0);
+				shadowInvert = Helpers::getBit<18>(config0);
+				shadowAlpha = Helpers::getBit<19>(config0);
+				shadowSelector = Helpers::getBits<24, 2>(config0);
+			}
+
+			enablePrimaryAlpha = Helpers::getBit<2>(config0);
+			enableSecondaryAlpha = Helpers::getBit<3>(config0);
+			config = Helpers::getBits<4, 4>(config0);
+
+			bumpSelector = Helpers::getBits<22, 2>(config0);
+			clampHighlights = Helpers::getBit<27>(config0);
+			bumpMode = Helpers::getBits<28, 2>(config0);
+			bumpRenorm = Helpers::getBit<30>(config0) ^ 1; // 0 = enable so flip it with xor
+
+			for (int i = 0; i < totalLightCount; i++) {
+				auto& light = lights[i];
+				light.num = (regs[InternalRegs::LightPermutation] >> (i * 4)) & 0x7;
+
+				const u32 lightConfig = regs[InternalRegs::Light0Config + 0x10 * light.num];
+				light.directional = Helpers::getBit<0>(lightConfig);
+				light.twoSidedDiffuse = Helpers::getBit<1>(lightConfig);
+				light.geometricFactor0 = Helpers::getBit<2>(lightConfig);
+				light.geometricFactor1 = Helpers::getBit<3>(lightConfig);
+
+				light.shadowEnable = ((config1 >> light.num) & 1) ^ 1;                      // This also does 0 = enabled
+				light.spotAttenuationEnable = ((config1 >> (8 + light.num)) & 1) ^ 1;       // Same here
+				light.distanceAttenuationEnable = ((config1 >> (24 + light.num)) & 1) ^ 1;  // Of course same here
+			}
+
+			LightingLUTConfig& d0 = luts[Lights::LUT_D0];
+			LightingLUTConfig& d1 = luts[Lights::LUT_D1];
+			LightingLUTConfig& sp = luts[spotlightLutIndex];
+			LightingLUTConfig& fr = luts[Lights::LUT_FR];
+			LightingLUTConfig& rb = luts[Lights::LUT_RB];
+			LightingLUTConfig& rg = luts[Lights::LUT_RG];
+			LightingLUTConfig& rr = luts[Lights::LUT_RR];
+
+			d0.enable = Helpers::getBit<16>(config1) == 0;
+			d1.enable = Helpers::getBit<17>(config1) == 0;
+			fr.enable = Helpers::getBit<19>(config1) == 0;
+			rb.enable = Helpers::getBit<20>(config1) == 0;
+			rg.enable = Helpers::getBit<21>(config1) == 0;
+			rr.enable = Helpers::getBit<22>(config1) == 0;
+			sp.enable = 1;
+
+			const u32 lutAbs = regs[InternalRegs::LightLUTAbs];
+			const u32 lutSelect = regs[InternalRegs::LightLUTSelect];
+			const u32 lutScale = regs[InternalRegs::LightLUTScale];
+
+			if (d0.enable) {
+				d0.absInput = Helpers::getBit<1>(lutAbs) == 0;
+				d0.type = Helpers::getBits<0, 3>(lutSelect);
+				d0.scale = Helpers::getBits<0, 3>(lutScale);
+			}
+
+			if (d1.enable) {
+				d1.absInput = Helpers::getBit<5>(lutAbs) == 0;
+				d1.type = Helpers::getBits<4, 3>(lutSelect);
+				d1.scale = Helpers::getBits<4, 3>(lutScale);
+			}
+
+			sp.absInput = Helpers::getBit<9>(lutAbs) == 0;
+			sp.type = Helpers::getBits<8, 3>(lutSelect);
+			sp.scale = Helpers::getBits<8, 3>(lutScale);
+
+			if (fr.enable) {
+				fr.absInput = Helpers::getBit<13>(lutAbs) == 0;
+				fr.type = Helpers::getBits<12, 3>(lutSelect);
+				fr.scale = Helpers::getBits<12, 3>(lutScale);
+			}
+
+			if (rb.enable) {
+				rb.absInput = Helpers::getBit<17>(lutAbs) == 0;
+				rb.type = Helpers::getBits<16, 3>(lutSelect);
+				rb.scale = Helpers::getBits<16, 3>(lutScale);
+			}
+
+			if (rg.enable) {
+				rg.absInput = Helpers::getBit<21>(lutAbs) == 0;
+				rg.type = Helpers::getBits<20, 3>(lutSelect);
+				rg.scale = Helpers::getBits<20, 3>(lutScale);
+			}
+
+			if (rr.enable) {
+				rr.absInput = Helpers::getBit<25>(lutAbs) == 0;
+				rr.type = Helpers::getBits<24, 3>(lutSelect);
+				rr.scale = Helpers::getBits<24, 3>(lutScale);
+			}
+		}
+	};
+
+	// Config used for identifying unique fragment pipeline configurations
+	struct FragmentConfig {
+		OutputConfig outConfig;
+		TextureConfig texConfig;
+		FogConfig fogConfig;
+		LightingConfig lighting;
+
+		bool operator==(const FragmentConfig& config) const {
+			// Hash function and equality operator required by std::unordered_map
+			return std::memcmp(this, &config, sizeof(FragmentConfig)) == 0;
+		}
+
+		FragmentConfig(const std::array<u32, 0x300>& regs) : lighting(regs) {
+			auto alphaTestConfig = regs[InternalRegs::AlphaTestConfig];
+			auto alphaTestFunction = Helpers::getBits<4, 3>(alphaTestConfig);
+
+			outConfig.alphaTestFunction =
+				(alphaTestConfig & 1) ? static_cast<PICA::CompareFunction>(alphaTestFunction) : PICA::CompareFunction::Always;
+			outConfig.depthMapEnable = regs[InternalRegs::DepthmapEnable] & 1;
+
+			// Shows if blending is enabled. If it is not enabled, then logic ops are enabled instead
+			const bool blendingEnabled = (regs[InternalRegs::ColourOperation] & (1 << 8)) != 0;
+			outConfig.logicOpMode = blendingEnabled ? LogicOpMode::Copy : LogicOpMode(Helpers::getBits<0, 4>(regs[InternalRegs::LogicOp]));
+
+			texConfig.texUnitConfig = regs[InternalRegs::TexUnitCfg];
+			texConfig.texEnvUpdateBuffer = regs[InternalRegs::TexEnvUpdateBuffer];
+
+			// Set up TEV stages. Annoyingly we can't just memcpy as the TEV registers are arranged like
+			// {Source, Operand, Combiner, Color, Scale} and we want to skip the color register since it's uploaded via UBO
+#define setupTevStage(stage)                                                                                    \
+	std::memcpy(&texConfig.tevConfigs[stage * 4], &regs[InternalRegs::TexEnv##stage##Source], 3 * sizeof(u32)); \
+	texConfig.tevConfigs[stage * 4 + 3] = regs[InternalRegs::TexEnv##stage##Source + 4];
+
+			setupTevStage(0);
+			setupTevStage(1);
+			setupTevStage(2);
+			setupTevStage(3);
+			setupTevStage(4);
+			setupTevStage(5);
+#undef setupTevStage
+
+			fogConfig.mode = (FogMode)Helpers::getBits<0, 3>(regs[InternalRegs::TexEnvUpdateBuffer]);
+
+			if (fogConfig.mode == FogMode::Fog) {
+				fogConfig.flipDepth = Helpers::getBit<16>(regs[InternalRegs::TexEnvUpdateBuffer]);
+			}
+		}
+	};
+
+	static_assert(
+		std::has_unique_object_representations<OutputConfig>() && std::has_unique_object_representations<TextureConfig>() &&
+		std::has_unique_object_representations<FogConfig>() && std::has_unique_object_representations<Light>()
+	);
+}  // namespace PICA
+
+// Override std::hash for our fragment config class
+template <>
+struct std::hash<PICA::FragmentConfig> {
+	std::size_t operator()(const PICA::FragmentConfig& config) const noexcept { return PICAHash::computeHash((const char*)&config, sizeof(config)); }
+};
--- a/include/PICA/pica_frag_uniforms.hpp
+++ b/include/PICA/pica_frag_uniforms.hpp
@ -0,0 +1,47 @@
+#pragma once
+#include <array>
+#include <cstddef>
+#include <type_traits>
+
+#include "helpers.hpp"
+
+namespace PICA {
+	struct LightUniform {
+		using vec3 = std::array<float, 3>;
+
+		// std140 requires vec3s be aligned to 16 bytes
+		alignas(16) vec3 specular0;
+		alignas(16) vec3 specular1;
+		alignas(16) vec3 diffuse;
+		alignas(16) vec3 ambient;
+		alignas(16) vec3 position;
+		alignas(16) vec3 spotlightDirection;
+		
+		float distanceAttenuationBias;
+		float distanceAttenuationScale;
+	};
+
+	struct FragmentUniforms {
+		using vec3 = std::array<float, 3>;
+		using vec4 = std::array<float, 4>;
+		static constexpr usize tevStageCount = 6;
+
+		s32 alphaReference;
+		float depthScale;
+		float depthOffset;
+
+		alignas(16) vec4 constantColors[tevStageCount];
+		alignas(16) vec4 tevBufferColor;
+		alignas(16) vec4 clipCoords;
+
+		// Note: We upload these as a u32 and decode on GPU.
+		// Particularly the fog colour since fog is really uncommon and it doesn't matter if we decode on GPU.
+		u32 globalAmbientLight;
+		u32 fogColor;
+		// NOTE: THIS MUST BE LAST so that if lighting is disabled we can potentially omit uploading it
+		LightUniform lightUniforms[8];
+	};
+
+	// Assert that lightUniforms is the last member of the structure
+	static_assert(offsetof(FragmentUniforms, lightUniforms) + 8 * sizeof(LightUniform) == sizeof(FragmentUniforms));
+}  // namespace PICA
--- a/include/PICA/pica_simd.hpp
+++ b/include/PICA/pica_simd.hpp
@ -0,0 +1,274 @@
+#pragma once
+#include <algorithm>
+#include <limits>
+#include <utility>
+
+#include "helpers.hpp"
+
+#if defined(_M_AMD64) || defined(__x86_64__)
+#define PICA_SIMD_X64
+#include <immintrin.h>
+#elif defined(_M_ARM64) || defined(__aarch64__)
+#define PICA_SIMD_ARM64
+#include <arm_neon.h>
+#endif
+
+// Optimized functions for analyzing PICA index buffers (Finding minimum and maximum index values inside them)
+namespace PICA::IndexBuffer {
+	// Non-SIMD, portable algorithm
+	template <bool useShortIndices>
+	std::pair<u16, u16> analyzePortable(u8* indexBuffer, u32 vertexCount) {
+		u16 minimumIndex = std::numeric_limits<u16>::max();
+		u16 maximumIndex = 0;
+
+		// Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them
+		if constexpr (useShortIndices) {
+			u16* indexBuffer16 = reinterpret_cast<u16*>(indexBuffer);
+
+			for (u32 i = 0; i < vertexCount; i++) {
+				u16 index = indexBuffer16[i];
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+		} else {
+			for (u32 i = 0; i < vertexCount; i++) {
+				u16 index = u16(indexBuffer[i]);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+		}
+
+		return {minimumIndex, maximumIndex};
+	}
+
+#ifdef PICA_SIMD_ARM64
+	template <bool useShortIndices>
+	std::pair<u16, u16> analyzeNEON(u8* indexBuffer, u32 vertexCount) {
+		// We process 16 bytes per iteration, which is 8 vertices if we're using u16 indices or 16 vertices if we're using u8 indices
+		constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16;
+
+		if (vertexCount < vertsPerLoop) {
+			return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
+		}
+
+		u16 minimumIndex, maximumIndex;
+
+		if constexpr (useShortIndices) {
+			// 16-bit indices
+			uint16x8_t minima = vdupq_n_u16(0xffff);
+			uint16x8_t maxima = vdupq_n_u16(0);
+
+			while (vertexCount >= vertsPerLoop) {
+				const uint16x8_t data = vld1q_u16(reinterpret_cast<u16*>(indexBuffer));
+				minima = vminq_u16(data, minima);
+				maxima = vmaxq_u16(data, maxima);
+
+				indexBuffer += 16;
+				vertexCount -= vertsPerLoop;
+			}
+
+			// Do horizontal min/max operations to get the actual minimum and maximum from all the vertices we processed with SIMD
+			// We want to gather the actual minimum and maximum in the line bottom lane of the minima/maxima vectors
+			// uint16x4_t foldedMinima1 = vmin_u16(vget_high_u16(minima), vget_low_u16(minima));
+			// uint16x4_t foldedMaxima1 = vmax_u16(vget_high_u16(maxima), vget_low_u16(maxima));
+
+			uint16x8_t foldedMinima1 = vpminq_u16(minima, minima);
+			uint16x8_t foldedMinima2 = vpminq_u16(foldedMinima1, foldedMinima1);
+			uint16x8_t foldedMinima3 = vpminq_u16(foldedMinima2, foldedMinima2);
+
+			uint16x8_t foldedMaxima1 = vpmaxq_u16(maxima, maxima);
+			uint16x8_t foldedMaxima2 = vpmaxq_u16(foldedMaxima1, foldedMaxima1);
+			uint16x8_t foldedMaxima3 = vpmaxq_u16(foldedMaxima2, foldedMaxima2);
+
+			minimumIndex = vgetq_lane_u16(foldedMinima3, 0);
+			maximumIndex = vgetq_lane_u16(foldedMaxima3, 0);
+		} else {
+			// 8-bit indices
+			uint8x16_t minima = vdupq_n_u8(0xff);
+			uint8x16_t maxima = vdupq_n_u8(0);
+
+			while (vertexCount >= vertsPerLoop) {
+				uint8x16_t data = vld1q_u8(indexBuffer);
+				minima = vminq_u8(data, minima);
+				maxima = vmaxq_u8(data, maxima);
+
+				indexBuffer += 16;
+				vertexCount -= vertsPerLoop;
+			}
+
+			// Do a similar horizontal min/max as in the u16 case, except now we're working uint8x16 instead of uint16x4 so we need 4 folds
+			uint8x16_t foldedMinima1 = vpminq_u8(minima, minima);
+			uint8x16_t foldedMinima2 = vpminq_u8(foldedMinima1, foldedMinima1);
+			uint8x16_t foldedMinima3 = vpminq_u8(foldedMinima2, foldedMinima2);
+			uint8x16_t foldedMinima4 = vpminq_u8(foldedMinima3, foldedMinima3);
+
+			uint8x16_t foldedMaxima1 = vpmaxq_u8(maxima, maxima);
+			uint8x16_t foldedMaxima2 = vpmaxq_u8(foldedMaxima1, foldedMaxima1);
+			uint8x16_t foldedMaxima3 = vpmaxq_u8(foldedMaxima2, foldedMaxima2);
+			uint8x16_t foldedMaxima4 = vpmaxq_u8(foldedMaxima3, foldedMaxima3);
+
+			minimumIndex = u16(vgetq_lane_u8(foldedMinima4, 0));
+			maximumIndex = u16(vgetq_lane_u8(foldedMaxima4, 0));
+		}
+
+		// If any indices could not be processed cause the buffer size is not 16-byte aligned, process them the naive way
+		// Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them
+		while (vertexCount > 0) {
+			if constexpr (useShortIndices) {
+				u16 index = *reinterpret_cast<u16*>(indexBuffer);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+				indexBuffer += 2;
+			} else {
+				u16 index = u16(*indexBuffer++);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+
+			vertexCount -= 1;
+		}
+
+		return {minimumIndex, maximumIndex};
+	}
+#endif
+
+#if defined(PICA_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
+	template <bool useShortIndices>
+	std::pair<u16, u16> analyzeSSE4_1(u8* indexBuffer, u32 vertexCount) {
+		// We process 16 bytes per iteration, which is 8 vertices if we're using u16
+		// indices or 16 vertices if we're using u8 indices
+		constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16;
+
+		if (vertexCount < vertsPerLoop) {
+			return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
+		}
+
+		u16 minimumIndex, maximumIndex;
+
+		if constexpr (useShortIndices) {
+			// Calculate the horizontal minimum/maximum value across an SSE vector of 16-bit unsigned integers.
+			// Based on https://stackoverflow.com/a/22259607
+			auto horizontalMin16 = [](__m128i vector) -> u16 { return u16(_mm_cvtsi128_si32(_mm_minpos_epu16(vector))); };
+
+			auto horizontalMax16 = [](__m128i vector) -> u16 {
+				// We have an instruction to compute horizontal minimum but not maximum, so we use it.
+				// To use it, we have to subtract each value from 0xFFFF (which we do with an xor), then execute a horizontal minimum
+				__m128i flipped = _mm_xor_si128(vector, _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu));
+				u16 min = u16(_mm_cvtsi128_si32(_mm_minpos_epu16(flipped)));
+				return u16(min ^ 0xffff);
+			};
+
+			// 16-bit indices
+			// Initialize the minima vector to all FFs (So 0xFFFF for each 16-bit lane)
+			// And the maxima vector to all 0s (0 for each 16-bit lane)
+			__m128i minima = _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu);
+			__m128i maxima = _mm_set_epi32(0, 0, 0, 0);
+
+			while (vertexCount >= vertsPerLoop) {
+				const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(indexBuffer));
+				minima = _mm_min_epu16(data, minima);
+				maxima = _mm_max_epu16(data, maxima);
+
+				indexBuffer += 16;
+				vertexCount -= vertsPerLoop;
+			}
+
+			minimumIndex = u16(horizontalMin16(minima));
+			maximumIndex = u16(horizontalMax16(maxima));
+		} else {
+			// Calculate the horizontal minimum/maximum value across an SSE vector of 8-bit unsigned integers.
+			// Based on https://stackoverflow.com/a/22259607
+			auto horizontalMin8 = [](__m128i vector) -> u8 {
+				vector = _mm_min_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(3, 2, 3, 2)));
+				vector = _mm_min_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(1, 1, 1, 1)));
+				vector = _mm_min_epu8(vector, _mm_shufflelo_epi16(vector, _MM_SHUFFLE(1, 1, 1, 1)));
+				vector = _mm_min_epu8(vector, _mm_srli_epi16(vector, 8));
+				return u8(_mm_cvtsi128_si32(vector));
+			};
+
+			auto horizontalMax8 = [](__m128i vector) -> u8 {
+				vector = _mm_max_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(3, 2, 3, 2)));
+				vector = _mm_max_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(1, 1, 1, 1)));
+				vector = _mm_max_epu8(vector, _mm_shufflelo_epi16(vector, _MM_SHUFFLE(1, 1, 1, 1)));
+				vector = _mm_max_epu8(vector, _mm_srli_epi16(vector, 8));
+				return u8(_mm_cvtsi128_si32(vector));
+			};
+
+			// 8-bit indices
+			// Initialize the minima vector to all FFs (So 0xFF for each 8-bit lane)
+			// And the maxima vector to all 0s (0 for each 8-bit lane)
+			__m128i minima = _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu);
+			__m128i maxima = _mm_set_epi32(0, 0, 0, 0);
+
+			while (vertexCount >= vertsPerLoop) {
+				const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(indexBuffer));
+				minima = _mm_min_epu8(data, minima);
+				maxima = _mm_max_epu8(data, maxima);
+
+				indexBuffer += 16;
+				vertexCount -= vertsPerLoop;
+			}
+
+			minimumIndex = u16(horizontalMin8(minima));
+			maximumIndex = u16(horizontalMax8(maxima));
+		}
+
+		// If any indices could not be processed cause the buffer size
+		// is not 16-byte aligned, process them the naive way
+		// Calculate the minimum and maximum indices used in the index
+		// buffer, so we'll only upload them
+		while (vertexCount > 0) {
+			if constexpr (useShortIndices) {
+				u16 index = *reinterpret_cast<u16*>(indexBuffer);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+				indexBuffer += 2;
+			} else {
+				u16 index = u16(*indexBuffer++);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+
+			vertexCount -= 1;
+		}
+
+		return {minimumIndex, maximumIndex};
+	}
+#endif
+
+	// Analyzes a PICA index buffer to get the minimum and maximum indices in the
+	// buffer, and returns them in a pair in the form [min, max]. Takes a template
+	// parameter to decide whether the indices in the buffer are u8 or u16
+	template <bool useShortIndices>
+	std::pair<u16, u16> analyze(u8* indexBuffer, u32 vertexCount) {
+#if defined(PICA_SIMD_ARM64)
+		return analyzeNEON<useShortIndices>(indexBuffer, vertexCount);
+#elif defined(PICA_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
+		// Annoyingly, MSVC refuses to define __SSE4_1__ even when we're building with AVX
+		return analyzeSSE4_1<useShortIndices>(indexBuffer, vertexCount);
+#else
+		return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
+#endif
+	}
+
+	// In some really unfortunate scenarios (eg Android Studio emulator), we don't have access to glDrawRangeElementsBaseVertex
+	// So we need to subtract the base vertex index from every index in the index buffer ourselves
+	// This is not really common, so we do it without SIMD for the moment, just to be able to run on Android Studio
+	template <bool useShortIndices>
+	void subtractBaseIndex(u8* indexBuffer, u32 indexCount, u16 baseIndex) {
+		// Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them
+		if constexpr (useShortIndices) {
+			u16* indexBuffer16 = reinterpret_cast<u16*>(indexBuffer);
+
+			for (u32 i = 0; i < indexCount; i++) {
+				indexBuffer16[i] -= baseIndex;
+			}
+		} else {
+			u8 baseIndex8 = u8(baseIndex);
+
+			for (u32 i = 0; i < indexCount; i++) {
+				indexBuffer[i] -= baseIndex8;
+			}
+		}
+	}
+}  // namespace PICA::IndexBuffer
--- a/include/PICA/pica_vert_config.hpp
+++ b/include/PICA/pica_vert_config.hpp
@ -0,0 +1,57 @@
+#pragma once
+#include <array>
+#include <cassert>
+#include <cstring>
+#include <type_traits>
+#include <unordered_map>
+
+#include "PICA/pica_hash.hpp"
+#include "PICA/regs.hpp"
+#include "PICA/shader.hpp"
+#include "bitfield.hpp"
+#include "helpers.hpp"
+
+namespace PICA {
+	// Configuration struct used
+	struct VertConfig {
+		PICAHash::HashType shaderHash;
+		PICAHash::HashType opdescHash;
+		u32 entrypoint;
+
+		// PICA registers for configuring shader output->fragment semantic mapping
+		std::array<u32, 7> outmaps{};
+		u16 outputMask;
+		u8 outputCount;
+		bool usingUbershader;
+
+		// Pad to 56 bytes so that the compiler won't insert unnecessary padding, which in turn will affect our unordered_map lookup
+		// As the padding will get hashed and memcmp'd...
+		u32 pad{};
+
+		bool operator==(const VertConfig& config) const {
+			// Hash function and equality operator required by std::unordered_map
+			return std::memcmp(this, &config, sizeof(VertConfig)) == 0;
+		}
+
+		VertConfig(PICAShader& shader, const std::array<u32, 0x300>& regs, bool usingUbershader) : usingUbershader(usingUbershader) {
+			shaderHash = shader.getCodeHash();
+			opdescHash = shader.getOpdescHash();
+			entrypoint = shader.entrypoint;
+
+			outputCount = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
+			outputMask = regs[PICA::InternalRegs::VertexShaderOutputMask];
+			for (int i = 0; i < outputCount; i++) {
+				// Mask out unused bits
+				outmaps[i] = regs[PICA::InternalRegs::ShaderOutmap0 + i] & 0x1F1F1F1F;
+			}
+		}
+	};
+}  // namespace PICA
+
+static_assert(sizeof(PICA::VertConfig) == 56);
+
+// Override std::hash for our vertex config class
+template <>
+struct std::hash<PICA::VertConfig> {
+	std::size_t operator()(const PICA::VertConfig& config) const noexcept { return PICAHash::computeHash((const char*)&config, sizeof(config)); }
+};
--- a/include/PICA/regs.hpp
+++ b/include/PICA/regs.hpp
@ -51,6 +51,18 @@ namespace PICA {
 			#undef defineTexEnv
 			// clang-format on

+			// Fog registers
+			FogColor = 0xE1,
+			FogLUTIndex = 0xE6,
+			FogLUTData0 = 0xE8,
+			FogLUTData1 = 0xE9,
+			FogLUTData2 = 0xEA,
+			FogLUTData3 = 0xEB,
+			FogLUTData4 = 0xEC,
+			FogLUTData5 = 0xED,
+			FogLUTData6 = 0xEE,
+			FogLUTData7 = 0xEF,
+
 			// Framebuffer registers
 			ColourOperation = 0x100,
 			BlendFunc = 0x101,
@ -67,7 +79,29 @@ namespace PICA {
 			ColourBufferLoc = 0x11D,
 			FramebufferSize = 0x11E,

-			//LightingRegs
+			// Lighting registers
+			LightingEnable = 0x8F,
+			Light0Specular0 = 0x140,
+			Light0Specular1 = 0x141,
+			Light0Diffuse = 0x142,
+			Light0Ambient = 0x143,
+			Light0XY = 0x144,
+			Light0Z = 0x145,
+			Light0SpotlightXY = 0x146,
+			Light0SpotlightZ = 0x147,
+			Light0Config = 0x149,
+			Light0AttenuationBias = 0x14A,
+			Light0AttenuationScale = 0x14B,
+
+			LightGlobalAmbient = 0x1C0,
+			LightNumber = 0x1C2,
+			LightConfig0 = 0x1C3,
+			LightConfig1 = 0x1C4,
+			LightPermutation = 0x1D9,
+			LightLUTAbs = 0x1D0,
+			LightLUTSelect = 0x1D1,
+			LightLUTScale = 0x1D2,
+
 			LightingLUTIndex =  0x01C5,
 			LightingLUTData0 =  0x01C8,
 			LightingLUTData1 =  0x01C9,
@ -231,7 +265,8 @@ namespace PICA {
 		enum : u32 {
 			LUT_D0 = 0,
 			LUT_D1,
-			LUT_FR,
+			// LUT 2 is not used, the emulator internally uses it for referring to the current source's spotlight in shaders
+			LUT_FR = 0x3,
 			LUT_RB,
 			LUT_RG,
 			LUT_RR,
@ -255,6 +290,11 @@ namespace PICA {
 		};
 	}

+	// There's actually 8 different LUTs (SP0-SP7), one for each light with different indices (8-15)
+	// We use an unused LUT value for "this light source's spotlight" instead and figure out which light source to use in compileLutLookup
+	// This is particularly intuitive in several places, such as checking if a LUT is enabled
+	static constexpr int spotlightLutIndex = 2;
+
 	enum class TextureFmt : u32 {
 		RGBA8 = 0x0,
 		RGB8 = 0x1,
@ -345,4 +385,156 @@ namespace PICA {
 		GeometryPrimitive = 3,
 	};

+	enum class CompareFunction : u32 {
+		Never = 0,
+		Always = 1,
+		Equal = 2,
+		NotEqual = 3,
+		Less = 4,
+		LessOrEqual = 5,
+		Greater = 6,
+		GreaterOrEqual = 7,
+	};
+
+	enum class LogicOpMode : u32 {
+		Clear = 0,
+		And = 1,
+		ReverseAnd = 2,
+		Copy = 3,
+		Set = 4,
+		InvertedCopy = 5,
+		Nop = 6,
+		Invert = 7,
+		Nand = 8,
+		Or = 9,
+		Nor = 10,
+		Xor = 11,
+		Equiv = 12,
+		InvertedAnd = 13,
+		ReverseOr = 14,
+		InvertedOr = 15,
+	};
+
+	enum class FogMode : u32 {
+		Disabled = 0,
+		Fog = 5,
+		Gas = 7,
+	};
+
+	struct TexEnvConfig {
+		enum class Source : u8 {
+			PrimaryColor = 0x0,
+			PrimaryFragmentColor = 0x1,
+			SecondaryFragmentColor = 0x2,
+			Texture0 = 0x3,
+			Texture1 = 0x4,
+			Texture2 = 0x5,
+			Texture3 = 0x6,
+			// TODO: Inbetween values are unknown
+			PreviousBuffer = 0xD,
+			Constant = 0xE,
+			Previous = 0xF,
+		};
+
+		enum class ColorOperand : u8 {
+			SourceColor = 0x0,
+			OneMinusSourceColor = 0x1,
+			SourceAlpha = 0x2,
+			OneMinusSourceAlpha = 0x3,
+			SourceRed = 0x4,
+			OneMinusSourceRed = 0x5,
+			// TODO: Inbetween values are unknown
+			SourceGreen = 0x8,
+			OneMinusSourceGreen = 0x9,
+			// Inbetween values are unknown
+			SourceBlue = 0xC,
+			OneMinusSourceBlue = 0xD,
+		};
+
+		enum class AlphaOperand : u8 {
+			SourceAlpha = 0x0,
+			OneMinusSourceAlpha = 0x1,
+			SourceRed = 0x2,
+			OneMinusSourceRed = 0x3,
+			SourceGreen = 0x4,
+			OneMinusSourceGreen = 0x5,
+			SourceBlue = 0x6,
+			OneMinusSourceBlue = 0x7,
+		};
+
+		enum class Operation : u8 {
+			Replace = 0,
+			Modulate = 1,
+			Add = 2,
+			AddSigned = 3,
+			Lerp = 4,
+			Subtract = 5,
+			Dot3RGB = 6,
+			Dot3RGBA = 7,
+			MultiplyAdd = 8,
+			AddMultiply = 9,
+		};
+
+		// RGB sources
+		Source colorSource1, colorSource2, colorSource3;
+		// Alpha sources
+		Source alphaSource1, alphaSource2, alphaSource3;
+
+		// RGB operands
+		ColorOperand colorOperand1, colorOperand2, colorOperand3;
+		// Alpha operands
+		AlphaOperand alphaOperand1, alphaOperand2, alphaOperand3;
+
+		// Texture environment operations for this stage
+		Operation colorOp, alphaOp;
+
+		u32 constColor;
+
+	  private:
+		// These are the only private members since their value doesn't actually reflect the scale
+		// So we make them public so we'll always use the appropriate member functions instead
+		u8 colorScale;
+		u8 alphaScale;
+
+	  public:
+		// Create texture environment object from TEV registers
+		TexEnvConfig(u32 source, u32 operand, u32 combiner, u32 color, u32 scale) : constColor(color) {
+			colorSource1 = Helpers::getBits<0, 4, Source>(source);
+			colorSource2 = Helpers::getBits<4, 4, Source>(source);
+			colorSource3 = Helpers::getBits<8, 4, Source>(source);
+
+			alphaSource1 = Helpers::getBits<16, 4, Source>(source);
+			alphaSource2 = Helpers::getBits<20, 4, Source>(source);
+			alphaSource3 = Helpers::getBits<24, 4, Source>(source);
+
+			colorOperand1 = Helpers::getBits<0, 4, ColorOperand>(operand);
+			colorOperand2 = Helpers::getBits<4, 4, ColorOperand>(operand);
+			colorOperand3 = Helpers::getBits<8, 4, ColorOperand>(operand);
+
+			alphaOperand1 = Helpers::getBits<12, 3, AlphaOperand>(operand);
+			alphaOperand2 = Helpers::getBits<16, 3, AlphaOperand>(operand);
+			alphaOperand3 = Helpers::getBits<20, 3, AlphaOperand>(operand);
+
+			colorOp = Helpers::getBits<0, 4, Operation>(combiner);
+			alphaOp = Helpers::getBits<16, 4, Operation>(combiner);
+
+			colorScale = Helpers::getBits<0, 2>(scale);
+			alphaScale = Helpers::getBits<16, 2>(scale);
+		}
+
+		u32 getColorScale() { return (colorScale <= 2) ? (1 << colorScale) : 1; }
+		u32 getAlphaScale() { return (alphaScale <= 2) ? (1 << alphaScale) : 1; }
+
+		bool isPassthroughStage() {
+			// clang-format off
+			// Thank you to the Citra dev that wrote this out
+			return (
+				colorOp == Operation::Replace && alphaOp == Operation::Replace &&
+				colorSource1 == Source::Previous && alphaSource1 == Source::Previous &&
+				colorOperand1 == ColorOperand::SourceColor && alphaOperand1 == AlphaOperand::SourceAlpha &&
+				getColorScale() == 1 && getAlphaScale() == 1
+			);
+			// clang-format on
+		}
+	};
 }  // namespace PICA
--- a/include/PICA/shader.hpp
+++ b/include/PICA/shader.hpp
@ -1,6 +1,8 @@
 #pragma once
 #include <algorithm>
 #include <array>
+#include <cassert>
+#include <cstddef>
 #include <cstring>

 #include "PICA/float_types.hpp"
@ -21,7 +23,7 @@ namespace ShaderOpcodes {
 		DST = 0x04,
 		EX2 = 0x05,
 		LG2 = 0x06,
-		LIT = 0x07,
+		LITP = 0x07,
 		MUL = 0x08,
 		SGE = 0x09,
 		SLT = 0x0A,
@ -56,6 +58,10 @@ namespace ShaderOpcodes {
 	};
 }

+namespace PICA::ShaderGen {
+	class ShaderDecompiler;
+};
+
 // Note: All PICA f24 vec4 registers must have the alignas(16) specifier to make them easier to access in SSE/NEON code in the JIT
 class PICAShader {
 	using f24 = Floats::f24;
@ -90,14 +96,22 @@ class PICAShader {
  public:
 	// These are placed close to the temp registers and co because it helps the JIT generate better code
 	u32 entrypoint = 0;  // Initial shader PC
-	u32 boolUniform;
-	std::array<std::array<u8, 4>, 4> intUniforms;
+
+	// We want these registers in this order & with this alignment for uploading them directly to a UBO
+	// When emulating shaders on the GPU. Plus this alignment for float uniforms is necessary for doing SIMD in the shader->CPU recompilers.
 	alignas(16) std::array<vec4f, 96> floatUniforms;
+	alignas(16) std::array<std::array<u8, 4>, 4> intUniforms;
+	u32 boolUniform;

 	alignas(16) std::array<vec4f, 16> fixedAttributes;  // Fixed vertex attributes
 	alignas(16) std::array<vec4f, 16> inputs;           // Attributes passed to the shader
 	alignas(16) std::array<vec4f, 16> outputs;
 	alignas(16) vec4f dummy = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()});  // Dummy register used by the JIT
+	
+	// We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
+	// We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
+	// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
+	using Hash = PICAHash::HashType;

  protected:
 	std::array<u32, 128> operandDescriptors;
@ -116,20 +130,20 @@ class PICAShader {
 	std::array<CallInfo, 4> callInfo;
 	ShaderType type;

-	// We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
-	// We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
-	// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
-	using Hash = PICAHash::HashType;
-
 	Hash lastCodeHash = 0;    // Last hash computed for the shader code (Used for the JIT caching mechanism)
 	Hash lastOpdescHash = 0;  // Last hash computed for the operand descriptors (Also used for the JIT)

+  public:
+	bool uniformsDirty = false;
+
+  protected:
 	bool codeHashDirty = false;
 	bool opdescHashDirty = false;

 	// Add these as friend classes for the JIT so it has access to all important state
 	friend class ShaderJIT;
 	friend class ShaderEmitter;
+	friend class PICA::ShaderGen::ShaderDecompiler;

 	vec4f getSource(u32 source);
 	vec4f& getDest(u32 dest);
@ -151,6 +165,7 @@ class PICAShader {
 	void jmpc(u32 instruction);
 	void jmpu(u32 instruction);
 	void lg2(u32 instruction);
+	void litp(u32 instruction);
 	void loop(u32 instruction);
 	void mad(u32 instruction);
 	void madi(u32 instruction);
@ -220,13 +235,9 @@ class PICAShader {
  public:
 	static constexpr size_t maxInstructionCount = 4096;
 	std::array<u32, maxInstructionCount> loadedShader;    // Currently loaded & active shader
-	std::array<u32, maxInstructionCount> bufferedShader;  // Shader to be transferred when the SH_CODETRANSFER_END reg gets written to

 	PICAShader(ShaderType type) : type(type) {}

-	// Theese functions are in the header to be inlined more easily, though with LTO I hope I'll be able to move them
-	void finalize() { std::memcpy(&loadedShader[0], &bufferedShader[0], 4096 * sizeof(u32)); }
-
 	void setBufferIndex(u32 index) { bufferIndex = index & 0xfff; }
 	void setOpDescriptorIndex(u32 index) { opDescriptorIndex = index & 0x7f; }

@ -235,7 +246,7 @@ class PICAShader {
 			Helpers::panic("o no, shader upload overflew");
 		}

-		bufferedShader[bufferIndex++] = word;
+		loadedShader[bufferIndex++] = word;
 		bufferIndex &= 0xfff;

 		codeHashDirty = true;  // Signal the JIT if necessary that the program hash has potentially changed
@ -256,14 +267,16 @@ class PICAShader {

 	void uploadFloatUniform(u32 word) {
 		floatUniformBuffer[floatUniformWordCount++] = word;
-		if (floatUniformIndex >= 96) {
-			Helpers::panic("[PICA] Tried to write float uniform %d", floatUniformIndex);
-		}

 		if ((f32UniformTransfer && floatUniformWordCount >= 4) || (!f32UniformTransfer && floatUniformWordCount >= 3)) {
-			vec4f& uniform = floatUniforms[floatUniformIndex++];
 			floatUniformWordCount = 0;

+			// Check if the program tries to upload to a non-existent uniform, and empty the queue without writing in that case
+			if (floatUniformIndex >= 96) [[unlikely]] {
+				return;
+			}
+			vec4f& uniform = floatUniforms[floatUniformIndex++];
+
 			if (f32UniformTransfer) {
 				uniform[0] = f24::fromFloat32(*(float*)&floatUniformBuffer[3]);
 				uniform[1] = f24::fromFloat32(*(float*)&floatUniformBuffer[2]);
@ -275,6 +288,7 @@ class PICAShader {
 				uniform[2] = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16));
 				uniform[3] = f24::fromRaw(floatUniformBuffer[0] >> 8);
 			}
+			uniformsDirty = true;
 		}
 	}

@ -286,6 +300,12 @@ class PICAShader {
 		u[1] = getBits<8, 8>(word);
 		u[2] = getBits<16, 8>(word);
 		u[3] = getBits<24, 8>(word);
+		uniformsDirty = true;
+	}
+
+	void uploadBoolUniform(u32 value) {
+		boolUniform = value;
+		uniformsDirty = true;
 	}

 	void run();
@ -293,4 +313,13 @@ class PICAShader {

 	Hash getCodeHash();
 	Hash getOpdescHash();
-};
+
+	// Returns how big the PICA uniforms are combined. Used for hw accelerated shaders where we upload the uniforms to our GPU.
+	static constexpr usize totalUniformSize() { return sizeof(floatUniforms) + sizeof(intUniforms) + sizeof(boolUniform); }
+	void* getUniformPointer() { return static_cast<void*>(&floatUniforms); }
+};
+
+static_assert(
+	offsetof(PICAShader, intUniforms) == offsetof(PICAShader, floatUniforms) + 96 * sizeof(float) * 4 &&
+	offsetof(PICAShader, boolUniform) == offsetof(PICAShader, intUniforms) + 4 * sizeof(u8) * 4
+);
--- a/include/PICA/shader_decompiler.hpp
+++ b/include/PICA/shader_decompiler.hpp
@ -0,0 +1,131 @@
+#pragma once
+#include <fmt/format.h>
+
+#include <map>
+#include <set>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "PICA/shader.hpp"
+#include "PICA/shader_gen_types.hpp"
+
+struct EmulatorConfig;
+
+namespace PICA::ShaderGen {
+	// Control flow analysis is partially based on
+	// https://github.com/PabloMK7/citra/blob/d0179559466ff09731d74474322ee880fbb44b00/src/video_core/shader/generator/glsl_shader_decompiler.cpp#L33
+	struct ControlFlow {
+		// A continuous range of addresses
+		struct AddressRange {
+			u32 start, end;
+			AddressRange(u32 start, u32 end) : start(start), end(end) {}
+
+			// Use lexicographic comparison for functions in order to sort them in a set
+			bool operator<(const AddressRange& other) const { return std::tie(start, end) < std::tie(other.start, other.end); }
+		};
+
+		struct Function {
+			using Labels = std::set<u32>;
+
+			enum class ExitMode {
+				Unknown,       // Can't guarantee whether we'll exit properly, fall back to CPU shaders (can happen with jmp shenanigans)
+				AlwaysReturn,  // All paths reach the return point.
+				Conditional,   // One or more code paths reach the return point or an END instruction conditionally.
+				AlwaysEnd,     // All paths reach an END instruction.
+			};
+
+			u32 start;           // Starting PC of the function
+			u32 end;             // End PC of the function
+			Labels outLabels{};  // Labels this function can "goto" (jump) to
+			ExitMode exitMode = ExitMode::Unknown;
+
+			explicit Function(u32 start, u32 end) : start(start), end(end) {}
+			bool operator<(const Function& other) const { return AddressRange(start, end) < AddressRange(other.start, other.end); }
+
+			std::string getIdentifier() const { return fmt::format("fn_{}_{}", start, end); }
+			// To handle weird control flow, we have to return from each function a bool that indicates whether or not the shader reached an end
+			// instruction and should thus terminate. This is necessary for games like Rayman and Gravity Falls, which have "END" instructions called
+			// from within functions deep in the callstack
+			std::string getForwardDecl() const { return fmt::format("bool fn_{}_{}();\n", start, end); }
+			std::string getCallStatement() const { return fmt::format("fn_{}_{}()", start, end); }
+		};
+
+		std::set<Function> functions{};
+		std::map<AddressRange, Function::ExitMode> exitMap{};
+
+		// Tells us whether analysis of the shader we're trying to compile failed, in which case we'll need to fail back to shader emulation
+		// On the CPU
+		bool analysisFailed = false;
+
+		// This will recursively add all functions called by the function too, as analyzeFunction will call addFunction on control flow instructions
+		const Function* addFunction(const PICAShader& shader, u32 start, u32 end) {
+			auto searchIterator = functions.find(Function(start, end));
+			if (searchIterator != functions.end()) {
+				return &(*searchIterator);
+			}
+
+			// Add this function and analyze it if it doesn't already exist
+			Function function(start, end);
+			function.exitMode = analyzeFunction(shader, start, end, function.outLabels);
+			
+			// This function could not be fully analyzed, report failure
+			if (function.exitMode == Function::ExitMode::Unknown) {
+				analysisFailed = true;
+				return nullptr;
+			}
+
+			// Add function to our function list
+			auto [it, added] = functions.insert(std::move(function));
+			return &(*it);
+		}
+
+		void analyze(const PICAShader& shader, u32 entrypoint);
+		Function::ExitMode analyzeFunction(const PICAShader& shader, u32 start, u32 end, Function::Labels& labels);
+	};
+
+	class ShaderDecompiler {
+		using AddressRange = ControlFlow::AddressRange;
+		using Function = ControlFlow::Function;
+
+		ControlFlow controlFlow{};
+
+		PICAShader& shader;
+		EmulatorConfig& config;
+		std::string decompiledShader;
+
+		u32 entrypoint;
+
+		API api;
+		Language language;
+		bool compilationError = false;
+
+		void compileInstruction(u32& pc, bool& finished);
+		// Compile range "range" and returns the end PC or if we're "finished" with the program (called an END instruction)
+		std::pair<u32, bool> compileRange(const AddressRange& range);
+		void callFunction(const Function& function);
+		const Function* findFunction(const AddressRange& range);
+
+		void writeAttributes();
+
+		std::string getSource(u32 source, u32 index) const;
+		std::string getDest(u32 dest) const;
+		std::string getSwizzlePattern(u32 swizzle) const;
+		std::string getDestSwizzle(u32 destinationMask) const;
+		const char* getCondition(u32 cond, u32 refX, u32 refY);
+
+		void setDest(u32 operandDescriptor, const std::string& dest, const std::string& value);
+		// Returns if the instruction uses the typical register encodings most instructions use
+		// With some exceptions like MAD/MADI, and the control flow instructions which are completely different
+		bool usesCommonEncoding(u32 instruction) const;
+
+	  public:
+		ShaderDecompiler(PICAShader& shader, EmulatorConfig& config, u32 entrypoint, API api, Language language)
+			: shader(shader), entrypoint(entrypoint), config(config), api(api), language(language), decompiledShader("") {}
+
+		std::string decompile();
+	};
+
+	std::string decompileShader(PICAShader& shader, EmulatorConfig& config, u32 entrypoint, API api, Language language);
+}  // namespace PICA::ShaderGen
--- a/include/PICA/shader_gen.hpp
+++ b/include/PICA/shader_gen.hpp
@ -0,0 +1,43 @@
+#pragma once
+#include <string>
+
+#include "PICA/gpu.hpp"
+#include "PICA/pica_frag_config.hpp"
+#include "PICA/pica_vert_config.hpp"
+#include "PICA/regs.hpp"
+#include "PICA/shader_gen_types.hpp"
+#include "helpers.hpp"
+
+namespace PICA::ShaderGen {
+	class FragmentGenerator {
+		API api;
+		Language language;
+
+		void compileTEV(std::string& shader, int stage, const PICA::FragmentConfig& config);
+		void getSource(std::string& shader, PICA::TexEnvConfig::Source source, int index, const PICA::FragmentConfig& config);
+		void getColorOperand(std::string& shader, PICA::TexEnvConfig::Source source, PICA::TexEnvConfig::ColorOperand color, int index, const PICA::FragmentConfig& config);
+		void getAlphaOperand(std::string& shader, PICA::TexEnvConfig::Source source, PICA::TexEnvConfig::AlphaOperand alpha, int index, const PICA::FragmentConfig& config);
+		void getColorOperation(std::string& shader, PICA::TexEnvConfig::Operation op);
+		void getAlphaOperation(std::string& shader, PICA::TexEnvConfig::Operation op);
+
+		void applyAlphaTest(std::string& shader, const PICA::FragmentConfig& config);
+		void compileLights(std::string& shader, const PICA::FragmentConfig& config);
+		void compileLUTLookup(std::string& shader, const PICA::FragmentConfig& config, u32 lightIndex, u32 lutID);
+		bool isSamplerEnabled(u32 environmentID, u32 lutID);
+
+		void compileFog(std::string& shader, const PICA::FragmentConfig& config);
+		void compileLogicOps(std::string& shader, const PICA::FragmentConfig& config);
+
+	  public:
+		FragmentGenerator(API api, Language language) : api(api), language(language) {}
+		std::string generate(const PICA::FragmentConfig& config, void* driverInfo = nullptr);
+		std::string getDefaultVertexShader();
+		// For when PICA shader is acceleration is enabled. Turn the PICA shader source into a proper vertex shader
+		std::string getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader);
+
+		void setTarget(API api, Language language) {
+			this->api = api;
+			this->language = language;
+		}
+	};
+};  // namespace PICA::ShaderGen
--- a/include/PICA/shader_gen_types.hpp
+++ b/include/PICA/shader_gen_types.hpp
@ -0,0 +1,9 @@
+#pragma once
+
+namespace PICA::ShaderGen {
+	// Graphics API this shader is targetting
+	enum class API { GL, GLES, Vulkan };
+
+	// Shading language to use (Only GLSL for the time being)
+	enum class Language { GLSL };
+}  // namespace PICA::ShaderGen
--- a/include/PICA/shader_unit.hpp
+++ b/include/PICA/shader_unit.hpp
@ -2,10 +2,9 @@
 #include "PICA/shader.hpp"

 class ShaderUnit {
-
-public:
-	PICAShader vs; // Vertex shader
-	PICAShader gs; // Geometry shader
+  public:
+	PICAShader vs;  // Vertex shader
+	PICAShader gs;  // Geometry shader

 	ShaderUnit() : vs(ShaderType::Vertex), gs(ShaderType::Geometry) {}
 	void reset();
--- a/include/align.hpp
+++ b/include/align.hpp
@ -0,0 +1,99 @@
+// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#pragma once
+
+#include <cstdlib>
+
+#include "helpers.hpp"
+
+#ifdef _WIN32
+#include <malloc.h>
+#endif
+
+namespace Common {
+	template <typename T>
+	constexpr bool isAligned(T value, unsigned int alignment) {
+		return (value % static_cast<T>(alignment)) == 0;
+	}
+
+	template <typename T>
+	constexpr T alignUp(T value, unsigned int alignment) {
+		return (value + static_cast<T>(alignment - 1)) / static_cast<T>(alignment) * static_cast<T>(alignment);
+	}
+
+	template <typename T>
+	constexpr T alignDown(T value, unsigned int alignment) {
+		return value / static_cast<T>(alignment) * static_cast<T>(alignment);
+	}
+    
+	template <typename T>
+	constexpr bool isAlignedPow2(T value, unsigned int alignment) {
+		return (value & static_cast<T>(alignment - 1)) == 0;
+	}
+
+	template <typename T>
+	constexpr T alignUpPow2(T value, unsigned int alignment) {
+		return (value + static_cast<T>(alignment - 1)) & static_cast<T>(~static_cast<T>(alignment - 1));
+	}
+
+	template <typename T>
+	constexpr T alignDownPow2(T value, unsigned int alignment) {
+		return value & static_cast<T>(~static_cast<T>(alignment - 1));
+	}
+
+	template <typename T>
+	constexpr bool isPow2(T value) {
+		return (value & (value - 1)) == 0;
+	}
+
+	template <typename T>
+	constexpr T previousPow2(T value) {
+		if (value == static_cast<T>(0)) return 0;
+
+		value |= (value >> 1);
+		value |= (value >> 2);
+		value |= (value >> 4);
+		if constexpr (sizeof(T) >= 16) value |= (value >> 8);
+		if constexpr (sizeof(T) >= 32) value |= (value >> 16);
+		if constexpr (sizeof(T) >= 64) value |= (value >> 32);
+		return value - (value >> 1);
+	}
+    
+	template <typename T>
+	constexpr T nextPow2(T value) {
+		// https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+		if (value == static_cast<T>(0)) return 0;
+
+		value--;
+		value |= (value >> 1);
+		value |= (value >> 2);
+		value |= (value >> 4);
+		if constexpr (sizeof(T) >= 16) value |= (value >> 8);
+		if constexpr (sizeof(T) >= 32) value |= (value >> 16);
+		if constexpr (sizeof(T) >= 64) value |= (value >> 32);
+		value++;
+		return value;
+	}
+
+	ALWAYS_INLINE static void* alignedMalloc(size_t size, size_t alignment) {
+#ifdef _WIN32
+		return _aligned_malloc(size, alignment);
+#else
+		// Unaligned sizes are slow on macOS.
+#ifdef __APPLE__
+		if (isPow2(alignment)) size = (size + alignment - 1) & ~(alignment - 1);
+#endif
+		void* ret = nullptr;
+		return (posix_memalign(&ret, alignment, size) == 0) ? ret : nullptr;
+#endif
+	}
+
+	ALWAYS_INLINE static void alignedFree(void* ptr) {
+#ifdef _MSC_VER
+		_aligned_free(ptr);
+#else
+		free(ptr);
+#endif
+	}
+}  // namespace Common
--- a/include/audio/aac.hpp
+++ b/include/audio/aac.hpp
@ -0,0 +1,82 @@
+#pragma once
+#include <array>
+#include <type_traits>
+
+#include "helpers.hpp"
+#include "swap.hpp"
+
+namespace Audio::AAC {
+	namespace ResultCode {
+		enum : u32 {
+			Success = 0,
+		};
+	}
+
+	// Enum values and struct definitions based off Citra
+	namespace Command {
+		enum : u16 {
+			Init = 0,          // Initialize encoder/decoder
+			EncodeDecode = 1,  // Encode/Decode AAC
+			Shutdown = 2,      // Shutdown encoder/decoder
+			LoadState = 3,
+			SaveState = 4,
+		};
+	}
+
+	namespace SampleRate {
+		enum : u32 {
+			Rate48000 = 0,
+			Rate44100 = 1,
+			Rate32000 = 2,
+			Rate24000 = 3,
+			Rate22050 = 4,
+			Rate16000 = 5,
+			Rate12000 = 6,
+			Rate11025 = 7,
+			Rate8000 = 8,
+		};
+	}
+
+	namespace Mode {
+		enum : u16 {
+			None = 0,
+			Decode = 1,
+			Encode = 2,
+		};
+	}
+
+	struct DecodeResponse {
+		u32_le sampleRate;
+		u32_le channelCount;
+		u32_le size;
+		u32_le unknown1;
+		u32_le unknown2;
+		u32_le sampleCount;
+	};
+
+	struct DecodeRequest {
+		u32_le address;        // Address of input AAC stream
+		u32_le size;           // Size of input AAC stream
+		u32_le destAddrLeft;   // Output address for left channel samples
+		u32_le destAddrRight;  // Output address for right channel samples
+		u32_le unknown1;
+		u32_le unknown2;
+	};
+
+	struct Message {
+		u16_le mode = Mode::None;  // Encode or decode AAC?
+		u16_le command = Command::Init;
+		u32_le resultCode = ResultCode::Success;
+
+		// Info on the AAC request
+		union {
+			std::array<u8, 24> commandData{};
+
+			DecodeResponse decodeResponse;
+			DecodeRequest decodeRequest;
+		};
+	};
+
+	static_assert(sizeof(Message) == 32);
+	static_assert(std::is_trivially_copyable<Message>());
+}  // namespace Audio::AAC
--- a/include/audio/aac_decoder.hpp
+++ b/include/audio/aac_decoder.hpp
@ -0,0 +1,24 @@
+#pragma once
+#include <functional>
+
+#include "audio/aac.hpp"
+#include "helpers.hpp"
+
+struct AAC_DECODER_INSTANCE;
+
+namespace Audio::AAC {
+	class Decoder {
+		using DecoderHandle = AAC_DECODER_INSTANCE*;
+		using PaddrCallback = std::function<u8*(u32)>;
+
+		DecoderHandle decoderHandle = nullptr;
+
+		bool isInitialized() { return decoderHandle != nullptr; }
+		void initialize();
+
+	  public:
+		// Decode function. Takes in a reference to the AAC response & request, and a callback for paddr -> pointer conversions
+		void decode(AAC::Message& response, const AAC::Message& request, PaddrCallback paddrCallback);
+		~Decoder();
+	};
+}  // namespace Audio::AAC
--- a/include/audio/dsp_core.hpp
+++ b/include/audio/dsp_core.hpp
@ -43,7 +43,7 @@ namespace Audio {
 		virtual ~DSPCore() {}

 		virtual void reset() = 0;
-		virtual void runAudioFrame() = 0;
+		virtual void runAudioFrame(u64 eventTimestamp) = 0;
 		virtual u8* getDspMemory() = 0;

 		virtual u16 recvData(u32 regId) = 0;
--- a/include/audio/dsp_shared_mem.hpp
+++ b/include/audio/dsp_shared_mem.hpp
@ -294,12 +294,12 @@ namespace Audio::HLE {

 	struct SourceStatus {
 		struct Status {
-			u8 isEnabled;             ///< Is this channel enabled? (Doesn't have to be playing anything.)
+			u8 enabled;               ///< Is this channel enabled? (Doesn't have to be playing anything.)
 			u8 currentBufferIDDirty;  ///< Non-zero when current_buffer_id changes
 			u16_le syncCount;         ///< Is set by the DSP to the value of SourceConfiguration::sync_count
-			u32_dsp bufferPosition;   ///< Number of samples into the current buffer
+			u32_dsp samplePosition;   ///< Number of samples into the current buffer
 			u16_le currentBufferID;   ///< Updated when a buffer finishes playing
-			u16_le lastBufferID;      ///< Updated when all buffers in the queue finish playing
+			u16_le previousBufferID;  ///< Updated when all buffers in the queue finish playing
 		};

 		Status status[sourceCount];
@ -324,8 +324,8 @@ namespace Audio::HLE {
 			BitField<15, 1, u32> outputBufferCountDirty;
 			BitField<16, 1, u32> masterVolumeDirty;

-			BitField<24, 1, u32> auxReturnVolume0Dirty;
-			BitField<25, 1, u32> auxReturnVolume1Dirty;
+			BitField<24, 1, u32> auxVolume0Dirty;
+			BitField<25, 1, u32> auxVolume1Dirty;
 			BitField<26, 1, u32> outputFormatDirty;
 			BitField<27, 1, u32> clippingModeDirty;
 			BitField<28, 1, u32> headphonesConnectedDirty;
@ -337,7 +337,7 @@ namespace Audio::HLE {
 		/// The DSP has three intermediate audio mixers. This controls the volume level (0.0-1.0) for
 		/// each at the final mixer.
 		float_le masterVolume;
-		std::array<float_le, 2> auxReturnVolume;
+		std::array<float_le, 2> auxVolumes;

 		u16_le outputBufferCount;
 		u16 pad1[2];
@ -422,7 +422,7 @@ namespace Audio::HLE {

 	struct DspStatus {
 		u16_le unknown;
-		u16_le dropped_frames;
+		u16_le droppedFrames;
 		u16 pad0[0xE];
 	};
 	ASSERT_DSP_STRUCT(DspStatus, 32);
--- a/include/audio/hle_core.hpp
+++ b/include/audio/hle_core.hpp
@ -2,9 +2,12 @@
 #include <array>
 #include <cassert>
 #include <deque>
+#include <memory>
 #include <queue>
 #include <vector>

+#include "audio/aac.hpp"
+#include "audio/aac_decoder.hpp"
 #include "audio/dsp_core.hpp"
 #include "audio/dsp_shared_mem.hpp"
 #include "memory.hpp"
@ -41,15 +44,25 @@ namespace Audio {
 				return this->bufferID > other.bufferID;
 			}
 		};
+
 		// Buffer of decoded PCM16 samples. TODO: Are there better alternatives to use over deque?
 		using SampleBuffer = std::deque<std::array<s16, 2>>;

 		using BufferQueue = std::priority_queue<Buffer>;
 		BufferQueue buffers;

+		SampleFormat sampleFormat = SampleFormat::ADPCM;
+		SourceType sourceType = SourceType::Stereo;
+
 		std::array<float, 3> gain0, gain1, gain2;
+		u32 samplePosition;  // Sample number into the current audio buffer
+		float rateMultiplier;
 		u16 syncCount;
-		bool enabled;  // Is the source enabled?
+		u16 currentBufferID;
+		u16 previousBufferID;
+
+		bool enabled;                  // Is the source enabled?
+		bool isBufferIDDirty = false;  // Did we change buffers?

 		// ADPCM decoding info:
 		// An array of fixed point S5.11 coefficients. These provide "weights" for the history samples
@ -65,6 +78,10 @@ namespace Audio {
 		int index = 0;  // Index of the voice in [0, 23] for debugging

 		void reset();
+
+		// Push a buffer to the buffer queue
+		void pushBuffer(const Buffer& buffer) { buffers.push(buffer); }
+
 		// Pop a buffer from the buffer queue and return it
 		Buffer popBuffer() {
 			assert(!buffers.empty());
@ -78,8 +95,7 @@ namespace Audio {
 		DSPSource() { reset(); }
 	};

-	class HLE_DSP : public DSPCore {
-		// The audio frame types are public in case we want to use them for unit tests
+	class DSPMixer {
 	  public:
 		template <typename T, usize channelCount = 1>
 		using Sample = std::array<T, channelCount>;
@ -96,6 +112,43 @@ namespace Audio {
 		template <typename T>
 		using QuadFrame = Frame<T, 4>;

+	  private:
+		using ChannelFormat = HLE::DspConfiguration::OutputFormat;
+		// The audio from each DSP voice is converted to quadraphonic and then fed into 3 intermediate mixing stages
+		// Two of these intermediate mixers (second and third) are used for effects, including custom effects done on the CPU
+		static constexpr usize mixerStageCount = 3;
+
+	  public:
+		ChannelFormat channelFormat = ChannelFormat::Stereo;
+		std::array<float, mixerStageCount> volumes;
+		std::array<bool, 2> enableAuxStages;
+
+		void reset() {
+			channelFormat = ChannelFormat::Stereo;
+
+			volumes.fill(0.0);
+			enableAuxStages.fill(false);
+		}
+	};
+
+	class HLE_DSP : public DSPCore {
+		// The audio frame types are public in case we want to use them for unit tests
+	  public:
+		template <typename T, usize channelCount = 1>
+		using Sample = DSPMixer::Sample<T, channelCount>;
+
+		template <typename T, usize channelCount>
+		using Frame = DSPMixer::Frame<T, channelCount>;
+
+		template <typename T>
+		using MonoFrame = DSPMixer::MonoFrame<T>;
+
+		template <typename T>
+		using StereoFrame = DSPMixer::StereoFrame<T>;
+
+		template <typename T>
+		using QuadFrame = DSPMixer::QuadFrame<T>;
+
 		using Source = Audio::DSPSource;
 		using SampleBuffer = Source::SampleBuffer;

@ -114,8 +167,8 @@ namespace Audio {
 		std::array<Source, Audio::HLE::sourceCount> sources;  // DSP voices
 		Audio::HLE::DspMemory dspRam;

-		SampleFormat sampleFormat = SampleFormat::ADPCM;
-		SourceType sourceType = SourceType::Stereo;
+		Audio::DSPMixer mixer;
+		std::unique_ptr<Audio::AAC::Decoder> aacDecoder;

 		void resetAudioPipe();
 		bool loaded = false;  // Have we loaded a component?
@ -132,7 +185,7 @@ namespace Audio {
 			} else if (counter1 == 0xffff && counter0 != 0xfffe) {
 				return 0;
 			} else {
-				return counter0 > counter1 ? 0 : 0;
+				return (counter0 > counter1) ? 0 : 1;
 			}
 		}

@ -157,11 +210,20 @@ namespace Audio {
 			}
 		}

+		void handleAACRequest(const AAC::Message& request);
 		void updateSourceConfig(Source& source, HLE::SourceConfiguration::Configuration& config, s16_le* adpcmCoefficients);
+		void updateMixerConfig(HLE::SharedMemory& sharedMem);
 		void generateFrame(StereoFrame<s16>& frame);
+		void generateFrame(DSPSource& source);
 		void outputFrame();
+		// Perform the final mix, mixing the quadraphonic samples from all voices into the output audio frame
+		void performMix(Audio::HLE::SharedMemory& readRegion, Audio::HLE::SharedMemory& writeRegion);
+		
 		// Decode an entire buffer worth of audio
 		void decodeBuffer(DSPSource& source);
+
+		SampleBuffer decodePCM8(const u8* data, usize sampleCount, Source& source);
+		SampleBuffer decodePCM16(const u8* data, usize sampleCount, Source& source);
 		SampleBuffer decodeADPCM(const u8* data, usize sampleCount, Source& source);

 	  public:
@ -169,7 +231,7 @@ namespace Audio {
 		~HLE_DSP() override {}

 		void reset() override;
-		void runAudioFrame() override;
+		void runAudioFrame(u64 eventTimestamp) override;

 		u8* getDspMemory() override { return dspRam.rawMemory.data(); }

--- a/include/audio/null_core.hpp
+++ b/include/audio/null_core.hpp
@ -27,7 +27,7 @@ namespace Audio {
 		~NullDSP() override {}

 		void reset() override;
-		void runAudioFrame() override;
+		void runAudioFrame(u64 eventTimestamp) override;

 		u8* getDspMemory() override { return dspRam.data(); }

--- a/include/audio/teakra_core.hpp
+++ b/include/audio/teakra_core.hpp
@ -83,7 +83,7 @@ namespace Audio {
 		void reset() override;

 		// Run 1 slice of DSP instructions and schedule the next audio frame
-		void runAudioFrame() override {
+		void runAudioFrame(u64 eventTimestamp) override {
 			runSlice();
 			scheduler.addEvent(Scheduler::EventType::RunDSP, scheduler.currentTimestamp + Audio::lleSlice * 2);
 		}
--- a/include/config.hpp
+++ b/include/config.hpp
@ -7,16 +7,33 @@
 // Remember to initialize every field here to its default value otherwise bad things will happen
 struct EmulatorConfig {
 	// Only enable the shader JIT by default on platforms where it's completely tested
-#ifdef PANDA3DS_X64_HOST
+#if defined(PANDA3DS_X64_HOST) || defined(PANDA3DS_ARM64_HOST)
 	static constexpr bool shaderJitDefault = true;
 #else
 	static constexpr bool shaderJitDefault = false;
 #endif

+	// For now, use specialized shaders by default on MacOS as M1 drivers are buggy when using the ubershader, and on Android since mobile GPUs are
+	// horrible. On other platforms we default to ubershader + shadergen fallback for lights
+#if defined(__ANDROID__) || defined(__APPLE__)
+	static constexpr bool ubershaderDefault = false;
+#else
+	static constexpr bool ubershaderDefault = true;
+#endif
+	static constexpr bool accelerateShadersDefault = true;
+
 	bool shaderJitEnabled = shaderJitDefault;
+	bool useUbershaders = ubershaderDefault;
+	bool accelerateShaders = accelerateShadersDefault;
+	bool accurateShaderMul = false;
 	bool discordRpcEnabled = false;
+
+	// Toggles whether to force shadergen when there's more than N lights active and we're using the ubershader, for better performance
+	bool forceShadergenForLights = true;
+	int lightShadergenThreshold = 1;
+
 	RendererType rendererType = RendererType::OpenGL;
-	Audio::DSPCore::Type dspType = Audio::DSPCore::Type::Null;
+	Audio::DSPCore::Type dspType = Audio::DSPCore::Type::HLE;

 	bool sdCardInserted = true;
 	bool sdWriteProtected = false;
@ -25,6 +42,9 @@ struct EmulatorConfig {
 	bool audioEnabled = false;
 	bool vsyncEnabled = true;

+	bool enableRenderdoc = false;
+	bool printAppVersion = true;
+
 	bool chargerPlugged = true;
 	// Default to 3% battery to make users suffer
 	int batteryPercentage = 3;
@ -33,7 +53,25 @@ struct EmulatorConfig {
 	std::filesystem::path defaultRomPath = "";
 	std::filesystem::path filePath;

+	// Frontend window settings
+	struct WindowSettings {
+		static constexpr int defaultX = 200;
+		static constexpr int defaultY = 200;
+		static constexpr int defaultWidth = 800;
+		static constexpr int defaultHeight = 240 * 2;
+
+		bool rememberPosition = false;  // Remember window position & size
+		bool showAppVersion = false;
+
+		int x = defaultX;
+		int y = defaultY;
+		int width = defaultHeight;
+		int height = defaultHeight;
+	};
+
+	WindowSettings windowSettings;
+
 	EmulatorConfig(const std::filesystem::path& path);
 	void load();
 	void save();
-};
+};
--- a/include/crypto/aes_engine.hpp
+++ b/include/crypto/aes_engine.hpp
@ -1,20 +1,29 @@
 #pragma once

 #include <array>
-#include <cstring>
-#include <cstdint>
 #include <climits>
+#include <cstdint>
+#include <cstring>
 #include <filesystem>
 #include <optional>
+#include <vector>

 #include "helpers.hpp"
+#include "io_file.hpp"
+#include "swap.hpp"

 namespace Crypto {
-	constexpr std::size_t AesKeySize = 0x10;
+	constexpr usize AesKeySize = 0x10;
 	using AESKey = std::array<u8, AesKeySize>;

-	template <std::size_t N>
-	static std::array<u8, N> rolArray(const std::array<u8, N>& value, std::size_t bits) {
+	struct Seed {
+		u64_le titleID;
+		AESKey seed;
+		std::array<u8, 8> pad;
+	};
+
+	template <usize N>
+	static std::array<u8, N> rolArray(const std::array<u8, N>& value, usize bits) {
 		const auto bitWidth = N * CHAR_BIT;

 		bits %= bitWidth;
@ -24,18 +33,18 @@ namespace Crypto {

 		std::array<u8, N> result;

-		for (std::size_t i = 0; i < N; i++) {
+		for (usize i = 0; i < N; i++) {
 			result[i] = ((value[(i + byteShift) % N] << bitShift) | (value[(i + byteShift + 1) % N] >> (CHAR_BIT - bitShift))) & UINT8_MAX;
 		}

 		return result;
 	}

-	template <std::size_t N>
+	template <usize N>
 	static std::array<u8, N> addArray(const std::array<u8, N>& a, const std::array<u8, N>& b) {
 		std::array<u8, N> result;
-		std::size_t sum = 0;
-		std::size_t carry = 0;
+		usize sum = 0;
+		usize carry = 0;

 		for (std::int64_t i = N - 1; i >= 0; i--) {
 			sum = a[i] + b[i] + carry;
@ -46,11 +55,11 @@ namespace Crypto {
 		return result;
 	}

-	template <std::size_t N>
+	template <usize N>
 	static std::array<u8, N> xorArray(const std::array<u8, N>& a, const std::array<u8, N>& b) {
 		std::array<u8, N> result;

-		for (std::size_t i = 0; i < N; i++) {
+		for (usize i = 0; i < N; i++) {
 			result[i] = a[i] ^ b[i];
 		}

@ -63,7 +72,7 @@ namespace Crypto {
 		}

 		AESKey rawKey;
-		for (std::size_t i = 0; i < rawKey.size(); i++) {
+		for (usize i = 0; i < rawKey.size(); i++) {
 			rawKey[i] = static_cast<u8>(std::stoi(hex.substr(i * 2, 2), 0, 16));
 		}

@ -76,7 +85,7 @@ namespace Crypto {
 		std::optional<AESKey> normalKey = std::nullopt;
 	};

-	enum KeySlotId : std::size_t {
+	enum KeySlotId : usize {
 		NCCHKey0 = 0x2C,
 		NCCHKey1 = 0x25,
 		NCCHKey2 = 0x18,
@ -84,14 +93,17 @@ namespace Crypto {
 	};

 	class AESEngine {
-	private:
-		constexpr static std::size_t AesKeySlotCount = 0x40;
+	  private:
+		constexpr static usize AesKeySlotCount = 0x40;

 		std::optional<AESKey> m_generator = std::nullopt;
 		std::array<AESKeySlot, AesKeySlotCount> m_slots;
 		bool keysLoaded = false;

-		constexpr void updateNormalKey(std::size_t slotId) {
+		std::vector<Seed> seeds;
+		IOFile seedDatabase;
+
+		constexpr void updateNormalKey(usize slotId) {
 			if (m_generator.has_value() && hasKeyX(slotId) && hasKeyY(slotId)) {
 				auto& keySlot = m_slots.at(slotId);
 				AESKey keyX = keySlot.keyX.value();
@ -101,13 +113,17 @@ namespace Crypto {
 			}
 		}

-	public:
+	  public:
 		AESEngine() {}
 		void loadKeys(const std::filesystem::path& path);
+		void setSeedPath(const std::filesystem::path& path);
+		// Returns true on success, false on failure
+		bool loadSeeds();
+
 		bool haveKeys() { return keysLoaded; }
 		bool haveGenerator() { return m_generator.has_value(); }

-		constexpr bool hasKeyX(std::size_t slotId) {
+		constexpr bool hasKeyX(usize slotId) {
 			if (slotId >= AesKeySlotCount) {
 				return false;
 			}
@ -115,18 +131,16 @@ namespace Crypto {
 			return m_slots.at(slotId).keyX.has_value();
 		}

-		constexpr AESKey getKeyX(std::size_t slotId) {
-			return m_slots.at(slotId).keyX.value_or(AESKey{});
-		}
+		constexpr AESKey getKeyX(usize slotId) { return m_slots.at(slotId).keyX.value_or(AESKey{}); }

-		constexpr void setKeyX(std::size_t slotId, const AESKey &key) {
+		constexpr void setKeyX(usize slotId, const AESKey& key) {
 			if (slotId < AesKeySlotCount) {
 				m_slots.at(slotId).keyX = key;
 				updateNormalKey(slotId);
 			}
 		}

-		constexpr bool hasKeyY(std::size_t slotId) {
+		constexpr bool hasKeyY(usize slotId) {
 			if (slotId >= AesKeySlotCount) {
 				return false;
 			}
@ -134,18 +148,16 @@ namespace Crypto {
 			return m_slots.at(slotId).keyY.has_value();
 		}

-		constexpr AESKey getKeyY(std::size_t slotId) {
-			return m_slots.at(slotId).keyY.value_or(AESKey{});
-		}
+		constexpr AESKey getKeyY(usize slotId) { return m_slots.at(slotId).keyY.value_or(AESKey{}); }

-		constexpr void setKeyY(std::size_t slotId, const AESKey &key) {
+		constexpr void setKeyY(usize slotId, const AESKey& key) {
 			if (slotId < AesKeySlotCount) {
 				m_slots.at(slotId).keyY = key;
 				updateNormalKey(slotId);
 			}
 		}

-		constexpr bool hasNormalKey(std::size_t slotId) {
+		constexpr bool hasNormalKey(usize slotId) {
 			if (slotId >= AesKeySlotCount) {
 				return false;
 			}
@ -153,14 +165,14 @@ namespace Crypto {
 			return m_slots.at(slotId).normalKey.has_value();
 		}

-		constexpr AESKey getNormalKey(std::size_t slotId) {
-			return m_slots.at(slotId).normalKey.value_or(AESKey{});
-		}
+		constexpr AESKey getNormalKey(usize slotId) { return m_slots.at(slotId).normalKey.value_or(AESKey{}); }

-		constexpr void setNormalKey(std::size_t slotId, const AESKey &key) {
+		constexpr void setNormalKey(usize slotId, const AESKey& key) {
 			if (slotId < AesKeySlotCount) {
 				m_slots.at(slotId).normalKey = key;
 			}
 		}
+
+		std::optional<AESKey> getSeedFromDB(u64 titleID);
 	};
-}
+}  // namespace Crypto
--- a/include/emulator.hpp
+++ b/include/emulator.hpp
@ -135,4 +135,7 @@ class Emulator {
 	std::filesystem::path getAppDataRoot();

 	std::span<u8> getSMDH();
+
+  private:
+	void loadRenderdoc();
 };
--- a/include/kernel/handles.hpp
+++ b/include/kernel/handles.hpp
@ -1,7 +1,7 @@
 #pragma once
 #include "helpers.hpp"

-using Handle = u32;
+using HorizonHandle = u32;

 namespace KernelHandles {
 	enum : u32 {
@ -61,17 +61,17 @@ namespace KernelHandles {
 	};

 	// Returns whether "handle" belongs to one of the OS services
-	static constexpr bool isServiceHandle(Handle handle) {
+	static constexpr bool isServiceHandle(HorizonHandle handle) {
 		return handle >= MinServiceHandle && handle <= MaxServiceHandle;
 	}

 	// Returns whether "handle" belongs to one of the OS services' shared memory areas
-	static constexpr bool isSharedMemHandle(Handle handle) {
+	static constexpr bool isSharedMemHandle(HorizonHandle handle) {
 		return handle >= MinSharedMemHandle && handle <= MaxSharedMemHandle;
 	}

 	// Returns the name of a handle as a string based on the given handle
-	static const char* getServiceName(Handle handle) {
+	static const char* getServiceName(HorizonHandle handle) {
 		switch (handle) {
 			case AC: return "AC";
 			case ACT: return "ACT";
--- a/include/kernel/kernel.hpp
+++ b/include/kernel/kernel.hpp
@ -16,8 +16,11 @@
 #include "services/service_manager.hpp"

 class CPU;
+struct Scheduler;

 class Kernel {
+	using Handle = HorizonHandle;
+
 	std::span<u32, 16> regs;
 	CPU& cpu;
 	Memory& mem;
@ -247,6 +250,7 @@ public:

 	ServiceManager& getServiceManager() { return serviceManager; }
 	KFcram& getFcramManager() { return fcramManager; }
+	Scheduler& getScheduler();

 	void sendGPUInterrupt(GPUInterrupt type) { serviceManager.sendGPUInterrupt(type); }
 	void clearInstructionCache();
--- a/include/kernel/kernel_types.hpp
+++ b/include/kernel/kernel_types.hpp
@ -53,7 +53,7 @@ enum class FcramRegion {
 struct AddressArbiter {};

 struct ResourceLimits {
-    Handle handle;
+    HorizonHandle handle;

    s32 currentCommit = 0;
 };
@ -97,6 +97,8 @@ struct Port {
 };

 struct Session {
+	using Handle = HorizonHandle;
+
 	Handle portHandle;  // The port this session is subscribed to
 	Session(Handle portHandle) : portHandle(portHandle) {}
 };
@ -115,6 +117,8 @@ enum class ThreadStatus {
 };

 struct Thread {
+	using Handle = HorizonHandle;
+
 	u32 initialSP;   // Initial r13 value
 	u32 entrypoint;  // Initial r15 value
 	u32 priority;
@ -167,6 +171,8 @@ static const char* kernelObjectTypeToString(KernelObjectType t) {
 }

 struct Mutex {
+    using Handle = HorizonHandle;
+
    u64 waitlist;           // Refer to the getWaitlist function below for documentation
    Handle ownerThread = 0; // Index of the thread that holds the mutex if it's locked
    Handle handle; // Handle of the mutex itself
@ -209,6 +215,8 @@ struct MemoryBlock {

 // Generic kernel object class
 struct KernelObject {
+	using Handle = HorizonHandle;
+
    Handle handle = 0; // A u32 the OS will use to identify objects
    void* data = nullptr;
    KernelObjectType type;
--- a/include/loader/ncch.hpp
+++ b/include/loader/ncch.hpp
@ -50,6 +50,7 @@ struct NCCH {

 	static constexpr u64 mediaUnit = 0x200;
 	u64 size = 0;  // Size of NCCH converted to bytes
+	u64 saveDataSize = 0;
 	u32 stackSize = 0;
 	u32 bssSize = 0;
 	u32 exheaderSize = 0;
@ -60,10 +61,10 @@ struct NCCH {
 	CodeSetInfo text, data, rodata;
 	FSInfo partitionInfo;

+	std::optional<Crypto::AESKey> primaryKey, secondaryKey;
+
 	// Contents of the .code file in the ExeFS
 	std::vector<u8> codeFile;
-	// Contains of the cart's save data
-	std::vector<u8> saveData;
 	// The cart region. Only the CXI's region matters to us. Necessary to get past region locking
 	std::optional<Regions> region = std::nullopt;
 	std::vector<u8> smdh;
@ -76,7 +77,7 @@ struct NCCH {
 	bool hasExeFS() { return exeFS.size != 0; }
 	bool hasRomFS() { return romFS.size != 0; }
 	bool hasCode() { return codeFile.size() != 0; }
-	bool hasSaveData() { return saveData.size() != 0; }
+	bool hasSaveData() { return saveDataSize != 0; }

 	// Parse SMDH for region info and such. Returns false on failure, true on success
 	bool parseSMDH(const std::vector<u8> &smdh);
--- a/include/memory.hpp
+++ b/include/memory.hpp
@ -114,6 +114,7 @@ class Memory {
 		bool changeState = false;
 		bool changePerms = false;
 	};
+	using Handle = HorizonHandle;

 	u8* fcram;
 	u8* dspRam;  // Provided to us by Audio
@ -222,8 +223,14 @@ private:
 	u8* getFCRAM() { return fcram; }

 	enum class BatteryLevel {
-		Empty = 0, AlmostEmpty, OneBar, TwoBars, ThreeBars, FourBars
+		Empty = 0,
+		AlmostEmpty,
+		OneBar,
+		TwoBars,
+		ThreeBars,
+		FourBars,
 	};
+
 	u8 getBatteryState(bool adapterConnected, bool charging, BatteryLevel batteryLevel) {
 		u8 value = static_cast<u8>(batteryLevel) << 2; // Bits 2:4 are the battery level from 0 to 5
 		if (adapterConnected) value |= 1 << 0; // Bit 0 shows if the charger is connected
@ -287,5 +294,5 @@ private:

 	bool allocateMainThreadStack(u32 size);
 	Regions getConsoleRegion();
-	void copySharedFont(u8* ptr);
+	void copySharedFont(u8* ptr, u32 vaddr);
 };
--- a/include/panda_qt/cheats_window.hpp
+++ b/include/panda_qt/cheats_window.hpp
@ -1,6 +1,13 @@
 #pragma once

 #include <QAction>
+#include <QCheckBox>
+#include <QDialog>
+#include <QLabel>
+#include <QLineEdit>
+#include <QListWidget>
+#include <QPushButton>
+#include <QTextEdit>
 #include <QWidget>
 #include <filesystem>
 #include <memory>
@ -24,3 +31,60 @@ class CheatsWindow final : public QWidget {
 	std::filesystem::path cheatPath;
 	Emulator* emu;
 };
+
+struct CheatMetadata {
+	u32 handle = Cheats::badCheatHandle;
+	std::string name = "New cheat";
+	std::string code;
+	bool enabled = true;
+};
+
+class CheatEntryWidget : public QWidget {
+	Q_OBJECT
+
+  public:
+	CheatEntryWidget(Emulator* emu, CheatMetadata metadata, QListWidget* parent);
+
+	void Update() {
+		name->setText(metadata.name.c_str());
+		enabled->setChecked(metadata.enabled);
+		update();
+	}
+
+	void Remove() {
+		emu->getCheats().removeCheat(metadata.handle);
+		cheatList->takeItem(cheatList->row(listItem));
+		deleteLater();
+	}
+
+	const CheatMetadata& getMetadata() { return metadata; }
+	void setMetadata(const CheatMetadata& metadata) { this->metadata = metadata; }
+
+  private:
+	void checkboxChanged(int state);
+	void editClicked();
+
+	Emulator* emu;
+	CheatMetadata metadata;
+	u32 handle;
+	QLabel* name;
+	QCheckBox* enabled;
+	QListWidget* cheatList;
+	QListWidgetItem* listItem;
+};
+
+class CheatEditDialog : public QDialog {
+	Q_OBJECT
+
+  public:
+	CheatEditDialog(Emulator* emu, CheatEntryWidget& cheatEntry);
+
+	void accepted();
+	void rejected();
+
+  private:
+	Emulator* emu;
+	CheatEntryWidget& cheatEntry;
+	QTextEdit* codeEdit;
+	QLineEdit* nameEdit;
+};
--- a/include/panda_qt/elided_label.hpp
+++ b/include/panda_qt/elided_label.hpp
@ -0,0 +1,21 @@
+#pragma once
+#include <QFontMetrics>
+#include <QLabel>
+#include <QString>
+#include <QWidget>
+
+class ElidedLabel : public QLabel {
+	Q_OBJECT
+  public:
+	explicit ElidedLabel(Qt::TextElideMode elideMode = Qt::ElideLeft, QWidget* parent = nullptr);
+	explicit ElidedLabel(QString text, Qt::TextElideMode elideMode = Qt::ElideLeft, QWidget* parent = nullptr);
+	void setText(QString text);
+
+  protected:
+	void resizeEvent(QResizeEvent* event);
+
+  private:
+	void updateText();
+	QString m_text;
+	Qt::TextElideMode m_elideMode;
+};
--- a/include/panda_qt/main_window.hpp
+++ b/include/panda_qt/main_window.hpp
@ -17,7 +17,9 @@
 #include "panda_qt/about_window.hpp"
 #include "panda_qt/cheats_window.hpp"
 #include "panda_qt/config_window.hpp"
+#include "panda_qt/patch_window.hpp"
 #include "panda_qt/screen.hpp"
+#include "panda_qt/shader_editor.hpp"
 #include "panda_qt/text_editor.hpp"
 #include "services/hid.hpp"

@ -47,6 +49,8 @@ class MainWindow : public QMainWindow {
 		EditCheat,
 		PressTouchscreen,
 		ReleaseTouchscreen,
+		ReloadUbershader,
+		SetScreenSize,
 	};

 	// Tagged union representing our message queue messages
@ -78,6 +82,11 @@ class MainWindow : public QMainWindow {
 				u16 x;
 				u16 y;
 			} touchscreen;
+
+			struct {
+				u32 width;
+				u32 height;
+			} screenSize;
 		};
 	};

@ -90,13 +99,15 @@ class MainWindow : public QMainWindow {
 	std::mutex messageQueueMutex;
 	std::vector<EmulatorMessage> messageQueue;

+	QMenuBar* menuBar = nullptr;
 	InputMappings keyboardMappings;
-	ScreenWidget screen;
+	ScreenWidget* screen;
 	AboutWindow* aboutWindow;
 	ConfigWindow* configWindow;
 	CheatsWindow* cheatsEditor;
 	TextEditorWindow* luaEditor;
-	QMenuBar* menuBar = nullptr;
+	PatchWindow* patchWindow;
+	ShaderEditorWindow* shaderEditor;

 	// We use SDL's game controller API since it's the sanest API that supports as many controllers as possible
 	SDL_GameController* gameController = nullptr;
@ -108,17 +119,17 @@ class MainWindow : public QMainWindow {
 	void selectROM();
 	void dumpDspFirmware();
 	void dumpRomFS();
-	void openLuaEditor();
-	void openCheatsEditor();
 	void showAboutMenu();
 	void initControllers();
 	void pollControllers();
+	void setupControllerSensors(SDL_GameController* controller);
 	void sendMessage(const EmulatorMessage& message);
 	void dispatchMessage(const EmulatorMessage& message);

 	// Tracks whether we are using an OpenGL-backed renderer or a Vulkan-backed renderer
 	bool usingGL = false;
 	bool usingVk = false;
+	bool usingMtl = false;

 	// Variables to keep track of whether the user is controlling the 3DS analog stick with their keyboard
 	// This is done so when a gamepad is connected, we won't automatically override the 3DS analog stick settings with the gamepad's state
@ -130,11 +141,15 @@ class MainWindow : public QMainWindow {
 	MainWindow(QApplication* app, QWidget* parent = nullptr);
 	~MainWindow();

+	void closeEvent(QCloseEvent* event) override;
 	void keyPressEvent(QKeyEvent* event) override;
 	void keyReleaseEvent(QKeyEvent* event) override;
 	void mousePressEvent(QMouseEvent* event) override;
 	void mouseReleaseEvent(QMouseEvent* event) override;

 	void loadLuaScript(const std::string& code);
+	void reloadShader(const std::string& shader);
 	void editCheat(u32 handle, const std::vector<uint8_t>& cheat, const std::function<void(u32)>& callback);
+
+	void handleScreenResize(u32 width, u32 height);
 };
--- a/include/panda_qt/patch_window.hpp
+++ b/include/panda_qt/patch_window.hpp
@ -0,0 +1,31 @@
+#pragma once
+#include <QLabel>
+#include <QMessageBox>
+#include <QWidget>
+#include <filesystem>
+
+#include "panda_qt/elided_label.hpp"
+
+class PatchWindow final : public QWidget {
+	Q_OBJECT
+
+  public:
+	PatchWindow(QWidget* parent = nullptr);
+	~PatchWindow() = default;
+
+  private:
+	// Show a message box
+	// Title: Title of the message box to display
+	// Message: Message to display
+	// Icon: The type of icon (error, warning, information, etc) to display
+	// IconPath: If non-null, then a path to an icon in our assets to display on the OK button
+	void displayMessage(
+		const QString& title, const QString& message, QMessageBox::Icon icon = QMessageBox::Icon::Warning, const char* iconPath = nullptr
+	);
+
+	std::filesystem::path inputPath = "";
+	std::filesystem::path patchPath = "";
+
+	ElidedLabel* inputPathLabel = nullptr;
+	ElidedLabel* patchPathLabel = nullptr;
+};
--- a/include/panda_qt/screen.hpp
+++ b/include/panda_qt/screen.hpp
@ -1,5 +1,6 @@
 #pragma once
 #include <QWidget>
+#include <functional>
 #include <memory>

 #include "gl/context.h"
@ -10,15 +11,28 @@ class ScreenWidget : public QWidget {
 	Q_OBJECT

  public:
-	ScreenWidget(QWidget* parent = nullptr);
+	using ResizeCallback = std::function<void(u32, u32)>;
+
+	ScreenWidget(ResizeCallback resizeCallback, QWidget* parent = nullptr);
+	void resizeEvent(QResizeEvent* event) override;
+	// Called by the emulator thread for resizing the actual GL surface, since the emulator thread owns the GL context
+	void resizeSurface(u32 width, u32 height);
+
 	GL::Context* getGLContext() { return glContext.get(); }

 	// Dimensions of our output surface
 	u32 surfaceWidth = 0;
 	u32 surfaceHeight = 0;
+	WindowInfo windowInfo;
+
+	// Cached "previous" dimensions, used when resizing our window
+	u32 previousWidth = 0;
+	u32 previousHeight = 0;

  private:
 	std::unique_ptr<GL::Context> glContext = nullptr;
+	ResizeCallback resizeCallback;
+
 	bool createGLContext();

 	qreal devicePixelRatioFromScreen() const;
--- a/include/panda_qt/shader_editor.hpp
+++ b/include/panda_qt/shader_editor.hpp
@ -0,0 +1,27 @@
+#pragma once
+
+#include <QApplication>
+#include <QDialog>
+#include <QWidget>
+#include <string>
+
+#include "zep.h"
+#include "zep/mode_repl.h"
+#include "zep/regress.h"
+
+class ShaderEditorWindow : public QDialog {
+	Q_OBJECT
+
+  private:
+	Zep::ZepWidget_Qt zepWidget;
+	Zep::IZepReplProvider replProvider;
+	static constexpr float fontSize = 14.0f;
+
+  public:
+	// Whether this backend supports shader editor
+	bool supported = true;
+
+	ShaderEditorWindow(QWidget* parent, const std::string& filename, const std::string& initialText);
+	void setText(const std::string& text) { zepWidget.GetEditor().GetMRUBuffer()->SetText(text); }
+	void setEnable(bool enable);
+};
--- a/include/panda_sdl/frontend_sdl.hpp
+++ b/include/panda_sdl/frontend_sdl.hpp
@ -23,6 +23,8 @@ class FrontendSDL {
 	SDL_GameController* gameController = nullptr;
 	InputMappings keyboardMappings;

+	u32 windowWidth = 400;
+	u32 windowHeight = 480;
 	int gameControllerID;
 	bool programRunning = true;
 	
@ -35,4 +37,6 @@ class FrontendSDL {
 	// And so the user can still use the keyboard to control the analog
 	bool keyboardAnalogX = false;
 	bool keyboardAnalogY = false;
+
+	void setupControllerSensors(SDL_GameController* controller);
 };
--- a/include/renderdoc.hpp
+++ b/include/renderdoc.hpp
@ -0,0 +1,69 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+#include <string>
+
+#include "helpers.hpp"
+
+#ifdef PANDA3DS_ENABLE_RENDERDOC
+namespace Renderdoc {
+	// Loads renderdoc dynamic library module.
+	void loadRenderdoc();
+
+	// Begins a capture if a renderdoc instance is attached.
+	void startCapture();
+
+	// Ends current renderdoc capture.
+	void endCapture();
+
+	// Triggers capturing process.
+	void triggerCapture();
+
+	// Sets output directory for captures
+	void setOutputDir(const std::string& path, const std::string& prefix);
+
+	// Returns whether we've compiled with Renderdoc support
+	static constexpr bool isSupported() { return true; }
+}  // namespace Renderdoc
+#else
+namespace Renderdoc {
+	static void loadRenderdoc() {}
+	static void startCapture() { Helpers::panic("Tried to start a Renderdoc capture while support for renderdoc is disabled") }
+	static void endCapture() { Helpers::panic("Tried to end a Renderdoc capture while support for renderdoc is disabled") }
+	static void triggerCapture() { Helpers::panic("Tried to trigger a Renderdoc capture while support for renderdoc is disabled") }
+	static void setOutputDir(const std::string& path, const std::string& prefix) {}
+	static constexpr bool isSupported() { return false; }
+}  // namespace Renderdoc
+#endif
+
+namespace Renderdoc {
+	// RAII scope class that encloses a Renderdoc capture, as long as it's triggered by triggerCapture
+	struct Scope {
+		Scope() { Renderdoc::startCapture(); }
+		~Scope() { Renderdoc::endCapture(); }
+
+		Scope(const Scope&) = delete;
+		Scope& operator=(const Scope&) = delete;
+
+		Scope(Scope&&) = delete;
+		Scope& operator=(const Scope&&) = delete;
+	};
+
+	// RAII scope class that encloses a Renderdoc capture. Unlike regular Scope it doesn't wait for a trigger, it will always issue the capture
+	// trigger on its own and take a capture
+	struct InstantScope {
+		InstantScope() {
+			Renderdoc::triggerCapture();
+			Renderdoc::startCapture();
+		}
+
+		~InstantScope() { Renderdoc::endCapture(); }
+		
+		InstantScope(const InstantScope&) = delete;
+		InstantScope& operator=(const InstantScope&) = delete;
+
+		InstantScope(InstantScope&&) = delete;
+		InstantScope& operator=(const InstantScope&&) = delete;
+	};
+}  // namespace Renderdoc
--- a/include/renderer.hpp
+++ b/include/renderer.hpp
@ -1,8 +1,10 @@
 #pragma once
 #include <array>
-#include <span>
 #include <optional>
+#include <span>
+#include <string>

+#include "PICA/draw_acceleration.hpp"
 #include "PICA/pica_vertex.hpp"
 #include "PICA/regs.hpp"
 #include "helpers.hpp"
@ -16,12 +18,16 @@ enum class RendererType : s8 {
 	Null = 0,
 	OpenGL = 1,
 	Vulkan = 2,
-	Software = 3,
+	Metal = 3,
+	Software = 4,
 };

-class GPU;
+struct EmulatorConfig;
 struct SDL_Window;

+class GPU;
+class ShaderUnit;
+
 class Renderer {
  protected:
 	GPU& gpu;
@ -45,6 +51,8 @@ class Renderer {
 	u32 outputWindowWidth = 400;
 	u32 outputWindowHeight = 240 * 2;

+	EmulatorConfig* emulatorConfig = nullptr;
+
  public:
 	Renderer(GPU& gpu, const std::array<u32, regNum>& internalRegs, const std::array<u32, extRegNum>& externalRegs);
 	virtual ~Renderer();
@ -66,6 +74,19 @@ class Renderer {
 	// This function does things like write back or cache necessary state before we delete our context
 	virtual void deinitGraphicsContext() = 0;

+	// Functions for hooking up the renderer core to the frontend's shader editor for editing ubershaders in real time
+	// SupportsShaderReload: Indicates whether the backend offers ubershader reload support or not
+	// GetUbershader/SetUbershader: Gets or sets the renderer's current ubershader
+	virtual bool supportsShaderReload() { return false; }
+	virtual std::string getUbershader() { return ""; }
+	virtual void setUbershader(const std::string& shader) {}
+
+	// This function is called on every draw call before parsing vertex data.
+	// It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between
+	// ubershaders and shadergen, and so on.
+	// Returns whether this draw is eligible for using hardware-accelerated shaders or if shaders should run on the CPU
+	virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) { return false; }
+
 	// Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window
 #ifdef PANDA3DS_FRONTEND_QT
 	virtual void initGraphicsContext(GL::Context* context) { Helpers::panic("Tried to initialize incompatible renderer with GL context"); }
@ -91,4 +112,6 @@ class Renderer {
 		outputWindowWidth = width;
 		outputWindowHeight = height;
 	}
+
+	void setConfig(EmulatorConfig* config) { emulatorConfig = config; }
 };
--- a/include/renderer_gl/gl_driver.hpp
+++ b/include/renderer_gl/gl_driver.hpp
@ -0,0 +1,12 @@
+#pragma once
+
+// Information about our OpenGL/OpenGL ES driver that we should keep track of
+// Stuff like whether specific extensions are supported, and potentially things like OpenGL context information
+namespace OpenGL {
+	struct Driver {
+		bool supportsExtFbFetch = false;
+		bool supportsArmFbFetch = false;
+
+		bool supportFbFetch() const { return supportsExtFbFetch || supportsArmFbFetch; }
+	};
+}  // namespace OpenGL
--- a/include/renderer_gl/gl_state.hpp
+++ b/include/renderer_gl/gl_state.hpp
@ -38,11 +38,14 @@ struct GLStateManager {
 	
 	GLuint stencilMask;
 	GLuint boundVAO;
-	GLuint boundVBO;
 	GLuint currentProgram;
+	GLuint boundUBO;

 	GLenum depthFunc;
 	GLenum logicOp;
+	GLenum blendEquationRGB, blendEquationAlpha;
+	GLenum blendFuncSourceRGB, blendFuncSourceAlpha;
+	GLenum blendFuncDestRGB, blendFuncDestAlpha;

 	void reset();
 	void resetBlend();
@ -51,7 +54,7 @@ struct GLStateManager {
 	void resetColourMask();
 	void resetDepth();
 	void resetVAO();
-	void resetVBO();
+	void resetBuffers();
 	void resetProgram();
 	void resetScissor();
 	void resetStencil();
@ -169,13 +172,6 @@ struct GLStateManager {
 		}
 	}

-	void bindVBO(GLuint handle) {
-		if (boundVBO != handle) {
-			boundVBO = handle;
-			glBindBuffer(GL_ARRAY_BUFFER, handle);
-		}
-	}
-
 	void useProgram(GLuint handle) {
 		if (currentProgram != handle) {
 			currentProgram = handle;
@ -183,8 +179,14 @@ struct GLStateManager {
 		}
 	}

+	void bindUBO(GLuint handle) {
+		if (boundUBO != handle) {
+			boundUBO = handle;
+			glBindBuffer(GL_UNIFORM_BUFFER, boundUBO);
+		}
+	}
+
 	void bindVAO(const OpenGL::VertexArray& vao) { bindVAO(vao.handle()); }
-	void bindVBO(const OpenGL::VertexBuffer& vbo) { bindVBO(vbo.handle()); }
 	void useProgram(const OpenGL::Program& program) { useProgram(program.handle()); }

 	void setColourMask(bool r, bool g, bool b, bool a) {
@ -224,6 +226,41 @@ struct GLStateManager {
 	}

 	void setDepthFunc(OpenGL::DepthFunc func) { setDepthFunc(static_cast<GLenum>(func)); }
+
+	// Counterpart to glBlendEquationSeparate
+	void setBlendEquation(GLenum modeRGB, GLenum modeAlpha) {
+		if (blendEquationRGB != modeRGB || blendEquationAlpha != modeAlpha) {
+			blendEquationRGB = modeRGB;
+			blendEquationAlpha = modeAlpha;
+
+			glBlendEquationSeparate(modeRGB, modeAlpha);
+		}
+	}
+
+	// Counterpart to glBlendFuncSeparate
+	void setBlendFunc(GLenum sourceRGB, GLenum destRGB, GLenum sourceAlpha, GLenum destAlpha) {
+		if (blendFuncSourceRGB != sourceRGB || blendFuncDestRGB != destRGB || blendFuncSourceAlpha != sourceAlpha ||
+			blendFuncDestAlpha != destAlpha) {
+
+			blendFuncSourceRGB = sourceRGB;
+			blendFuncDestRGB = destRGB;
+			blendFuncSourceAlpha = sourceAlpha;
+			blendFuncDestAlpha = destAlpha;
+
+			glBlendFuncSeparate(sourceRGB, destRGB,sourceAlpha, destAlpha);
+		}
+	}
+
+	// Counterpart to regular glBlendEquation
+	void setBlendEquation(GLenum mode) { setBlendEquation(mode, mode); }
+
+	void setBlendEquation(OpenGL::BlendEquation modeRGB, OpenGL::BlendEquation modeAlpha) {
+		setBlendEquation(static_cast<GLenum>(modeRGB), static_cast<GLenum>(modeAlpha));
+	}
+
+	void setBlendEquation(OpenGL::BlendEquation mode) {
+		setBlendEquation(static_cast<GLenum>(mode));
+	}
 };

 static_assert(std::is_trivially_constructible<GLStateManager>(), "OpenGL State Manager class is not trivially constructible!");
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@ -1,11 +1,23 @@
 #pragma once

 #include <array>
+#include <cstring>
+#include <functional>
+#include <memory>
+#include <optional>
 #include <span>
+#include <unordered_map>
+#include <utility>

 #include "PICA/float_types.hpp"
+#include "PICA/pica_frag_config.hpp"
+#include "PICA/pica_hash.hpp"
+#include "PICA/pica_vert_config.hpp"
 #include "PICA/pica_vertex.hpp"
 #include "PICA/regs.hpp"
+#include "PICA/shader_gen.hpp"
+#include "gl/stream_buffer.h"
+#include "gl_driver.hpp"
 #include "gl_state.hpp"
 #include "helpers.hpp"
 #include "logger.hpp"
@ -22,27 +34,48 @@ class RendererGL final : public Renderer {
 	OpenGL::Program triangleProgram;
 	OpenGL::Program displayProgram;

-	OpenGL::VertexArray vao;
+	// VAO for when not using accelerated vertex shaders. Contains attribute declarations matching to the PICA fixed function fragment attributes
+	OpenGL::VertexArray defaultVAO;
+	// VAO for when using accelerated vertex shaders. The PICA vertex shader inputs are passed as attributes without CPU processing.
+	OpenGL::VertexArray hwShaderVAO;
 	OpenGL::VertexBuffer vbo;

-	// TEV configuration uniform locations
-	GLint textureEnvSourceLoc = -1;
-	GLint textureEnvOperandLoc = -1;
-	GLint textureEnvCombinerLoc = -1;
-	GLint textureEnvColorLoc = -1;
-	GLint textureEnvScaleLoc = -1;
+	// Data 
+	struct {
+		// TEV configuration uniform locations
+		GLint textureEnvSourceLoc = -1;
+		GLint textureEnvOperandLoc = -1;
+		GLint textureEnvCombinerLoc = -1;
+		GLint textureEnvColorLoc = -1;
+		GLint textureEnvScaleLoc = -1;

-	// Uniform of PICA registers
-	GLint picaRegLoc = -1;
+		// Uniform of PICA registers
+		GLint picaRegLoc = -1;

-	// Depth configuration uniform locations
-	GLint depthOffsetLoc = -1;
-	GLint depthScaleLoc = -1;
-	GLint depthmapEnableLoc = -1;
+		// Depth configuration uniform locations
+		GLint depthOffsetLoc = -1;
+		GLint depthScaleLoc = -1;
+		GLint depthmapEnableLoc = -1;
+	} ubershaderData;

 	float oldDepthScale = -1.0;
 	float oldDepthOffset = 0.0;
 	bool oldDepthmapEnable = false;
+	// Set by prepareForDraw, tells us whether the current draw is using hw-accelerated shader
+	bool usingAcceleratedShader = false;
+	bool performIndexedRender = false;
+	bool usingShortIndices = false;
+
+	// Set by prepareForDraw, metadata for indexed renders
+	GLuint minimumIndex = 0;
+	GLuint maximumIndex = 0;
+	void* hwIndexBufferOffset = nullptr;
+
+	// When doing hw shaders, we cache which attributes are enabled in our VAO to avoid having to enable/disable all attributes on each draw
+	u32 previousAttributeMask = 0;
+
+	// Cached pointer to the current vertex shader when using HW accelerated shaders
+	OpenGL::Shader* generatedVertexShader = nullptr;

 	SurfaceCache<DepthBuffer, 16, true> depthBufferCache;
 	SurfaceCache<ColourBuffer, 16, true> colourBufferCache;
@ -53,25 +86,81 @@ class RendererGL final : public Renderer {
 	OpenGL::VertexBuffer dummyVBO;

 	OpenGL::Texture screenTexture;
-	GLuint lightLUTTextureArray;
+	OpenGL::Texture LUTTexture;
 	OpenGL::Framebuffer screenFramebuffer;
 	OpenGL::Texture blankTexture;
+	// The "default" vertex shader to use when using specialized shaders but not PICA vertex shader -> GLSL recompilation
+	// We can compile this once and then link it with all other generated fragment shaders
+	OpenGL::Shader defaultShadergenVs;
+	GLuint shadergenFragmentUBO;
+	// UBO for uploading the PICA uniforms when using hw shaders
+	GLuint hwShaderUniformUBO;
+
+	using StreamBuffer = OpenGLStreamBuffer;
+	std::unique_ptr<StreamBuffer> hwVertexBuffer;
+	std::unique_ptr<StreamBuffer> hwIndexBuffer;
+
+	// Cache of fixed attribute values so that we don't do any duplicate updates
+	std::array<std::array<float, 4>, 16> fixedAttrValues;
+
+	// Cached recompiled fragment shader
+	struct CachedProgram {
+		OpenGL::Program program;
+	};
+
+	struct ShaderCache {
+		std::unordered_map<PICA::VertConfig, std::optional<OpenGL::Shader>> vertexShaderCache;
+		std::unordered_map<PICA::FragmentConfig, OpenGL::Shader> fragmentShaderCache;
+
+		// Program cache indexed by GLuints for the vertex and fragment shader to use
+		// Top 32 bits are the vertex shader GLuint, bottom 32 bits are the fs GLuint
+		std::unordered_map<u64, CachedProgram> programCache;
+
+		void clear() {
+			for (auto& it : programCache) {
+				CachedProgram& cachedProgram = it.second;
+				cachedProgram.program.free();
+			}
+
+			for (auto& it : vertexShaderCache) {
+				if (it.second.has_value()) {
+					it.second->free();
+				}
+			}
+
+			for (auto& it : fragmentShaderCache) {
+				it.second.free();
+			}
+
+			programCache.clear();
+			vertexShaderCache.clear();
+			fragmentShaderCache.clear();
+		}
+	};
+	ShaderCache shaderCache;

 	OpenGL::Framebuffer getColourFBO();
 	OpenGL::Texture getTexture(Texture& tex);
+	OpenGL::Program& getSpecializedShader();
+
+	PICA::ShaderGen::FragmentGenerator fragShaderGen;
+	OpenGL::Driver driverInfo;

 	MAKE_LOG_FUNCTION(log, rendererLogger)
 	void setupBlending();
 	void setupStencilTest(bool stencilEnable);
 	void bindDepthBuffer();
-	void setupTextureEnvState();
+	void setupUbershaderTexEnv();
 	void bindTexturesToSlots();
 	void updateLightingLUT();
+	void updateFogLUT();
 	void initGraphicsContextInternal();

+	void accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel);
+
  public:
 	RendererGL(GPU& gpu, const std::array<u32, regNum>& internalRegs, const std::array<u32, extRegNum>& externalRegs)
-		: Renderer(gpu, internalRegs, externalRegs) {}
+		: Renderer(gpu, internalRegs, externalRegs), fragShaderGen(PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL) {}
 	~RendererGL() override;

 	void reset() override;
@ -82,12 +171,18 @@ class RendererGL final : public Renderer {
 	void textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) override;
 	void drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) override;             // Draw the given vertices
 	void deinitGraphicsContext() override;
+
+	virtual bool supportsShaderReload() override { return true; }
+	virtual std::string getUbershader() override;
+	virtual void setUbershader(const std::string& shader) override;
+	virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) override;
 	
 	std::optional<ColourBuffer> getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true);

 	// Note: The caller is responsible for deleting the currently bound FBO before calling this
 	void setFBO(uint handle) { screenFramebuffer.m_handle = handle; }
 	void resetStateManager() { gl.reset(); }
+	void initUbershader(OpenGL::Program& program);

 #ifdef PANDA3DS_FRONTEND_QT
 	virtual void initGraphicsContext([[maybe_unused]] GL::Context* context) override { initGraphicsContextInternal(); }
@ -95,4 +190,4 @@ class RendererGL final : public Renderer {

 	// Take a screenshot of the screen and store it in a file
 	void screenshot(const std::string& name) override;
-};
+};
--- a/include/renderer_gl/surface_cache.hpp
+++ b/include/renderer_gl/surface_cache.hpp
@ -19,8 +19,6 @@ template <typename SurfaceType, size_t capacity, bool evictOnOverflow = false>
 class SurfaceCache {
    // Vanilla std::optional can't hold actual references
    using OptionalRef = std::optional<std::reference_wrapper<SurfaceType>>;
-    static_assert(std::is_same<SurfaceType, ColourBuffer>() || std::is_same<SurfaceType, DepthBuffer>()  ||
-        std::is_same<SurfaceType, Texture>(), "Invalid surface type");

    size_t size;
    size_t evictionIndex;
--- a/include/renderer_mtl/mtl_blit_pipeline_cache.hpp
+++ b/include/renderer_mtl/mtl_blit_pipeline_cache.hpp
@ -0,0 +1,74 @@
+#pragma once
+
+#include <map>
+
+#include "objc_helper.hpp"
+#include "pica_to_mtl.hpp"
+
+using namespace PICA;
+
+namespace Metal {
+	struct BlitPipelineHash {
+		// Formats
+		ColorFmt colorFmt;
+		DepthFmt depthFmt;
+	};
+
+	// This pipeline only caches the pipeline with all of its color and depth attachment variations
+	class BlitPipelineCache {
+	  public:
+		BlitPipelineCache() = default;
+
+		~BlitPipelineCache() {
+			reset();
+			vertexFunction->release();
+			fragmentFunction->release();
+		}
+
+		void set(MTL::Device* dev, MTL::Function* vert, MTL::Function* frag) {
+			device = dev;
+			vertexFunction = vert;
+			fragmentFunction = frag;
+		}
+
+		MTL::RenderPipelineState* get(BlitPipelineHash hash) {
+			u8 intHash = ((u8)hash.colorFmt << 3) | (u8)hash.depthFmt;
+			auto& pipeline = pipelineCache[intHash];
+			if (!pipeline) {
+				MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init();
+				desc->setVertexFunction(vertexFunction);
+				desc->setFragmentFunction(fragmentFunction);
+
+				auto colorAttachment = desc->colorAttachments()->object(0);
+				colorAttachment->setPixelFormat(toMTLPixelFormatColor(hash.colorFmt));
+
+				desc->setDepthAttachmentPixelFormat(toMTLPixelFormatDepth(hash.depthFmt));
+
+				NS::Error* error = nullptr;
+				desc->setLabel(toNSString("Blit pipeline"));
+				pipeline = device->newRenderPipelineState(desc, &error);
+				if (error) {
+					Helpers::panic("Error creating blit pipeline state: %s", error->description()->cString(NS::ASCIIStringEncoding));
+				}
+
+				desc->release();
+			}
+
+			return pipeline;
+		}
+
+		void reset() {
+			for (auto& pair : pipelineCache) {
+				pair.second->release();
+			}
+			pipelineCache.clear();
+		}
+
+	  private:
+		std::map<u8, MTL::RenderPipelineState*> pipelineCache;
+
+		MTL::Device* device;
+		MTL::Function* vertexFunction;
+		MTL::Function* fragmentFunction;
+	};
+}  // namespace Metal
--- a/include/renderer_mtl/mtl_command_encoder.hpp
+++ b/include/renderer_mtl/mtl_command_encoder.hpp
@ -0,0 +1,56 @@
+#pragma once
+
+#include <Metal/Metal.hpp>
+
+namespace Metal {
+	struct RenderState {
+		MTL::RenderPipelineState* renderPipelineState = nullptr;
+		MTL::DepthStencilState* depthStencilState = nullptr;
+		MTL::Texture* textures[3] = {nullptr};
+		MTL::SamplerState* samplerStates[3] = {nullptr};
+	};
+
+	class CommandEncoder {
+	  public:
+		void newRenderCommandEncoder(MTL::RenderCommandEncoder* rce) {
+			renderCommandEncoder = rce;
+
+			// Reset the render state
+			renderState = RenderState{};
+		}
+
+		// Resource binding
+		void setRenderPipelineState(MTL::RenderPipelineState* renderPipelineState) {
+			if (renderPipelineState != renderState.renderPipelineState) {
+				renderCommandEncoder->setRenderPipelineState(renderPipelineState);
+				renderState.renderPipelineState = renderPipelineState;
+			}
+		}
+
+		void setDepthStencilState(MTL::DepthStencilState* depthStencilState) {
+			if (depthStencilState != renderState.depthStencilState) {
+				renderCommandEncoder->setDepthStencilState(depthStencilState);
+				renderState.depthStencilState = depthStencilState;
+			}
+		}
+
+		void setFragmentTexture(MTL::Texture* texture, u32 index) {
+			if (texture != renderState.textures[index]) {
+				renderCommandEncoder->setFragmentTexture(texture, index);
+				renderState.textures[index] = texture;
+			}
+		}
+
+		void setFragmentSamplerState(MTL::SamplerState* samplerState, u32 index) {
+			if (samplerState != renderState.samplerStates[index]) {
+				renderCommandEncoder->setFragmentSamplerState(samplerState, index);
+				renderState.samplerStates[index] = samplerState;
+			}
+		}
+
+	  private:
+		MTL::RenderCommandEncoder* renderCommandEncoder = nullptr;
+
+		RenderState renderState;
+	};
+}  // namespace Metal
--- a/include/renderer_mtl/mtl_common.hpp
+++ b/include/renderer_mtl/mtl_common.hpp
@ -0,0 +1,6 @@
+#pragma once
+
+#include <Metal/Metal.hpp>
+
+#define GET_HELPER_TEXTURE_BINDING(binding) (30 - binding)
+#define GET_HELPER_SAMPLER_STATE_BINDING(binding) (15 - binding)
--- a/include/renderer_mtl/mtl_depth_stencil_cache.hpp
+++ b/include/renderer_mtl/mtl_depth_stencil_cache.hpp
@ -0,0 +1,80 @@
+#pragma once
+
+#include <map>
+
+#include "pica_to_mtl.hpp"
+
+using namespace PICA;
+
+namespace Metal {
+	struct DepthStencilHash {
+		u32 stencilConfig;
+		u16 stencilOpConfig;
+		bool depthStencilWrite;
+		u8 depthFunc;
+	};
+
+	class DepthStencilCache {
+	  public:
+		DepthStencilCache() = default;
+
+		~DepthStencilCache() { reset(); }
+
+		void set(MTL::Device* dev) { device = dev; }
+
+		MTL::DepthStencilState* get(DepthStencilHash hash) {
+			u64 intHash =
+				((u64)hash.depthStencilWrite << 56) | ((u64)hash.depthFunc << 48) | ((u64)hash.stencilConfig << 16) | (u64)hash.stencilOpConfig;
+			auto& depthStencilState = depthStencilCache[intHash];
+			if (!depthStencilState) {
+				MTL::DepthStencilDescriptor* desc = MTL::DepthStencilDescriptor::alloc()->init();
+				desc->setDepthWriteEnabled(hash.depthStencilWrite);
+				desc->setDepthCompareFunction(toMTLCompareFunc(hash.depthFunc));
+
+				const bool stencilEnable = Helpers::getBit<0>(hash.stencilConfig);
+				MTL::StencilDescriptor* stencilDesc = nullptr;
+				if (stencilEnable) {
+					const u8 stencilFunc = Helpers::getBits<4, 3>(hash.stencilConfig);
+					const u8 stencilRefMask = Helpers::getBits<24, 8>(hash.stencilConfig);
+
+					const u32 stencilBufferMask = hash.depthStencilWrite ? Helpers::getBits<8, 8>(hash.stencilConfig) : 0;
+
+					const u8 stencilFailOp = Helpers::getBits<0, 3>(hash.stencilOpConfig);
+					const u8 depthFailOp = Helpers::getBits<4, 3>(hash.stencilOpConfig);
+					const u8 passOp = Helpers::getBits<8, 3>(hash.stencilOpConfig);
+
+					stencilDesc = MTL::StencilDescriptor::alloc()->init();
+					stencilDesc->setStencilFailureOperation(toMTLStencilOperation(stencilFailOp));
+					stencilDesc->setDepthFailureOperation(toMTLStencilOperation(depthFailOp));
+					stencilDesc->setDepthStencilPassOperation(toMTLStencilOperation(passOp));
+					stencilDesc->setStencilCompareFunction(toMTLCompareFunc(stencilFunc));
+					stencilDesc->setReadMask(stencilRefMask);
+					stencilDesc->setWriteMask(stencilBufferMask);
+
+					desc->setFrontFaceStencil(stencilDesc);
+					desc->setBackFaceStencil(stencilDesc);
+				}
+
+				depthStencilState = device->newDepthStencilState(desc);
+
+				desc->release();
+				if (stencilDesc) {
+					stencilDesc->release();
+				}
+			}
+
+			return depthStencilState;
+		}
+
+		void reset() {
+			for (auto& pair : depthStencilCache) {
+				pair.second->release();
+			}
+			depthStencilCache.clear();
+		}
+
+	  private:
+		std::map<u64, MTL::DepthStencilState*> depthStencilCache;
+		MTL::Device* device;
+	};
+}  // namespace Metal
--- a/include/renderer_mtl/mtl_draw_pipeline_cache.hpp
+++ b/include/renderer_mtl/mtl_draw_pipeline_cache.hpp
@ -0,0 +1,162 @@
+#pragma once
+
+#include <map>
+
+#include "objc_helper.hpp"
+#include "pica_to_mtl.hpp"
+
+using namespace PICA;
+
+namespace Metal {
+	struct DrawFragmentFunctionHash {
+		u32 lightingConfig1;   // 32 bits (TODO: check this)
+		bool lightingEnabled;  // 1 bit
+		u8 lightingNumLights;  // 3 bits
+		//                                 |   ref    | func |  on  |
+		u16 alphaControl;  // 12 bits (mask:  11111111   0111   0001)
+	};
+
+	inline bool operator<(const DrawFragmentFunctionHash& l, const DrawFragmentFunctionHash& r) {
+		if (!l.lightingEnabled && r.lightingEnabled) return true;
+		if (l.lightingNumLights < r.lightingNumLights) return true;
+		if (l.lightingConfig1 < r.lightingConfig1) return true;
+		if (l.alphaControl < r.alphaControl) return true;
+
+		return false;
+	}
+
+	struct DrawPipelineHash {  // 56 bits
+		// Formats
+		ColorFmt colorFmt;  // 3 bits
+		DepthFmt depthFmt;  // 3 bits
+
+		// Blending
+		bool blendEnabled;  // 1 bit
+		//                                 |    functions     |   aeq    |   ceq    |
+		u32 blendControl;   // 22 bits (mask:  1111111111111111   00000111   00000111)
+		u8 colorWriteMask;  // 4 bits
+
+		DrawFragmentFunctionHash fragHash;
+	};
+
+	inline bool operator<(const DrawPipelineHash& l, const DrawPipelineHash& r) {
+		if ((u32)l.colorFmt < (u32)r.colorFmt) return true;
+		if ((u32)l.depthFmt < (u32)r.depthFmt) return true;
+		if (!l.blendEnabled && r.blendEnabled) return true;
+		if (l.blendControl < r.blendControl) return true;
+		if (l.colorWriteMask < r.colorWriteMask) return true;
+		if (l.fragHash < r.fragHash) return true;
+
+		return false;
+	}
+
+	// This pipeline only caches the pipeline with all of its color and depth attachment variations
+	class DrawPipelineCache {
+	  public:
+		DrawPipelineCache() = default;
+
+		~DrawPipelineCache() {
+			reset();
+			vertexDescriptor->release();
+			vertexFunction->release();
+		}
+
+		void set(MTL::Device* dev, MTL::Library* lib, MTL::Function* vert, MTL::VertexDescriptor* vertDesc) {
+			device = dev;
+			library = lib;
+			vertexFunction = vert;
+			vertexDescriptor = vertDesc;
+		}
+
+		MTL::RenderPipelineState* get(DrawPipelineHash hash) {
+			auto& pipeline = pipelineCache[hash];
+
+			if (!pipeline) {
+				auto& fragmentFunction = fragmentFunctionCache[hash.fragHash];
+				if (!fragmentFunction) {
+					MTL::FunctionConstantValues* constants = MTL::FunctionConstantValues::alloc()->init();
+					constants->setConstantValue(&hash.fragHash.lightingEnabled, MTL::DataTypeBool, NS::UInteger(0));
+					constants->setConstantValue(&hash.fragHash.lightingNumLights, MTL::DataTypeUChar, NS::UInteger(1));
+					constants->setConstantValue(&hash.fragHash.lightingConfig1, MTL::DataTypeUInt, NS::UInteger(2));
+					constants->setConstantValue(&hash.fragHash.alphaControl, MTL::DataTypeUShort, NS::UInteger(3));
+
+					NS::Error* error = nullptr;
+					fragmentFunction = library->newFunction(NS::String::string("fragmentDraw", NS::ASCIIStringEncoding), constants, &error);
+					if (error) {
+						Helpers::panic("Error creating draw fragment function: %s", error->description()->cString(NS::ASCIIStringEncoding));
+					}
+					constants->release();
+				}
+
+				MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init();
+				desc->setVertexFunction(vertexFunction);
+				desc->setFragmentFunction(fragmentFunction);
+				desc->setVertexDescriptor(vertexDescriptor);
+
+				auto colorAttachment = desc->colorAttachments()->object(0);
+				colorAttachment->setPixelFormat(toMTLPixelFormatColor(hash.colorFmt));
+				MTL::ColorWriteMask writeMask = 0;
+				if (hash.colorWriteMask & 0x1) writeMask |= MTL::ColorWriteMaskRed;
+				if (hash.colorWriteMask & 0x2) writeMask |= MTL::ColorWriteMaskGreen;
+				if (hash.colorWriteMask & 0x4) writeMask |= MTL::ColorWriteMaskBlue;
+				if (hash.colorWriteMask & 0x8) writeMask |= MTL::ColorWriteMaskAlpha;
+				colorAttachment->setWriteMask(writeMask);
+				if (hash.blendEnabled) {
+					const u8 rgbEquation = hash.blendControl & 0x7;
+					const u8 alphaEquation = Helpers::getBits<8, 3>(hash.blendControl);
+
+					// Get blending functions
+					const u8 rgbSourceFunc = Helpers::getBits<16, 4>(hash.blendControl);
+					const u8 rgbDestFunc = Helpers::getBits<20, 4>(hash.blendControl);
+					const u8 alphaSourceFunc = Helpers::getBits<24, 4>(hash.blendControl);
+					const u8 alphaDestFunc = Helpers::getBits<28, 4>(hash.blendControl);
+
+					colorAttachment->setBlendingEnabled(true);
+					colorAttachment->setRgbBlendOperation(toMTLBlendOperation(rgbEquation));
+					colorAttachment->setAlphaBlendOperation(toMTLBlendOperation(alphaEquation));
+					colorAttachment->setSourceRGBBlendFactor(toMTLBlendFactor(rgbSourceFunc));
+					colorAttachment->setDestinationRGBBlendFactor(toMTLBlendFactor(rgbDestFunc));
+					colorAttachment->setSourceAlphaBlendFactor(toMTLBlendFactor(alphaSourceFunc));
+					colorAttachment->setDestinationAlphaBlendFactor(toMTLBlendFactor(alphaDestFunc));
+				}
+
+				MTL::PixelFormat depthFormat = toMTLPixelFormatDepth(hash.depthFmt);
+				desc->setDepthAttachmentPixelFormat(depthFormat);
+				if (hash.depthFmt == DepthFmt::Depth24Stencil8) desc->setStencilAttachmentPixelFormat(depthFormat);
+
+				NS::Error* error = nullptr;
+				desc->setLabel(toNSString("Draw pipeline"));
+				pipeline = device->newRenderPipelineState(desc, &error);
+				if (error) {
+					Helpers::panic("Error creating draw pipeline state: %s", error->description()->cString(NS::ASCIIStringEncoding));
+				}
+
+				desc->release();
+			}
+
+			return pipeline;
+		}
+
+		void reset() {
+			for (auto& pair : pipelineCache) {
+				pair.second->release();
+			}
+			pipelineCache.clear();
+
+			for (auto& pair : fragmentFunctionCache) {
+				pair.second->release();
+			}
+			fragmentFunctionCache.clear();
+		}
+
+	  private:
+		std::map<DrawPipelineHash, MTL::RenderPipelineState*> pipelineCache;
+		std::map<DrawFragmentFunctionHash, MTL::Function*> fragmentFunctionCache;
+
+		MTL::Device* device;
+		MTL::Library* library;
+		MTL::Function* vertexFunction;
+		MTL::VertexDescriptor* vertexDescriptor;
+	};
+
+}  // namespace Metal
--- a/include/renderer_mtl/mtl_lut_texture.hpp
+++ b/include/renderer_mtl/mtl_lut_texture.hpp
@ -0,0 +1,20 @@
+#pragma once
+
+#include <Metal/Metal.hpp>
+
+namespace Metal {
+
+class LutTexture {
+public:
+    LutTexture(MTL::Device* device, MTL::TextureType type, MTL::PixelFormat pixelFormat, u32 width, u32 height, const char* name);
+    ~LutTexture();
+    u32 getNextIndex();
+
+    MTL::Texture* getTexture() { return texture; }
+    u32 getCurrentIndex() { return currentIndex; }
+private:
+    MTL::Texture* texture;
+    u32 currentIndex = 0;
+};
+
+} // namespace Metal
--- a/include/renderer_mtl/mtl_render_target.hpp
+++ b/include/renderer_mtl/mtl_render_target.hpp
@ -0,0 +1,91 @@
+#pragma once
+#include <Metal/Metal.hpp>
+#include <array>
+#include <string>
+
+#include "boost/icl/interval.hpp"
+#include "helpers.hpp"
+#include "math_util.hpp"
+#include "objc_helper.hpp"
+#include "opengl.hpp"
+#include "pica_to_mtl.hpp"
+
+template <typename T>
+using Interval = boost::icl::right_open_interval<T>;
+
+namespace Metal {
+	template <typename Format_t>
+	struct RenderTarget {
+		MTL::Device* device;
+
+		u32 location;
+		Format_t format;
+		OpenGL::uvec2 size;
+		bool valid;
+
+		// Range of VRAM taken up by buffer
+		Interval<u32> range;
+
+		MTL::Texture* texture = nullptr;
+
+		RenderTarget() : valid(false) {}
+
+		RenderTarget(MTL::Device* dev, u32 loc, Format_t format, u32 x, u32 y, bool valid = true)
+			: device(dev), location(loc), format(format), size({x, y}), valid(valid) {
+			u64 endLoc = (u64)loc + sizeInBytes();
+			// Check if start and end are valid here
+			range = Interval<u32>(loc, (u32)endLoc);
+		}
+
+		Math::Rect<u32> getSubRect(u32 inputAddress, u32 width, u32 height) {
+			const u32 startOffset = (inputAddress - location) / sizePerPixel(format);
+			const u32 x0 = (startOffset % (size.x() * 8)) / 8;
+			const u32 y0 = (startOffset / (size.x() * 8)) * 8;
+			return Math::Rect<u32>{x0, size.y() - y0, x0 + width, size.y() - height - y0};
+		}
+
+		// For 2 textures to "match" we only care about their locations, formats, and dimensions to match
+		// For other things, such as filtering mode, etc, we can just switch the attributes of the cached texture
+		bool matches(RenderTarget& other) {
+			return location == other.location && format == other.format && size.x() == other.size.x() && size.y() == other.size.y();
+		}
+
+		void allocate() {
+			MTL::PixelFormat pixelFormat = MTL::PixelFormatInvalid;
+			if (std::is_same<Format_t, PICA::ColorFmt>::value) {
+				pixelFormat = PICA::toMTLPixelFormatColor((PICA::ColorFmt)format);
+			} else if (std::is_same<Format_t, PICA::DepthFmt>::value) {
+				pixelFormat = PICA::toMTLPixelFormatDepth((PICA::DepthFmt)format);
+			} else {
+				panic("Invalid format type");
+			}
+
+			MTL::TextureDescriptor* descriptor = MTL::TextureDescriptor::alloc()->init();
+			descriptor->setTextureType(MTL::TextureType2D);
+			descriptor->setPixelFormat(pixelFormat);
+			descriptor->setWidth(size.u());
+			descriptor->setHeight(size.v());
+			descriptor->setUsage(MTL::TextureUsageRenderTarget | MTL::TextureUsageShaderRead);
+			descriptor->setStorageMode(MTL::StorageModePrivate);
+			texture = device->newTexture(descriptor);
+			texture->setLabel(toNSString(
+				std::string(std::is_same<Format_t, PICA::ColorFmt>::value ? "Color" : "Depth") + " render target " + std::to_string(size.u()) + "x" +
+				std::to_string(size.v())
+			));
+			descriptor->release();
+		}
+
+		void free() {
+			valid = false;
+
+			if (texture) {
+				texture->release();
+			}
+		}
+
+		u64 sizeInBytes() { return (size_t)size.x() * (size_t)size.y() * PICA::sizePerPixel(format); }
+	};
+
+	using ColorRenderTarget = RenderTarget<PICA::ColorFmt>;
+	using DepthStencilRenderTarget = RenderTarget<PICA::DepthFmt>;
+}  // namespace Metal
--- a/include/renderer_mtl/mtl_texture.hpp
+++ b/include/renderer_mtl/mtl_texture.hpp
@ -0,0 +1,73 @@
+#pragma once
+
+#include <Metal/Metal.hpp>
+#include <array>
+#include <string>
+
+#include "PICA/regs.hpp"
+#include "boost/icl/interval.hpp"
+#include "helpers.hpp"
+#include "math_util.hpp"
+#include "opengl.hpp"
+#include "renderer_mtl/pica_to_mtl.hpp"
+
+template <typename T>
+using Interval = boost::icl::right_open_interval<T>;
+
+namespace Metal {
+	struct Texture {
+		MTL::Device* device;
+
+		u32 location;
+		u32 config;  // Magnification/minification filter, wrapping configs, etc
+		PICA::TextureFmt format;
+		OpenGL::uvec2 size;
+		bool valid;
+
+		// Range of VRAM taken up by buffer
+		Interval<u32> range;
+
+		PICA::PixelFormatInfo formatInfo;
+		MTL::Texture* texture = nullptr;
+		MTL::SamplerState* sampler = nullptr;
+
+		Texture() : valid(false) {}
+
+		Texture(MTL::Device* dev, u32 loc, PICA::TextureFmt format, u32 x, u32 y, u32 config, bool valid = true)
+			: device(dev), location(loc), format(format), size({x, y}), config(config), valid(valid) {
+			u64 endLoc = (u64)loc + sizeInBytes();
+			// Check if start and end are valid here
+			range = Interval<u32>(loc, (u32)endLoc);
+		}
+
+		// For 2 textures to "match" we only care about their locations, formats, and dimensions to match
+		// For other things, such as filtering mode, etc, we can just switch the attributes of the cached texture
+		bool matches(Texture& other) {
+			return location == other.location && format == other.format && size.x() == other.size.x() && size.y() == other.size.y();
+		}
+
+		void allocate();
+		void setNewConfig(u32 newConfig);
+		void decodeTexture(std::span<const u8> data);
+		void free();
+		u64 sizeInBytes();
+
+		u8 decodeTexelU8(u32 u, u32 v, PICA::TextureFmt fmt, std::span<const u8> data);
+		u16 decodeTexelU16(u32 u, u32 v, PICA::TextureFmt fmt, std::span<const u8> data);
+		u32 decodeTexelU32(u32 u, u32 v, PICA::TextureFmt fmt, std::span<const u8> data);
+
+		// Get the morton interleave offset of a texel based on its U and V values
+		static u32 mortonInterleave(u32 u, u32 v);
+		// Get the byte offset of texel (u, v) in the texture
+		static u32 getSwizzledOffset(u32 u, u32 v, u32 width, u32 bytesPerPixel);
+		static u32 getSwizzledOffset_4bpp(u32 u, u32 v, u32 width);
+
+		// Returns the format of this texture as a string
+		std::string_view formatToString() { return PICA::textureFormatToString(format); }
+
+		// Returns the texel at coordinates (u, v) of an ETC1(A4) texture
+		// TODO: Make hasAlpha a template parameter
+		u32 getTexelETC(bool hasAlpha, u32 u, u32 v, u32 width, std::span<const u8> data);
+		u32 decodeETC(u32 alpha, u32 u, u32 v, u64 colourData);
+	};
+}  // namespace Metal
--- a/include/renderer_mtl/mtl_vertex_buffer_cache.hpp
+++ b/include/renderer_mtl/mtl_vertex_buffer_cache.hpp
@ -0,0 +1,83 @@
+#pragma once
+
+#include <cstring>
+
+#include "helpers.hpp"
+#include "pica_to_mtl.hpp"
+
+
+using namespace PICA;
+
+namespace Metal {
+	struct BufferHandle {
+		MTL::Buffer* buffer;
+		usize offset;
+	};
+
+	class VertexBufferCache {
+		// 128MB buffer for caching vertex data
+		static constexpr usize CACHE_BUFFER_SIZE = 128 * 1024 * 1024;
+
+	  public:
+		VertexBufferCache() = default;
+
+		~VertexBufferCache() {
+			endFrame();
+			buffer->release();
+		}
+
+		void set(MTL::Device* dev) {
+			device = dev;
+			create();
+		}
+
+		void endFrame() {
+			ptr = 0;
+			for (auto buffer : additionalAllocations) {
+				buffer->release();
+			}
+			additionalAllocations.clear();
+		}
+
+		BufferHandle get(const void* data, usize size) {
+			// If the vertex buffer is too large, just create a new one
+			if (ptr + size > CACHE_BUFFER_SIZE) {
+				MTL::Buffer* newBuffer = device->newBuffer(data, size, MTL::ResourceStorageModeShared);
+				newBuffer->setLabel(toNSString("Additional vertex buffer"));
+				additionalAllocations.push_back(newBuffer);
+				Helpers::warn("Vertex buffer doesn't have enough space, creating a new buffer");
+
+				return BufferHandle{newBuffer, 0};
+			}
+
+			// Copy the data into the buffer
+			std::memcpy((char*)buffer->contents() + ptr, data, size);
+
+			auto oldPtr = ptr;
+			ptr += size;
+
+			return BufferHandle{buffer, oldPtr};
+		}
+
+		void reset() {
+			endFrame();
+
+			if (buffer) {
+				buffer->release();
+				create();
+			}
+		}
+
+	  private:
+		MTL::Buffer* buffer = nullptr;
+		usize ptr = 0;
+		std::vector<MTL::Buffer*> additionalAllocations;
+
+		MTL::Device* device;
+
+		void create() {
+			buffer = device->newBuffer(CACHE_BUFFER_SIZE, MTL::ResourceStorageModeShared);
+			buffer->setLabel(toNSString("Shared vertex buffer"));
+		}
+	};
+}  // namespace Metal
--- a/include/renderer_mtl/objc_helper.hpp
+++ b/include/renderer_mtl/objc_helper.hpp
@ -0,0 +1,12 @@
+#pragma once
+
+#include <string>
+
+#include "mtl_common.hpp"
+
+namespace Metal {
+	dispatch_data_t createDispatchData(const void* data, size_t size);
+}  // namespace Metal
+
+// Cast from std::string to NS::String*
+inline NS::String* toNSString(const std::string& str) { return NS::String::string(str.c_str(), NS::ASCIIStringEncoding); }
--- a/include/renderer_mtl/pica_to_mtl.hpp
+++ b/include/renderer_mtl/pica_to_mtl.hpp
@ -0,0 +1,152 @@
+#pragma once
+
+#include <Metal/Metal.hpp>
+
+#include "PICA/regs.hpp"
+
+namespace PICA {
+	struct PixelFormatInfo {
+		MTL::PixelFormat pixelFormat;
+		size_t bytesPerTexel;
+	};
+
+	constexpr PixelFormatInfo pixelFormatInfos[14] = {
+		{MTL::PixelFormatRGBA8Unorm, 4},   // RGBA8
+		{MTL::PixelFormatRGBA8Unorm, 4},   // RGB8
+		{MTL::PixelFormatBGR5A1Unorm, 2},  // RGBA5551
+		{MTL::PixelFormatB5G6R5Unorm, 2},  // RGB565
+		{MTL::PixelFormatABGR4Unorm, 2},   // RGBA4
+		{MTL::PixelFormatRGBA8Unorm, 4},   // IA8
+		{MTL::PixelFormatRG8Unorm, 2},     // RG8
+		{MTL::PixelFormatRGBA8Unorm, 4},   // I8
+		{MTL::PixelFormatA8Unorm, 1},      // A8
+		{MTL::PixelFormatABGR4Unorm, 2},   // IA4
+		{MTL::PixelFormatABGR4Unorm, 2},   // I4
+		{MTL::PixelFormatA8Unorm, 1},      // A4
+		{MTL::PixelFormatRGBA8Unorm, 4},   // ETC1
+		{MTL::PixelFormatRGBA8Unorm, 4},   // ETC1A4
+	};
+
+	inline PixelFormatInfo getPixelFormatInfo(TextureFmt format) { return pixelFormatInfos[static_cast<int>(format)]; }
+
+	inline MTL::PixelFormat toMTLPixelFormatColor(ColorFmt format) {
+		switch (format) {
+			case ColorFmt::RGBA8: return MTL::PixelFormatRGBA8Unorm;
+			case ColorFmt::RGB8: return MTL::PixelFormatRGBA8Unorm;
+			case ColorFmt::RGBA5551: return MTL::PixelFormatRGBA8Unorm;  // TODO: use MTL::PixelFormatBGR5A1Unorm?
+			case ColorFmt::RGB565: return MTL::PixelFormatRGBA8Unorm;    // TODO: use MTL::PixelFormatB5G6R5Unorm?
+			case ColorFmt::RGBA4: return MTL::PixelFormatABGR4Unorm;
+		}
+	}
+
+	inline MTL::PixelFormat toMTLPixelFormatDepth(DepthFmt format) {
+		switch (format) {
+			case DepthFmt::Depth16: return MTL::PixelFormatDepth16Unorm;
+			case DepthFmt::Unknown1: return MTL::PixelFormatInvalid;
+			case DepthFmt::Depth24:
+				return MTL::PixelFormatDepth32Float;  // Metal does not support 24-bit depth formats
+			// Apple sillicon doesn't support 24-bit depth buffers, so we use 32-bit instead
+			case DepthFmt::Depth24Stencil8: return MTL::PixelFormatDepth32Float_Stencil8;
+		}
+	}
+
+	inline MTL::CompareFunction toMTLCompareFunc(u8 func) {
+		switch (func) {
+			case 0: return MTL::CompareFunctionNever;
+			case 1: return MTL::CompareFunctionAlways;
+			case 2: return MTL::CompareFunctionEqual;
+			case 3: return MTL::CompareFunctionNotEqual;
+			case 4: return MTL::CompareFunctionLess;
+			case 5: return MTL::CompareFunctionLessEqual;
+			case 6: return MTL::CompareFunctionGreater;
+			case 7: return MTL::CompareFunctionGreaterEqual;
+			default: Helpers::panic("Unknown compare function %u", func);
+		}
+
+		return MTL::CompareFunctionAlways;
+	}
+
+	inline MTL::BlendOperation toMTLBlendOperation(u8 op) {
+		switch (op) {
+			case 0: return MTL::BlendOperationAdd;
+			case 1: return MTL::BlendOperationSubtract;
+			case 2: return MTL::BlendOperationReverseSubtract;
+			case 3: return MTL::BlendOperationMin;
+			case 4: return MTL::BlendOperationMax;
+			case 5: return MTL::BlendOperationAdd;  // Unused (same as 0)
+			case 6: return MTL::BlendOperationAdd;  // Unused (same as 0)
+			case 7: return MTL::BlendOperationAdd;  // Unused (same as 0)
+			default: Helpers::panic("Unknown blend operation %u", op);
+		}
+
+		return MTL::BlendOperationAdd;
+	}
+
+	inline MTL::BlendFactor toMTLBlendFactor(u8 factor) {
+		switch (factor) {
+			case 0: return MTL::BlendFactorZero;
+			case 1: return MTL::BlendFactorOne;
+			case 2: return MTL::BlendFactorSourceColor;
+			case 3: return MTL::BlendFactorOneMinusSourceColor;
+			case 4: return MTL::BlendFactorDestinationColor;
+			case 5: return MTL::BlendFactorOneMinusDestinationColor;
+			case 6: return MTL::BlendFactorSourceAlpha;
+			case 7: return MTL::BlendFactorOneMinusSourceAlpha;
+			case 8: return MTL::BlendFactorDestinationAlpha;
+			case 9: return MTL::BlendFactorOneMinusDestinationAlpha;
+			case 10: return MTL::BlendFactorBlendColor;
+			case 11: return MTL::BlendFactorOneMinusBlendColor;
+			case 12: return MTL::BlendFactorBlendAlpha;
+			case 13: return MTL::BlendFactorOneMinusBlendAlpha;
+			case 14: return MTL::BlendFactorSourceAlphaSaturated;
+			case 15: return MTL::BlendFactorOne;  // Undocumented
+			default: Helpers::panic("Unknown blend factor %u", factor);
+		}
+
+		return MTL::BlendFactorOne;
+	}
+
+	inline MTL::StencilOperation toMTLStencilOperation(u8 op) {
+		switch (op) {
+			case 0: return MTL::StencilOperationKeep;
+			case 1: return MTL::StencilOperationZero;
+			case 2: return MTL::StencilOperationReplace;
+			case 3: return MTL::StencilOperationIncrementClamp;
+			case 4: return MTL::StencilOperationDecrementClamp;
+			case 5: return MTL::StencilOperationInvert;
+			case 6: return MTL::StencilOperationIncrementWrap;
+			case 7: return MTL::StencilOperationDecrementWrap;
+			default: Helpers::panic("Unknown stencil operation %u", op);
+		}
+
+		return MTL::StencilOperationKeep;
+	}
+
+	inline MTL::PrimitiveType toMTLPrimitiveType(PrimType primType) {
+		switch (primType) {
+			case PrimType::TriangleList: return MTL::PrimitiveTypeTriangle;
+			case PrimType::TriangleStrip: return MTL::PrimitiveTypeTriangleStrip;
+			case PrimType::TriangleFan:
+				Helpers::warn("Triangle fans are not supported on Metal, using triangles instead");
+				return MTL::PrimitiveTypeTriangle;
+			case PrimType::GeometryPrimitive:
+				return MTL::PrimitiveTypeTriangle;
+		}
+	}
+
+	inline MTL::SamplerAddressMode toMTLSamplerAddressMode(u8 addrMode) {
+		switch (addrMode) {
+			case 0: return MTL::SamplerAddressModeClampToEdge;
+			case 1: return MTL::SamplerAddressModeClampToBorderColor;
+			case 2: return MTL::SamplerAddressModeRepeat;
+			case 3: return MTL::SamplerAddressModeMirrorRepeat;
+			case 4: return MTL::SamplerAddressModeClampToEdge;
+			case 5: return MTL::SamplerAddressModeClampToBorderColor;
+			case 6: return MTL::SamplerAddressModeRepeat;
+			case 7: return MTL::SamplerAddressModeRepeat;
+			default: Helpers::panic("Unknown sampler address mode %u", addrMode);
+		}
+
+		return MTL::SamplerAddressModeClampToEdge;
+	}
+}  // namespace PICA
--- a/include/renderer_mtl/renderer_mtl.hpp
+++ b/include/renderer_mtl/renderer_mtl.hpp
@ -0,0 +1,207 @@
+#pragma once
+
+#include <Metal/Metal.hpp>
+#include <QuartzCore/QuartzCore.hpp>
+
+#include "mtl_blit_pipeline_cache.hpp"
+#include "mtl_command_encoder.hpp"
+#include "mtl_depth_stencil_cache.hpp"
+#include "mtl_draw_pipeline_cache.hpp"
+#include "mtl_lut_texture.hpp"
+#include "mtl_render_target.hpp"
+#include "mtl_texture.hpp"
+#include "mtl_vertex_buffer_cache.hpp"
+#include "renderer.hpp"
+
+
+// HACK: use the OpenGL cache
+#include "../renderer_gl/surface_cache.hpp"
+
+class GPU;
+
+struct Color4 {
+	float r, g, b, a;
+};
+
+class RendererMTL final : public Renderer {
+  public:
+	RendererMTL(GPU& gpu, const std::array<u32, regNum>& internalRegs, const std::array<u32, extRegNum>& externalRegs);
+	~RendererMTL() override;
+
+	void reset() override;
+	void display() override;
+	void initGraphicsContext(SDL_Window* window) override;
+	void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) override;
+	void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) override;
+	void textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) override;
+	void drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) override;
+	void screenshot(const std::string& name) override;
+	void deinitGraphicsContext() override;
+
+#ifdef PANDA3DS_FRONTEND_QT
+	virtual void initGraphicsContext([[maybe_unused]] GL::Context* context) override {}
+#endif
+
+  private:
+	CA::MetalLayer* metalLayer;
+
+	MTL::Device* device;
+	MTL::CommandQueue* commandQueue;
+
+	Metal::CommandEncoder commandEncoder;
+
+	// Libraries
+	MTL::Library* library;
+
+	// Caches
+	SurfaceCache<Metal::ColorRenderTarget, 16, true> colorRenderTargetCache;
+	SurfaceCache<Metal::DepthStencilRenderTarget, 16, true> depthStencilRenderTargetCache;
+	SurfaceCache<Metal::Texture, 256, true> textureCache;
+	Metal::BlitPipelineCache blitPipelineCache;
+	Metal::DrawPipelineCache drawPipelineCache;
+	Metal::DepthStencilCache depthStencilCache;
+	Metal::VertexBufferCache vertexBufferCache;
+
+	// Resources
+	MTL::SamplerState* nearestSampler;
+	MTL::SamplerState* linearSampler;
+	MTL::Texture* nullTexture;
+	MTL::DepthStencilState* defaultDepthStencilState;
+
+	Metal::LutTexture* lutLightingTexture;
+	Metal::LutTexture* lutFogTexture;
+
+	// Pipelines
+	MTL::RenderPipelineState* displayPipeline;
+	// MTL::RenderPipelineState* copyToLutTexturePipeline;
+
+	// Clears
+	std::map<MTL::Texture*, Color4> colorClearOps;
+	std::map<MTL::Texture*, float> depthClearOps;
+	std::map<MTL::Texture*, u8> stencilClearOps;
+
+	// Active state
+	MTL::CommandBuffer* commandBuffer = nullptr;
+	MTL::RenderCommandEncoder* renderCommandEncoder = nullptr;
+	MTL::Texture* lastColorTexture = nullptr;
+	MTL::Texture* lastDepthTexture = nullptr;
+
+	// Debug
+	std::string nextRenderPassName;
+
+	void createCommandBufferIfNeeded() {
+		if (!commandBuffer) {
+			commandBuffer = commandQueue->commandBuffer();
+		}
+	}
+
+	void endRenderPass() {
+		if (renderCommandEncoder) {
+			renderCommandEncoder->endEncoding();
+			renderCommandEncoder = nullptr;
+		}
+	}
+
+	void beginRenderPassIfNeeded(
+		MTL::RenderPassDescriptor* renderPassDescriptor, bool doesClears, MTL::Texture* colorTexture, MTL::Texture* depthTexture = nullptr
+	);
+
+	void commitCommandBuffer() {
+		if (renderCommandEncoder) {
+			renderCommandEncoder->endEncoding();
+			renderCommandEncoder->release();
+			renderCommandEncoder = nullptr;
+		}
+		if (commandBuffer) {
+			commandBuffer->commit();
+			// HACK
+			commandBuffer->waitUntilCompleted();
+			commandBuffer->release();
+			commandBuffer = nullptr;
+		}
+	}
+
+	template <typename AttachmentT, typename ClearDataT, typename GetAttachmentT, typename SetClearDataT>
+	inline void clearAttachment(
+		MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* texture, ClearDataT clearData, GetAttachmentT getAttachment,
+		SetClearDataT setClearData
+	) {
+		bool beginRenderPass = (renderPassDescriptor == nullptr);
+		if (!renderPassDescriptor) {
+			renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init();
+		}
+
+		AttachmentT* attachment = getAttachment(renderPassDescriptor);
+		attachment->setTexture(texture);
+		setClearData(attachment, clearData);
+		attachment->setLoadAction(MTL::LoadActionClear);
+		attachment->setStoreAction(MTL::StoreActionStore);
+
+		if (beginRenderPass) {
+			if (std::is_same<AttachmentT, MTL::RenderPassColorAttachmentDescriptor>::value)
+				beginRenderPassIfNeeded(renderPassDescriptor, true, texture);
+			else
+				beginRenderPassIfNeeded(renderPassDescriptor, true, nullptr, texture);
+		}
+	}
+
+	template <typename AttachmentT, typename ClearDataT, typename GetAttachmentT, typename SetClearDataT>
+	inline bool clearAttachment(
+		MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* texture, std::map<MTL::Texture*, ClearDataT>& clearOps,
+		GetAttachmentT getAttachment, SetClearDataT setClearData
+	) {
+		auto it = clearOps.find(texture);
+		if (it != clearOps.end()) {
+			clearAttachment<AttachmentT>(renderPassDescriptor, texture, it->second, getAttachment, setClearData);
+			clearOps.erase(it);
+			return true;
+		}
+
+		if (renderPassDescriptor) {
+			AttachmentT* attachment = getAttachment(renderPassDescriptor);
+			attachment->setTexture(texture);
+			attachment->setLoadAction(MTL::LoadActionLoad);
+			attachment->setStoreAction(MTL::StoreActionStore);
+		}
+
+		return false;
+	}
+
+	bool clearColor(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* texture) {
+		return clearAttachment<MTL::RenderPassColorAttachmentDescriptor, Color4>(
+			renderPassDescriptor, texture, colorClearOps,
+			[](MTL::RenderPassDescriptor* renderPassDescriptor) { return renderPassDescriptor->colorAttachments()->object(0); },
+			[](auto attachment, auto& color) { attachment->setClearColor(MTL::ClearColor(color.r, color.g, color.b, color.a)); }
+		);
+	}
+
+	bool clearDepth(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* texture) {
+		return clearAttachment<MTL::RenderPassDepthAttachmentDescriptor, float>(
+			renderPassDescriptor, texture, depthClearOps,
+			[](MTL::RenderPassDescriptor* renderPassDescriptor) { return renderPassDescriptor->depthAttachment(); },
+			[](auto attachment, auto& depth) { attachment->setClearDepth(depth); }
+		);
+	}
+
+	bool clearStencil(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* texture) {
+		return clearAttachment<MTL::RenderPassStencilAttachmentDescriptor, u8>(
+			renderPassDescriptor, texture, stencilClearOps,
+			[](MTL::RenderPassDescriptor* renderPassDescriptor) { return renderPassDescriptor->stencilAttachment(); },
+			[](auto attachment, auto& stencil) { attachment->setClearStencil(stencil); }
+		);
+	}
+
+	std::optional<Metal::ColorRenderTarget> getColorRenderTarget(
+		u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true
+	);
+	Metal::DepthStencilRenderTarget& getDepthRenderTarget();
+	Metal::Texture& getTexture(Metal::Texture& tex);
+	void setupTextureEnvState(MTL::RenderCommandEncoder* encoder);
+	void bindTexturesToSlots();
+	void updateLightingLUT(MTL::RenderCommandEncoder* encoder);
+	void updateFogLUT(MTL::RenderCommandEncoder* encoder);
+	void textureCopyImpl(
+		Metal::ColorRenderTarget& srcFramebuffer, Metal::ColorRenderTarget& destFramebuffer, const Math::Rect<u32>& srcRect,
+		const Math::Rect<u32>& destRect
+	);
+};
--- a/include/scheduler.hpp
+++ b/include/scheduler.hpp
@ -11,7 +11,8 @@ struct Scheduler {
 		VBlank = 0,          // End of frame event
 		UpdateTimers = 1,    // Update kernel timer objects
 		RunDSP = 2,          // Make the emulated DSP run for one audio frame
-		Panic = 3,           // Dummy event that is always pending and should never be triggered (Timestamp = UINT64_MAX)
+		SignalY2R = 3,       // Signal that a Y2R conversion has finished
+		Panic = 4,           // Dummy event that is always pending and should never be triggered (Timestamp = UINT64_MAX)
 		TotalNumberOfEvents  // How many event types do we have in total?
 	};
 	static constexpr usize totalNumberOfEvents = static_cast<usize>(EventType::TotalNumberOfEvents);
--- a/include/sdl_sensors.hpp
+++ b/include/sdl_sensors.hpp
@ -0,0 +1,38 @@
+#pragma once
+
+#include <cmath>
+#include <glm/glm.hpp>
+
+#include "helpers.hpp"
+#include "services/hid.hpp"
+
+// Convert SDL sensor readings to 3DS format
+// We use the same code for Android as well, since the values we get from Android are in the same format as SDL (m/s^2 for acceleration, rad/s for
+// rotation)
+namespace Sensors::SDL {
+    // Convert the rotation data we get from SDL sensor events to rotation data we can feed right to HID
+    // Returns [pitch, roll, yaw]
+    static glm::vec3 convertRotation(glm::vec3 rotation) {
+        // Annoyingly, Android doesn't support the <numbers> header yet so we define pi ourselves
+        static constexpr double pi = 3.141592653589793;
+        // Convert the rotation from rad/s to deg/s and scale by the gyroscope coefficient in HID
+        constexpr float scale = 180.f / pi * HIDService::gyroscopeCoeff;
+        // The axes are also inverted, so invert scale before the multiplication.
+        return rotation * -scale;
+    }
+
+    static glm::vec3 convertAcceleration(float* data) {
+        // Set our cap to ~9 m/s^2. The 3DS sensors cap at -930 and +930, so values above this value will get clamped to 930
+        // At rest (3DS laid flat on table), hardware reads around ~0 for x and z axis, and around ~480 for y axis due to gravity.
+        // This code tries to mimic this approximately, with offsets based on measurements from my DualShock 4.
+        static constexpr float accelMax = 9.f;
+        // We define standard gravity(g) ourself instead of using the SDL one in order for the code to work on Android too.
+        static constexpr float standardGravity = 9.80665f;
+
+        s16 x = std::clamp<s16>(s16(data[0] / accelMax * 930.f), -930, +930);
+        s16 y = std::clamp<s16>(s16(data[1] / (standardGravity * accelMax) * 930.f - 350.f), -930, +930);
+        s16 z = std::clamp<s16>(s16((data[2] - 2.1f) / accelMax * 930.f), -930, +930);
+
+        return glm::vec3(x, y, z);
+    }
+}  // namespace Sensors::SDL
--- a/include/services/ac.hpp
+++ b/include/services/ac.hpp
@ -8,6 +8,8 @@
 #include "result/result.hpp"

 class ACService {
+	using Handle = HorizonHandle;
+
 	Handle handle = KernelHandles::AC;
 	Memory& mem;
 	MAKE_LOG_FUNCTION(log, acLogger)
--- a/include/services/act.hpp
+++ b/include/services/act.hpp
@ -6,6 +6,8 @@
 #include "result/result.hpp"

 class ACTService {
+	using Handle = HorizonHandle;
+
 	Handle handle = KernelHandles::ACT;
 	Memory& mem;
 	MAKE_LOG_FUNCTION(log, actLogger)
@ -15,7 +17,7 @@ class ACTService {
 	void generateUUID(u32 messagePointer);
 	void getAccountDataBlock(u32 messagePointer);

-public:
+  public:
 	ACTService(Memory& mem) : mem(mem) {}
 	void reset();
 	void handleSyncRequest(u32 messagePointer);
--- a/include/services/am.hpp
+++ b/include/services/am.hpp
@ -6,6 +6,8 @@
 #include "result/result.hpp"

 class AMService {
+	using Handle = HorizonHandle;
+
 	Handle handle = KernelHandles::AM;
 	Memory& mem;
 	MAKE_LOG_FUNCTION(log, amLogger)
@ -15,7 +17,7 @@ class AMService {
 	void getPatchTitleInfo(u32 messagePointer);
 	void listTitleInfo(u32 messagePointer);

-public:
+  public:
 	AMService(Memory& mem) : mem(mem) {}
 	void reset();
 	void handleSyncRequest(u32 messagePointer);
--- a/include/services/apt.hpp
+++ b/include/services/apt.hpp
@ -12,7 +12,8 @@
 class Kernel;

 enum class ConsoleModel : u32 {
-	Old3DS, New3DS
+	Old3DS,
+	New3DS,
 };

 // https://www.3dbrew.org/wiki/NS_and_APT_Services#Command
@ -41,6 +42,8 @@ namespace APT::Transitions {
 }

 class APTService {
+	using Handle = HorizonHandle;
+
 	Handle handle = KernelHandles::APT;
 	Memory& mem;
 	Kernel& kernel;
@ -99,7 +102,7 @@ class APTService {

 	u32 screencapPostPermission;

-public:
+  public:
 	APTService(Memory& mem, Kernel& kernel) : mem(mem), kernel(kernel), appletManager(mem) {}
 	void reset();
 	void handleSyncRequest(u32 messagePointer);
--- a/include/services/boss.hpp
+++ b/include/services/boss.hpp
@ -6,6 +6,8 @@
 #include "result/result.hpp"

 class BOSSService {
+	using Handle = HorizonHandle;
+
 	Handle handle = KernelHandles::BOSS;
 	Memory& mem;
 	MAKE_LOG_FUNCTION(log, bossLogger)
@ -17,7 +19,7 @@ class BOSSService {
 	void getNewArrivalFlag(u32 messagePointer);
 	void getNsDataIdList(u32 messagePointer, u32 commandWord);
 	void getOptoutFlag(u32 messagePointer);
-	void getStorageEntryInfo(u32 messagePointer); // Unknown what this is, name taken from Citra
+	void getStorageEntryInfo(u32 messagePointer);  // Unknown what this is, name taken from Citra
 	void getTaskIdList(u32 messagePointer);
 	void getTaskInfo(u32 messagePointer);
 	void getTaskServiceStatus(u32 messagePointer);
@ -35,7 +37,8 @@ class BOSSService {
 	void unregisterTask(u32 messagePointer);

 	s8 optoutFlag;
-public:
+
+  public:
 	BOSSService(Memory& mem) : mem(mem) {}
 	void reset();
 	void handleSyncRequest(u32 messagePointer);
--- a/include/services/cam.hpp
+++ b/include/services/cam.hpp
@ -12,6 +12,7 @@
 class Kernel;

 class CAMService {
+	using Handle = HorizonHandle;
 	using Event = std::optional<Handle>;

 	struct Port {
--- a/include/services/cecd.hpp
+++ b/include/services/cecd.hpp
@ -1,5 +1,6 @@
 #pragma once
 #include <optional>
+
 #include "helpers.hpp"
 #include "kernel_types.hpp"
 #include "logger.hpp"
@ -9,6 +10,8 @@
 class Kernel;

 class CECDService {
+	using Handle = HorizonHandle;
+
 	Handle handle = KernelHandles::CECD;
 	Memory& mem;
 	Kernel& kernel;
@ -20,7 +23,7 @@ class CECDService {
 	void getInfoEventHandle(u32 messagePointer);
 	void openAndRead(u32 messagePointer);

-public:
+  public:
 	CECDService(Memory& mem, Kernel& kernel) : mem(mem), kernel(kernel) {}
 	void reset();
 	void handleSyncRequest(u32 messagePointer);
--- a/include/services/cfg.hpp
+++ b/include/services/cfg.hpp
@ -1,5 +1,6 @@
 #pragma once
 #include <cstring>
+
 #include "helpers.hpp"
 #include "logger.hpp"
 #include "memory.hpp"
@ -7,8 +8,10 @@
 #include "result/result.hpp"

 class CFGService {
+	using Handle = HorizonHandle;
+
 	Memory& mem;
-	CountryCodes country = CountryCodes::US; // Default to USA
+	CountryCodes country = CountryCodes::US;  // Default to USA
 	MAKE_LOG_FUNCTION(log, cfgLogger)

 	void writeStringU16(u32 pointer, const std::u16string& string);
@ -27,12 +30,12 @@ class CFGService {

 	void getConfigInfo(u32 output, u32 blockID, u32 size, u32 permissionMask);

-public:
+  public:
 	enum class Type {
-		U, // cfg:u
-		I, // cfg:i
-		S, // cfg:s
-		NOR, // cfg:nor
+		U,    // cfg:u
+		I,    // cfg:i
+		S,    // cfg:s
+		NOR,  // cfg:nor
 	};

 	CFGService(Memory& mem) : mem(mem) {}
--- a/include/services/csnd.hpp
+++ b/include/services/csnd.hpp
@ -10,6 +10,8 @@
 class Kernel;

 class CSNDService {
+	using Handle = HorizonHandle;
+
 	Handle handle = KernelHandles::CSND;
 	Memory& mem;
 	Kernel& kernel;
@ -30,7 +32,5 @@ class CSNDService {
 	void reset();
 	void handleSyncRequest(u32 messagePointer);

-	void setSharedMemory(u8* ptr) {
-		sharedMemory = ptr;
-	}
+	void setSharedMemory(u8* ptr) { sharedMemory = ptr; }
 };
--- a/include/services/dlp_srvr.hpp
+++ b/include/services/dlp_srvr.hpp
@ -8,6 +8,8 @@
 // Please forgive me for how everything in this file is named
 // "dlp:SRVR" is not a nice name to work with
 class DlpSrvrService {
+	using Handle = HorizonHandle;
+
 	Handle handle = KernelHandles::DLP_SRVR;
 	Memory& mem;
 	MAKE_LOG_FUNCTION(log, dlpSrvrLogger)
@ -15,7 +17,7 @@ class DlpSrvrService {
 	// Service commands
 	void isChild(u32 messagePointer);

-public:
+  public:
 	DlpSrvrService(Memory& mem) : mem(mem) {}
 	void reset();
 	void handleSyncRequest(u32 messagePointer);
--- a/include/services/dsp.hpp
+++ b/include/services/dsp.hpp
@ -14,6 +14,8 @@
 class Kernel;

 class DSPService {
+	using Handle = HorizonHandle;
+
 	Handle handle = KernelHandles::DSP;
 	Memory& mem;
 	Kernel& kernel;
--- a/include/services/fonts.hpp
+++ b/include/services/fonts.hpp
@ -0,0 +1,84 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+// Adapted from https://github.com/PabloMK7/citra/blob/master/src/core/hle/service/apt/bcfnt/bcfnt.h
+
+#pragma once
+
+#include <memory>
+
+#include "helpers.hpp"
+#include "swap.hpp"
+
+namespace HLE::Fonts {
+	struct CFNT {
+		u8 magic[4];
+		u16_le endianness;
+		u16_le headerSize;
+		u32_le version;
+		u32_le fileSize;
+		u32_le numBlocks;
+	};
+
+	struct SectionHeader {
+		u8 magic[4];
+		u32_le sectionSize;
+	};
+
+	struct FINF {
+		u8 magic[4];
+		u32_le sectionSize;
+		u8 fontType;
+		u8 lineFeed;
+		u16_le alterCharIndex;
+		u8 default_width[3];
+		u8 encoding;
+		u32_le tglpOffset;
+		u32_le cwdhOffset;
+		u32_le cmapOffset;
+		u8 height;
+		u8 width;
+		u8 ascent;
+		u8 reserved;
+	};
+
+	struct TGLP {
+		u8 magic[4];
+		u32_le sectionSize;
+		u8 cellWidth;
+		u8 cellHeight;
+		u8 baselinePosition;
+		u8 maxCharacterWidth;
+		u32_le sheetSize;
+		u16_le numSheets;
+		u16_le sheetImageFormat;
+		u16_le numColumns;
+		u16_le numRows;
+		u16_le sheetWidth;
+		u16_le sheetHeight;
+		u32_le sheetDataOffset;
+	};
+
+	struct CMAP {
+		u8 magic[4];
+		u32_le sectionSize;
+		u16_le codeBegin;
+		u16_le codeEnd;
+		u16_le mappingMethod;
+		u16_le reserved;
+		u32_le nextCmapOffset;
+	};
+
+	struct CWDH {
+		u8 magic[4];
+		u32_le sectionSize;
+		u16_le startIndex;
+		u16_le endIndex;
+		u32_le nextCwdhOffset;
+	};
+
+	// Relocates the internal addresses of the BCFNT Shared Font to the new base. The current base will
+	// be auto-detected based on the file headers.
+	void relocateSharedFont(u8* sharedFont, u32 newAddress);
+}  // namespace HLE::Fonts
--- a/Show more
+++ b/Show more