diff --git a/.gitignore b/.gitignore
index 528462ad..817786a3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,5 +64,9 @@ fb.bat
 *.elf
 *.smdh
 
+# Compiled Metal shader files
+*.ir
+*.metallib
+
 config.toml
 CMakeSettings.json
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2865a3f8..31fdd9f2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,16 +26,17 @@ endif()
 
 if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format-nonliteral -Wno-format-security")
-endif() 
+endif()
 
 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-interference-size")
-endif() 
+endif()
 
 option(DISABLE_PANIC_DEV "Make a build with fewer and less intrusive asserts" ON)
 option(GPU_DEBUG_INFO "Enable additional GPU debugging info" OFF)
 option(ENABLE_OPENGL "Enable OpenGL rendering backend" ON)
 option(ENABLE_VULKAN "Enable Vulkan rendering backend" ON)
+option(ENABLE_METAL "Enable Metal rendering backend (if available)" ON)
 option(ENABLE_LTO "Enable link-time optimization" OFF)
 option(ENABLE_TESTS "Compile unit-tests" OFF)
 option(ENABLE_USER_BUILD "Make a user-facing build. These builds have various assertions disabled, LTO, and more" OFF)
@@ -55,11 +56,6 @@ if(BUILD_LIBRETRO_CORE)
     add_compile_definitions(__LIBRETRO__)
 endif()
 
-if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND ENABLE_USER_BUILD)
-    # Disable stack buffer overflow checks in user builds
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /GS-")
-endif() 
-
 add_library(AlberCore STATIC)
 
 include_directories(${PROJECT_SOURCE_DIR}/include/)
@@ -240,7 +236,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
                  include/services/mic.hpp include/services/cecd.hpp include/services/ac.hpp
                  include/services/am.hpp include/services/boss.hpp include/services/frd.hpp include/services/nim.hpp
                  include/fs/archive_ext_save_data.hpp include/fs/archive_ncch.hpp include/services/mcu/mcu_hwc.hpp
-                 include/colour.hpp include/services/y2r.hpp include/services/cam.hpp include/services/ssl.hpp 
+                 include/colour.hpp include/services/y2r.hpp include/services/cam.hpp include/services/ssl.hpp
                  include/services/ldr_ro.hpp include/ipc.hpp include/services/act.hpp include/services/nfc.hpp
                  include/system_models.hpp include/services/dlp_srvr.hpp include/PICA/dynapica/pica_recs.hpp
                  include/PICA/dynapica/x64_regs.hpp include/PICA/dynapica/vertex_loader_rec.hpp include/PICA/dynapica/shader_rec.hpp
@@ -251,7 +247,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
                  include/config.hpp include/services/ir_user.hpp include/http_server.hpp include/cheats.hpp
                  include/action_replay.hpp include/renderer_sw/renderer_sw.hpp include/compiler_builtins.hpp
                  include/fs/romfs.hpp include/fs/ivfc.hpp include/discord_rpc.hpp include/services/http.hpp include/result/result_cfg.hpp
-                 include/applets/applet.hpp include/applets/mii_selector.hpp include/math_util.hpp include/services/soc.hpp 
+                 include/applets/applet.hpp include/applets/mii_selector.hpp include/math_util.hpp include/services/soc.hpp
                  include/services/news_u.hpp include/applets/software_keyboard.hpp include/applets/applet_manager.hpp include/fs/archive_user_save_data.hpp
                  include/services/amiibo_device.hpp include/services/nfc_types.hpp include/swap.hpp include/services/csnd.hpp include/services/nwm_uds.hpp
                  include/fs/archive_system_save_data.hpp include/lua_manager.hpp include/memory_mapped_file.hpp include/hydra_icon.hpp
@@ -260,7 +256,6 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
                  include/audio/miniaudio_device.hpp include/ring_buffer.hpp include/bitfield.hpp include/audio/dsp_shared_mem.hpp
                  include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp
                  include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
-                 include/sdl_gyro.hpp
 )
 
 cmrc_add_resource_library(
@@ -418,8 +413,75 @@ if(ENABLE_VULKAN)
     target_link_libraries(AlberCore PRIVATE Vulkan::Vulkan resources_renderer_vk)
 endif()
 
+if(ENABLE_METAL AND APPLE)
+    set(RENDERER_MTL_INCLUDE_FILES include/renderer_mtl/renderer_mtl.hpp
+        include/renderer_mtl/mtl_depth_stencil_cache.hpp
+        include/renderer_mtl/mtl_blit_pipeline_cache.hpp
+        include/renderer_mtl/mtl_draw_pipeline_cache.hpp
+        include/renderer_mtl/mtl_render_target.hpp
+        include/renderer_mtl/mtl_texture.hpp
+        include/renderer_mtl/mtl_vertex_buffer_cache.hpp
+        include/renderer_mtl/pica_to_mtl.hpp
+        include/renderer_mtl/objc_helper.hpp
+    )
+
+    set(RENDERER_MTL_SOURCE_FILES src/core/renderer_mtl/metal_cpp_impl.cpp
+        src/core/renderer_mtl/renderer_mtl.cpp
+        src/core/renderer_mtl/mtl_texture.cpp
+        src/core/renderer_mtl/mtl_etc1.cpp
+        src/core/renderer_mtl/objc_helper.mm
+        src/host_shaders/metal_shaders.metal
+        src/host_shaders/metal_copy_to_lut_texture.metal
+    )
+
+    set(HEADER_FILES ${HEADER_FILES} ${RENDERER_MTL_INCLUDE_FILES})
+    source_group("Source Files\\Core\\Metal Renderer" FILES ${RENDERER_MTL_SOURCE_FILES})
+
+    set(RENDERER_MTL_HOST_SHADERS_SOURCES)
+    # Adds rules compiling src/host_shaders/<SHADER>.metal -> .ir -> .metallib and appends the .metallib path to
+    # RENDERER_MTL_HOST_SHADERS_SOURCES in the caller's scope (function() has its own scope, hence PARENT_SCOPE).
+    function (add_metal_shader SHADER)
+        set(SHADER_SOURCE "${CMAKE_SOURCE_DIR}/src/host_shaders/${SHADER}.metal")
+        set(SHADER_IR "${CMAKE_SOURCE_DIR}/src/host_shaders/${SHADER}.ir")
+        set(SHADER_METALLIB "${CMAKE_SOURCE_DIR}/src/host_shaders/${SHADER}.metallib")
+        # TODO: only include sources in debug builds
+        add_custom_command(
+            OUTPUT "${SHADER_IR}"
+            COMMAND xcrun -sdk macosx metal -gline-tables-only -frecord-sources -o "${SHADER_IR}" -c "${SHADER_SOURCE}"
+            DEPENDS "${SHADER_SOURCE}" VERBATIM)
+        add_custom_command(
+            OUTPUT "${SHADER_METALLIB}"
+            COMMAND xcrun -sdk macosx metallib -o "${SHADER_METALLIB}" "${SHADER_IR}"
+            DEPENDS "${SHADER_IR}" VERBATIM)
+        set(RENDERER_MTL_HOST_SHADERS_SOURCES ${RENDERER_MTL_HOST_SHADERS_SOURCES} "${SHADER_METALLIB}" PARENT_SCOPE)
+    endfunction()
+
+    add_metal_shader(metal_shaders)
+    add_metal_shader(metal_copy_to_lut_texture)
+
+    add_custom_target(
+        compile_msl_shaders
+        DEPENDS ${RENDERER_MTL_HOST_SHADERS_SOURCES} # Intended to list the .metallib files registered through add_metal_shader
+    )
+
+    cmrc_add_resource_library(
+        resources_renderer_mtl
+        NAMESPACE RendererMTL
+        WHENCE "src/host_shaders/"
+        "src/host_shaders/metal_shaders.metallib"
+        "src/host_shaders/metal_copy_to_lut_texture.metallib"
+    )
+    add_dependencies(resources_renderer_mtl compile_msl_shaders) # Build compile_msl_shaders before the embedded-resource library
+
+    target_sources(AlberCore PRIVATE ${RENDERER_MTL_SOURCE_FILES})
+    target_compile_definitions(AlberCore PUBLIC "PANDA3DS_ENABLE_METAL=1") # PUBLIC: also defined for targets that link AlberCore
+    target_include_directories(AlberCore PRIVATE third_party/metal-cpp) # Presumably where <Metal/Metal.hpp> comes from - TODO confirm
+    # TODO: check if all of them are needed
+    target_link_libraries(AlberCore PRIVATE "-framework Metal" "-framework Foundation" "-framework QuartzCore" resources_renderer_mtl)
+endif()
+
 source_group("Header Files\\Core" FILES ${HEADER_FILES})
-set(ALL_SOURCES ${SOURCE_FILES} ${FS_SOURCE_FILES} ${CRYPTO_SOURCE_FILES} ${KERNEL_SOURCE_FILES} 
+set(ALL_SOURCES ${SOURCE_FILES} ${FS_SOURCE_FILES} ${CRYPTO_SOURCE_FILES} ${KERNEL_SOURCE_FILES}
     ${LOADER_SOURCE_FILES} ${SERVICE_SOURCE_FILES} ${APPLET_SOURCE_FILES} ${RENDERER_SW_SOURCE_FILES} ${PICA_SOURCE_FILES} ${THIRD_PARTY_SOURCE_FILES}
     ${AUDIO_SOURCE_FILES} ${HEADER_FILES} ${FRONTEND_HEADER_FILES})
 target_sources(AlberCore PRIVATE ${ALL_SOURCES})
@@ -508,7 +570,7 @@ if(NOT BUILD_HYDRA_CORE AND NOT BUILD_LIBRETRO_CORE)
         )
     else()
         set(FRONTEND_SOURCE_FILES src/panda_sdl/main.cpp src/panda_sdl/frontend_sdl.cpp src/panda_sdl/mappings.cpp)
-        set(FRONTEND_HEADER_FILES "include/panda_sdl/frontend_sdl.hpp")
+        set(FRONTEND_HEADER_FILES "")
     endif()
 
     target_link_libraries(Alber PRIVATE AlberCore)
diff --git a/include/renderer_mtl/mtl_blit_pipeline_cache.hpp b/include/renderer_mtl/mtl_blit_pipeline_cache.hpp
new file mode 100644
index 00000000..26422635
--- /dev/null
+++ b/include/renderer_mtl/mtl_blit_pipeline_cache.hpp
@@ -0,0 +1,75 @@
+#pragma once
+
+#include <map>
+
+#include "pica_to_mtl.hpp"
+
+using namespace PICA;
+
+namespace Metal {
+
+struct BlitPipelineHash {
+    // Attachment formats that select a blit pipeline variant in BlitPipelineCache
+    ColorFmt colorFmt; // Pixel format used for color attachment 0
+    DepthFmt depthFmt; // Pixel format used for the depth attachment
+};
+
+// Caches the blit render pipeline states, one per combination of color and depth attachment formats
+class BlitPipelineCache {
+public:
+    BlitPipelineCache() = default;
+
+    // Releases the cached pipelines and, if set() was called, the two functions it was given
+    ~BlitPipelineCache() {
+        reset();
+        if (vertexFunction) vertexFunction->release();
+        if (fragmentFunction) fragmentFunction->release();
+    }
+
+    // Stores the device and the blit vertex/fragment functions; the destructor releases both functions
+    void set(MTL::Device* dev, MTL::Function* vert, MTL::Function* frag) {
+        device = dev;
+        vertexFunction = vert;
+        fragmentFunction = frag;
+    }
+
+    // Returns the pipeline for the given attachment formats, creating and caching it on first use
+    MTL::RenderPipelineState* get(BlitPipelineHash hash) {
+        u8 intHash = ((u8)hash.colorFmt << 3) | (u8)hash.depthFmt;
+        auto& pipeline = pipelineCache[intHash];
+        if (!pipeline) {
+            MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init();
+            desc->setVertexFunction(vertexFunction);
+            desc->setFragmentFunction(fragmentFunction);
+            desc->setLabel(toNSString("Blit pipeline"));
+            auto colorAttachment = desc->colorAttachments()->object(0);
+            colorAttachment->setPixelFormat(toMTLPixelFormatColor(hash.colorFmt));
+            desc->setDepthAttachmentPixelFormat(toMTLPixelFormatDepth(hash.depthFmt));
+
+            NS::Error* error = nullptr;
+            pipeline = device->newRenderPipelineState(desc, &error);
+            if (error) {
+                Helpers::panic("Error creating blit pipeline state: %s", error->description()->cString(NS::ASCIIStringEncoding));
+            }
+            desc->release();
+        }
+
+        return pipeline;
+    }
+
+    // Releases every cached pipeline state and empties the cache
+    void reset() {
+        for (auto& pair : pipelineCache) {
+            if (pair.second) pair.second->release(); // Skip entries whose creation failed
+        }
+        pipelineCache.clear();
+    }
+
+private:
+    std::map<u8, MTL::RenderPipelineState*> pipelineCache; // Keyed by (colorFmt << 3) | depthFmt
+    MTL::Device* device = nullptr;
+    MTL::Function* vertexFunction = nullptr;
+    MTL::Function* fragmentFunction = nullptr;
+};
+
+} // namespace Metal
diff --git a/include/renderer_mtl/mtl_depth_stencil_cache.hpp b/include/renderer_mtl/mtl_depth_stencil_cache.hpp
new file mode 100644
index 00000000..90721b70
--- /dev/null
+++ b/include/renderer_mtl/mtl_depth_stencil_cache.hpp
@@ -0,0 +1,86 @@
+#pragma once
+
+#include <map>
+
+#include "pica_to_mtl.hpp"
+
+using namespace PICA;
+
+namespace Metal {
+
+struct DepthStencilHash {
+    bool depthStencilWrite; // Enables depth writes; when false the stencil write mask is forced to 0
+    u8 depthFunc; // PICA depth compare function, converted with toMTLCompareFunc
+    u32 stencilConfig; // Assuming getBits<offset, count>: bit 0 enable, bits 4-6 compare func, bits 8-15 write mask, bits 24-31 read mask
+    u16 stencilOpConfig; // Assuming getBits<offset, count>: bits 0-2 stencil-fail op, bits 4-6 depth-fail op, bits 8-10 pass op
+};
+
+class DepthStencilCache {
+public:
+    DepthStencilCache() = default;
+
+    ~DepthStencilCache() {
+        reset();
+    }
+
+    // Sets the device used to create depth-stencil states; must be called before get()
+    void set(MTL::Device* dev) {
+        device = dev;
+    }
+
+    // Returns the depth-stencil state for `hash`, creating and caching it on first use
+    MTL::DepthStencilState* get(DepthStencilHash hash) {
+        // Pack the fields into disjoint bit ranges: write [56], depthFunc [48-55], stencilConfig [16-47], stencilOpConfig [0-15]
+        u64 intHash = ((u64)hash.depthStencilWrite << 56) | ((u64)hash.depthFunc << 48) | ((u64)hash.stencilConfig << 16) | (u64)hash.stencilOpConfig;
+        auto& depthStencilState = depthStencilCache[intHash];
+        if (!depthStencilState) {
+            MTL::DepthStencilDescriptor* desc = MTL::DepthStencilDescriptor::alloc()->init();
+            desc->setDepthWriteEnabled(hash.depthStencilWrite);
+            desc->setDepthCompareFunction(toMTLCompareFunc(hash.depthFunc));
+
+            const bool stencilEnable = Helpers::getBit<0>(hash.stencilConfig);
+            MTL::StencilDescriptor* stencilDesc = nullptr;
+            if (stencilEnable) {
+                const u8 stencilFunc = Helpers::getBits<4, 3>(hash.stencilConfig);
+                const u8 stencilRefMask = Helpers::getBits<24, 8>(hash.stencilConfig);
+                // Stencil writes are masked out entirely when depth/stencil writes are disabled
+                const u32 stencilBufferMask = hash.depthStencilWrite ? Helpers::getBits<8, 8>(hash.stencilConfig) : 0;
+
+                const u8 stencilFailOp = Helpers::getBits<0, 3>(hash.stencilOpConfig);
+                const u8 depthFailOp = Helpers::getBits<4, 3>(hash.stencilOpConfig);
+                const u8 passOp = Helpers::getBits<8, 3>(hash.stencilOpConfig);
+
+                stencilDesc = MTL::StencilDescriptor::alloc()->init();
+                stencilDesc->setStencilFailureOperation(toMTLStencilOperation(stencilFailOp));
+                stencilDesc->setDepthFailureOperation(toMTLStencilOperation(depthFailOp));
+                stencilDesc->setDepthStencilPassOperation(toMTLStencilOperation(passOp));
+                stencilDesc->setStencilCompareFunction(toMTLCompareFunc(stencilFunc));
+                stencilDesc->setReadMask(stencilRefMask);
+                stencilDesc->setWriteMask(stencilBufferMask);
+
+                desc->setFrontFaceStencil(stencilDesc);
+                desc->setBackFaceStencil(stencilDesc);
+            }
+
+            depthStencilState = device->newDepthStencilState(desc);
+            desc->release();
+            if (stencilDesc) stencilDesc->release();
+        }
+
+        return depthStencilState;
+    }
+
+    // Releases every cached state and empties the cache
+    void reset() {
+        for (auto& pair : depthStencilCache) {
+            if (pair.second) pair.second->release();
+        }
+        depthStencilCache.clear();
+    }
+
+private:
+    std::map<u64, MTL::DepthStencilState*> depthStencilCache;
+    MTL::Device* device = nullptr;
+};
+
+} // namespace Metal
diff --git a/include/renderer_mtl/mtl_draw_pipeline_cache.hpp b/include/renderer_mtl/mtl_draw_pipeline_cache.hpp
new file mode 100644
index 00000000..8bfea636
--- /dev/null
+++ b/include/renderer_mtl/mtl_draw_pipeline_cache.hpp
@@ -0,0 +1,174 @@
+#pragma once
+
+#include <map>
+
+#include "pica_to_mtl.hpp"
+
+using namespace PICA;
+
+namespace Metal {
+
+// Key identifying one specialization of the draw fragment function: DrawPipelineCache passes
+// these fields to the shader as function constants.
+struct DrawFragmentFunctionHash {
+    bool lightingEnabled; // 1 bit
+    u8 lightingNumLights; // 3 bits
+    u32 lightingConfig1; // 32 bits (TODO: check this)
+    //                                 |   ref    | func |  on  |
+    u16 alphaControl; // 12 bits (mask:  11111111   0111   0001)
+};
+
+// Strict weak ordering so that DrawFragmentFunctionHash can be used as a std::map key.
+// Fields are compared lexicographically: a later field is only consulted when all earlier
+// fields are equal. (Returning true as soon as *any* single field is smaller is not a valid
+// ordering: both a < b and b < a can hold, which violates std::map's Compare requirements.
+// Example: {numLights = 1, config1 = 5} and {numLights = 2, config1 = 3} would each compare
+// less than the other, because one wins on numLights and the other on config1.)
+inline bool operator<(const DrawFragmentFunctionHash& l, const DrawFragmentFunctionHash& r) {
+    if (l.lightingEnabled != r.lightingEnabled) return r.lightingEnabled; // false orders before true
+    if (l.lightingNumLights != r.lightingNumLights) return l.lightingNumLights < r.lightingNumLights;
+    if (l.lightingConfig1 != r.lightingConfig1) return l.lightingConfig1 < r.lightingConfig1;
+
+    return l.alphaControl < r.alphaControl;
+}
+
+// Key identifying one cached draw pipeline: attachment formats, blend state, colour write mask
+// and the fragment function specialization.
+struct DrawPipelineHash { // 56 bits
+    // Formats
+    ColorFmt colorFmt; // 3 bits
+    DepthFmt depthFmt; // 3 bits
+
+    // Blending
+    bool blendEnabled; // 1 bit
+    //                                 |    functions     |   aeq    |   ceq    |
+    u32 blendControl; // 22 bits (mask:  1111111111111111   00000111   00000111)
+    u8 colorWriteMask; // 4 bits
+
+    DrawFragmentFunctionHash fragHash;
+};
+
+// Strict weak ordering so that DrawPipelineHash can be used as a std::map key. Uses the same
+// lexicographic scheme as the DrawFragmentFunctionHash comparison, with fragHash as the last field.
+inline bool operator<(const DrawPipelineHash& l, const DrawPipelineHash& r) {
+    if (l.colorFmt != r.colorFmt) return (u32)l.colorFmt < (u32)r.colorFmt;
+    if (l.depthFmt != r.depthFmt) return (u32)l.depthFmt < (u32)r.depthFmt;
+    if (l.blendEnabled != r.blendEnabled) return r.blendEnabled; // false orders before true
+    if (l.blendControl != r.blendControl) return l.blendControl < r.blendControl;
+    if (l.colorWriteMask != r.colorWriteMask) return l.colorWriteMask < r.colorWriteMask;
+
+    // All other fields are equal: the fragment hash decides
+    return l.fragHash < r.fragHash;
+}
+
+// Bind the vertex buffer to binding 30 so that it doesn't occupy the lower indices
+#define VERTEX_BUFFER_BINDING_INDEX 30
+
+// Caches draw render pipeline states (keyed by DrawPipelineHash) and the specialized fragment functions they use
+class DrawPipelineCache {
+public:
+    DrawPipelineCache() = default;
+
+    // Releases the cached objects and, if set() was called, the vertex descriptor and vertex function it was given
+    ~DrawPipelineCache() {
+        reset();
+        if (vertexDescriptor) vertexDescriptor->release();
+        if (vertexFunction) vertexFunction->release();
+    }
+
+    // Stores the objects used to build pipelines. The destructor releases vert and vertDesc, but not dev or lib
+    void set(MTL::Device* dev, MTL::Library* lib, MTL::Function* vert, MTL::VertexDescriptor* vertDesc) {
+        device = dev;
+        library = lib;
+        vertexFunction = vert;
+        vertexDescriptor = vertDesc;
+    }
+
+    // Returns the pipeline for `hash`, creating and caching the fragment function and pipeline state on first use
+    MTL::RenderPipelineState* get(DrawPipelineHash hash) {
+        auto& pipeline = pipelineCache[hash];
+        if (!pipeline) {
+            auto& fragmentFunction = fragmentFunctionCache[hash.fragHash];
+            if (!fragmentFunction) {
+                // Specialize "fragmentDraw" by passing the fragment hash fields as function constants 0-3
+                MTL::FunctionConstantValues* constants = MTL::FunctionConstantValues::alloc()->init();
+                constants->setConstantValue(&hash.fragHash.lightingEnabled, MTL::DataTypeBool, NS::UInteger(0));
+                constants->setConstantValue(&hash.fragHash.lightingNumLights, MTL::DataTypeUChar, NS::UInteger(1));
+                constants->setConstantValue(&hash.fragHash.lightingConfig1, MTL::DataTypeUInt, NS::UInteger(2));
+                constants->setConstantValue(&hash.fragHash.alphaControl, MTL::DataTypeUShort, NS::UInteger(3));
+                NS::Error* error = nullptr;
+                fragmentFunction = library->newFunction(NS::String::string("fragmentDraw", NS::ASCIIStringEncoding), constants, &error);
+                if (error) {
+                    Helpers::panic("Error creating draw fragment function: %s", error->description()->cString(NS::ASCIIStringEncoding));
+                }
+                constants->release();
+            }
+
+            MTL::RenderPipelineDescriptor* desc = MTL::RenderPipelineDescriptor::alloc()->init();
+            desc->setVertexFunction(vertexFunction);
+            desc->setFragmentFunction(fragmentFunction);
+            desc->setVertexDescriptor(vertexDescriptor);
+            desc->setLabel(toNSString("Draw pipeline"));
+
+            auto colorAttachment = desc->colorAttachments()->object(0);
+            colorAttachment->setPixelFormat(toMTLPixelFormatColor(hash.colorFmt));
+            // The low 4 bits of colorWriteMask enable the R, G, B and A channels respectively
+            MTL::ColorWriteMask writeMask = 0;
+            if (hash.colorWriteMask & 0x1) writeMask |= MTL::ColorWriteMaskRed;
+            if (hash.colorWriteMask & 0x2) writeMask |= MTL::ColorWriteMaskGreen;
+            if (hash.colorWriteMask & 0x4) writeMask |= MTL::ColorWriteMaskBlue;
+            if (hash.colorWriteMask & 0x8) writeMask |= MTL::ColorWriteMaskAlpha;
+            colorAttachment->setWriteMask(writeMask);
+            if (hash.blendEnabled) {
+                const u8 rgbEquation = hash.blendControl & 0x7;
+                const u8 alphaEquation = Helpers::getBits<8, 3>(hash.blendControl);
+
+                // Get blending functions
+                const u8 rgbSourceFunc = Helpers::getBits<16, 4>(hash.blendControl);
+                const u8 rgbDestFunc = Helpers::getBits<20, 4>(hash.blendControl);
+                const u8 alphaSourceFunc = Helpers::getBits<24, 4>(hash.blendControl);
+                const u8 alphaDestFunc = Helpers::getBits<28, 4>(hash.blendControl);
+
+                colorAttachment->setBlendingEnabled(true);
+                colorAttachment->setRgbBlendOperation(toMTLBlendOperation(rgbEquation));
+                colorAttachment->setAlphaBlendOperation(toMTLBlendOperation(alphaEquation));
+                colorAttachment->setSourceRGBBlendFactor(toMTLBlendFactor(rgbSourceFunc));
+                colorAttachment->setDestinationRGBBlendFactor(toMTLBlendFactor(rgbDestFunc));
+                colorAttachment->setSourceAlphaBlendFactor(toMTLBlendFactor(alphaSourceFunc));
+                colorAttachment->setDestinationAlphaBlendFactor(toMTLBlendFactor(alphaDestFunc));
+            }
+
+            desc->setDepthAttachmentPixelFormat(toMTLPixelFormatDepth(hash.depthFmt));
+            NS::Error* error = nullptr;
+            pipeline = device->newRenderPipelineState(desc, &error);
+            if (error) {
+                Helpers::panic("Error creating draw pipeline state: %s", error->description()->cString(NS::ASCIIStringEncoding));
+            }
+            desc->release();
+        }
+
+        return pipeline;
+    }
+
+    // Releases every cached pipeline state and fragment function and empties both caches
+    void reset() {
+        for (auto& pair : pipelineCache) {
+            if (pair.second) pair.second->release();
+        }
+        pipelineCache.clear();
+        for (auto& pair : fragmentFunctionCache) {
+            if (pair.second) pair.second->release();
+        }
+        fragmentFunctionCache.clear();
+    }
+
+private:
+    std::map<DrawPipelineHash, MTL::RenderPipelineState*> pipelineCache;
+    std::map<DrawFragmentFunctionHash, MTL::Function*> fragmentFunctionCache;
+    MTL::Device* device = nullptr;
+    MTL::Library* library = nullptr;
+    MTL::Function* vertexFunction = nullptr;
+    MTL::VertexDescriptor* vertexDescriptor = nullptr;
+};
+
+} // namespace Metal
diff --git a/include/renderer_mtl/mtl_render_target.hpp b/include/renderer_mtl/mtl_render_target.hpp
new file mode 100644
index 00000000..73be45f4
--- /dev/null
+++ b/include/renderer_mtl/mtl_render_target.hpp
@@ -0,0 +1,92 @@
+#pragma once
+#include <array>
+#include <string>
+#include <Metal/Metal.hpp>
+#include "boost/icl/interval.hpp"
+#include "helpers.hpp"
+#include "math_util.hpp"
+#include "opengl.hpp"
+#include "pica_to_mtl.hpp"
+#include "objc_helper.hpp"
+
+template <typename T>
+using Interval = boost::icl::right_open_interval<T>;
+
+namespace Metal {
+
+template <typename Format_t>
+struct RenderTarget {
+    MTL::Device* device = nullptr;
+    u32 location;
+    Format_t format;
+    OpenGL::uvec2 size;
+    bool valid;
+
+    // Range of VRAM taken up by buffer
+    Interval<u32> range;
+    MTL::Texture* texture = nullptr;
+
+    RenderTarget() : valid(false) {}
+
+    RenderTarget(MTL::Device* dev, u32 loc, Format_t format, u32 x, u32 y, bool valid = true)
+        : device(dev), location(loc), format(format), size({x, y}), valid(valid) {
+        u64 endLoc = (u64)loc + sizeInBytes();
+        // Check if start and end are valid here
+        range = Interval<u32>(loc, (u32)endLoc);
+    }
+
+    // Maps an address inside this target to a width x height rect, treating memory as rows of 8-pixel-high tiles; Y is measured from size.y()
+    Math::Rect<u32> getSubRect(u32 inputAddress, u32 width, u32 height) {
+        const u32 startOffset = (inputAddress - location) / sizePerPixel(format);
+        const u32 x0 = (startOffset % (size.x() * 8)) / 8;
+        const u32 y0 = (startOffset / (size.x() * 8)) * 8;
+        return Math::Rect<u32>{x0, size.y() - y0, x0 + width, size.y() - height - y0};
+    }
+
+    // For 2 textures to "match" we only care about their locations, formats, and dimensions to match
+    // For other things, such as filtering mode, etc, we can just switch the attributes of the cached texture
+    bool matches(RenderTarget& other) {
+        return location == other.location && format == other.format &&
+            size.x() == other.size.x() && size.y() == other.size.y();
+    }
+
+    // Creates the private-storage Metal texture backing this target (usable as render target and shader input)
+    void allocate() {
+        MTL::PixelFormat pixelFormat = MTL::PixelFormatInvalid;
+        if (std::is_same<Format_t, PICA::ColorFmt>::value) {
+            pixelFormat = PICA::toMTLPixelFormatColor((PICA::ColorFmt)format);
+        } else if (std::is_same<Format_t, PICA::DepthFmt>::value) {
+            pixelFormat = PICA::toMTLPixelFormatDepth((PICA::DepthFmt)format);
+        } else {
+            Helpers::panic("Invalid format type");
+        }
+        MTL::TextureDescriptor* descriptor = MTL::TextureDescriptor::alloc()->init();
+        descriptor->setTextureType(MTL::TextureType2D);
+        descriptor->setPixelFormat(pixelFormat);
+        descriptor->setWidth(size.u());
+        descriptor->setHeight(size.v());
+        descriptor->setUsage(MTL::TextureUsageRenderTarget | MTL::TextureUsageShaderRead);
+        descriptor->setStorageMode(MTL::StorageModePrivate);
+        texture = device->newTexture(descriptor);
+        texture->setLabel(toNSString(std::string(std::is_same<Format_t, PICA::ColorFmt>::value ? "Color" : "Depth") + " render target " + std::to_string(size.u()) + "x" + std::to_string(size.v())));
+        descriptor->release();
+    }
+
+    // Marks the target invalid and releases its texture; safe to call more than once
+    void free() {
+        valid = false;
+        if (texture) {
+            texture->release();
+            texture = nullptr; // Avoid a dangling pointer and a double release on a second free()
+        }
+    }
+
+    u64 sizeInBytes() {
+        return (size_t)size.x() * (size_t)size.y() * PICA::sizePerPixel(format);
+    }
+};
+
+typedef RenderTarget<PICA::ColorFmt> ColorRenderTarget;
+typedef RenderTarget<PICA::DepthFmt> DepthStencilRenderTarget;
+
+} // namespace Metal
diff --git a/include/renderer_mtl/mtl_texture.hpp b/include/renderer_mtl/mtl_texture.hpp
new file mode 100644
index 00000000..590132bd
--- /dev/null
+++ b/include/renderer_mtl/mtl_texture.hpp
@@ -0,0 +1,77 @@
+#pragma once
+#include <array>
+#include <string>
+#include <Metal/Metal.hpp>
+#include "PICA/regs.hpp"
+#include "boost/icl/interval.hpp"
+#include "helpers.hpp"
+#include "math_util.hpp"
+#include "opengl.hpp"
+#include "renderer_mtl/pica_to_mtl.hpp"
+
+template <typename T>
+using Interval = boost::icl::right_open_interval<T>;
+
+namespace Metal {
+
+struct Texture {
+    MTL::Device* device = nullptr;
+
+    u32 location;
+    u32 config; // Magnification/minification filter, wrapping configs, etc
+    PICA::TextureFmt format;
+    OpenGL::uvec2 size;
+    bool valid;
+
+    // Range of VRAM taken up by buffer
+    Interval<u32> range;
+
+    PICA::PixelFormatInfo formatInfo;
+    MTL::Texture* texture = nullptr;
+    MTL::SamplerState* sampler = nullptr;
+
+    Texture() : valid(false) {}
+
+    // Members are always initialized in declaration order; the initializer list mirrors it to avoid -Wreorder
+    Texture(MTL::Device* dev, u32 loc, PICA::TextureFmt format, u32 x, u32 y, u32 config, bool valid = true)
+        : device(dev), location(loc), config(config), format(format), size({x, y}), valid(valid) {
+        u64 endLoc = (u64)loc + sizeInBytes();
+        // Check if start and end are valid here
+        range = Interval<u32>(loc, (u32)endLoc);
+    }
+
+    // For 2 textures to "match" we only care about their locations, formats, and dimensions to match
+    // For other things, such as filtering mode, etc, we can just switch the attributes of the cached texture
+    bool matches(Texture& other) {
+        return location == other.location && format == other.format &&
+            size.x() == other.size.x() && size.y() == other.size.y();
+    }
+
+    void allocate();
+    void setNewConfig(u32 newConfig);
+    void decodeTexture(std::span<const u8> data);
+    void free();
+    u64 sizeInBytes();
+
+    u8 decodeTexelU8(u32 u, u32 v, PICA::TextureFmt fmt, std::span<const u8> data);
+    u16 decodeTexelU16(u32 u, u32 v, PICA::TextureFmt fmt, std::span<const u8> data);
+    u32 decodeTexelU32(u32 u, u32 v, PICA::TextureFmt fmt, std::span<const u8> data);
+
+    // Get the morton interleave offset of a texel based on its U and V values
+    static u32 mortonInterleave(u32 u, u32 v);
+    // Get the byte offset of texel (u, v) in the texture
+    static u32 getSwizzledOffset(u32 u, u32 v, u32 width, u32 bytesPerPixel);
+    static u32 getSwizzledOffset_4bpp(u32 u, u32 v, u32 width);
+
+    // Returns the format of this texture as a string
+    std::string_view formatToString() {
+        return PICA::textureFormatToString(format);
+    }
+
+    // Returns the texel at coordinates (u, v) of an ETC1(A4) texture
+    // TODO: Make hasAlpha a template parameter
+    u32 getTexelETC(bool hasAlpha, u32 u, u32 v, u32 width, std::span<const u8> data);
+    u32 decodeETC(u32 alpha, u32 u, u32 v, u64 colourData);
+};
+
+} // namespace Metal
diff --git a/include/renderer_mtl/mtl_vertex_buffer_cache.hpp b/include/renderer_mtl/mtl_vertex_buffer_cache.hpp
new file mode 100644
index 00000000..1760cdfa
--- /dev/null
+++ b/include/renderer_mtl/mtl_vertex_buffer_cache.hpp
@@ -0,0 +1,80 @@
+#pragma once
+
+#include "pica_to_mtl.hpp"
+
+using namespace PICA;
+
+namespace Metal {
+
+struct BufferHandle {
+    MTL::Buffer* buffer; // Buffer that holds the data
+    size_t offset; // Byte offset of the data within `buffer`
+};
+
+// 64MB buffer for caching vertex data (parenthesized so the macro expands safely inside larger expressions)
+#define CACHE_BUFFER_SIZE (64 * 1024 * 1024)
+
+class VertexBufferCache {
+public:
+    VertexBufferCache() = default;
+
+    ~VertexBufferCache() {
+        endFrame();
+        if (buffer) buffer->release(); // buffer stays null until set() is called
+    }
+
+    // Stores the device and allocates the shared cache buffer on it
+    void set(MTL::Device* dev) {
+        device = dev;
+        create();
+    }
+
+    // Rewinds the shared buffer and releases the overflow buffers; handles returned by get() must not be used afterwards
+    void endFrame() {
+        ptr = 0;
+        for (auto* allocation : additionalAllocations) {
+            allocation->release();
+        }
+        additionalAllocations.clear();
+    }
+
+    // Copies `size` bytes from `data` into GPU-visible memory and returns the buffer and offset where they were placed
+    BufferHandle get(const void* data, size_t size) {
+        // If the vertex buffer is too large, just create a new one
+        if (ptr + size > CACHE_BUFFER_SIZE) {
+            MTL::Buffer* newBuffer = device->newBuffer(data, size, MTL::ResourceStorageModeShared);
+            newBuffer->setLabel(toNSString("Additional vertex buffer"));
+            additionalAllocations.push_back(newBuffer);
+            Helpers::warn("Vertex buffer doesn't have enough space, creating a new buffer");
+            return BufferHandle{newBuffer, 0};
+        }
+        // Copy the data into the buffer
+        memcpy((char*)buffer->contents() + ptr, data, size);
+        size_t oldPtr = ptr;
+        ptr += size;
+        return BufferHandle{buffer, oldPtr};
+    }
+
+    // Drops all per-frame data and, if the shared buffer exists, replaces it with a fresh one
+    void reset() {
+        endFrame();
+        if (buffer) {
+            buffer->release();
+            create();
+        }
+    }
+
+private:
+    MTL::Buffer* buffer = nullptr;
+    size_t ptr = 0;
+    std::vector<MTL::Buffer*> additionalAllocations;
+    MTL::Device* device = nullptr;
+
+    // Allocates the CACHE_BUFFER_SIZE-byte shared-storage buffer
+    void create() {
+        buffer = device->newBuffer(CACHE_BUFFER_SIZE, MTL::ResourceStorageModeShared);
+        buffer->setLabel(toNSString("Shared vertex buffer"));
+    }
+};
+
+} // namespace Metal
diff --git a/include/renderer_mtl/objc_helper.hpp b/include/renderer_mtl/objc_helper.hpp
new file mode 100644
index 00000000..91756d24
--- /dev/null
+++ b/include/renderer_mtl/objc_helper.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <string>
+
+#include <Metal/Metal.hpp>
+
+namespace Metal {
+
+// Creates a dispatch_data_t from `size` bytes at `data`. Implementation is not visible in this header - TODO confirm whether the bytes are copied
+dispatch_data_t createDispatchData(const void* data, size_t size);
+} // namespace Metal
+
+// Converts a std::string to an NS::String*, decoding it as UTF-8 (a superset of ASCII) so non-ASCII text is not rejected
+inline NS::String* toNSString(const std::string& str) {
+    return NS::String::string(str.c_str(), NS::UTF8StringEncoding);
+}
diff --git a/include/renderer_mtl/pica_to_mtl.hpp b/include/renderer_mtl/pica_to_mtl.hpp
new file mode 100644
index 00000000..de76dc3b
--- /dev/null
+++ b/include/renderer_mtl/pica_to_mtl.hpp
@@ -0,0 +1,155 @@
+#pragma once
+
+#include <Metal/Metal.hpp>
+#include "PICA/regs.hpp"
+
+namespace PICA {
+
+// Describes how a PICA texture format is represented once decoded for Metal
+struct PixelFormatInfo {
+    MTL::PixelFormat pixelFormat; // Metal pixel format the texture is created with
+    size_t bytesPerTexel;         // Size in bytes of one decoded texel
+};
+
+// Lookup table indexed by the numeric value of TextureFmt.
+// The trailing comment on each row names the PICA format that row describes.
+constexpr PixelFormatInfo pixelFormatInfos[14] = {
+    {MTL::PixelFormatRGBA8Unorm, 4}, // RGBA8
+    {MTL::PixelFormatRGBA8Unorm, 4}, // RGB8
+    {MTL::PixelFormatBGR5A1Unorm, 2}, // RGBA5551
+    {MTL::PixelFormatB5G6R5Unorm, 2}, // RGB565
+    {MTL::PixelFormatABGR4Unorm, 2}, // RGBA4
+    {MTL::PixelFormatRGBA8Unorm, 4}, // IA8
+    {MTL::PixelFormatRG8Unorm, 2}, // RG8
+    {MTL::PixelFormatRGBA8Unorm, 4}, // I8
+    {MTL::PixelFormatA8Unorm, 1}, // A8
+    {MTL::PixelFormatABGR4Unorm, 2}, // IA4
+    {MTL::PixelFormatABGR4Unorm, 2}, // I4
+    {MTL::PixelFormatA8Unorm, 1}, // A4
+    {MTL::PixelFormatRGBA8Unorm, 4}, // ETC1
+    {MTL::PixelFormatRGBA8Unorm, 4}, // ETC1A4
+};
+
+// Returns the table entry for `format`.
+// Panics when the numeric value of `format` lies outside the table instead of reading past its end.
+inline PixelFormatInfo getPixelFormatInfo(TextureFmt format) {
+    constexpr size_t formatCount = sizeof(pixelFormatInfos) / sizeof(pixelFormatInfos[0]);
+    const size_t index = static_cast<size_t>(format);
+
+    if (index >= formatCount) {
+        Helpers::panic("Unknown texture format %zu", index);
+        return pixelFormatInfos[0]; // Fallback in case panic returns
+    }
+
+    return pixelFormatInfos[index];
+}
+
+// Maps a PICA colour buffer format to the Metal pixel format used for it.
+// Panics if `format` is not one of the ColorFmt enumerators handled below.
+inline MTL::PixelFormat toMTLPixelFormatColor(ColorFmt format) {
+    switch (format) {
+    case ColorFmt::RGBA8: return MTL::PixelFormatRGBA8Unorm;
+    case ColorFmt::RGB8: return MTL::PixelFormatRGBA8Unorm;
+    case ColorFmt::RGBA5551: return MTL::PixelFormatRGBA8Unorm; // TODO: use MTL::PixelFormatBGR5A1Unorm?
+    case ColorFmt::RGB565: return MTL::PixelFormatRGBA8Unorm; // TODO: use MTL::PixelFormatB5G6R5Unorm?
+    case ColorFmt::RGBA4: return MTL::PixelFormatABGR4Unorm;
+    }
+
+    // Only reachable for values outside the enum (e.g. a ColorFmt obtained by casting a raw integer).
+    // Without an explicit return here, control would fall off the end of a non-void function.
+    Helpers::panic("Unknown color format %u", static_cast<unsigned>(format));
+    return MTL::PixelFormatRGBA8Unorm;
+}
+
+// Maps a PICA depth buffer format to the Metal pixel format used for it.
+// DepthFmt::Unknown1 maps to MTL::PixelFormatInvalid; values outside the enum panic.
+inline MTL::PixelFormat toMTLPixelFormatDepth(DepthFmt format) {
+    switch (format) {
+    case DepthFmt::Depth16: return MTL::PixelFormatDepth16Unorm;
+    case DepthFmt::Unknown1: return MTL::PixelFormatInvalid;
+    case DepthFmt::Depth24: return MTL::PixelFormatDepth32Float; // Metal does not support 24-bit depth formats
+    // Apple silicon doesn't support 24-bit depth buffers, so we use 32-bit instead
+    case DepthFmt::Depth24Stencil8: return MTL::PixelFormatDepth32Float_Stencil8;
+    }
+
+    // Only reachable for values outside the enum; avoids falling off the end of a non-void function
+    Helpers::panic("Unknown depth format %u", static_cast<unsigned>(format));
+    return MTL::PixelFormatInvalid;
+}
+
+// Maps a PICA compare function index (0-7) to the matching Metal compare function.
+// Any other value panics; MTL::CompareFunctionAlways is returned as a fallback should panic return.
+inline MTL::CompareFunction toMTLCompareFunc(u8 func) {
+    switch (func) {
+    case 0: return MTL::CompareFunctionNever;
+    case 1: return MTL::CompareFunctionAlways;
+    case 2: return MTL::CompareFunctionEqual;
+    case 3: return MTL::CompareFunctionNotEqual;
+    case 4: return MTL::CompareFunctionLess;
+    case 5: return MTL::CompareFunctionLessEqual;
+    case 6: return MTL::CompareFunctionGreater;
+    case 7: return MTL::CompareFunctionGreaterEqual;
+    // Qualified explicitly so the project's panic helper is the one being called
+    default: Helpers::panic("Unknown compare function %u", func);
+    }
+
+    return MTL::CompareFunctionAlways;
+}
+
+// Maps a PICA blend equation index (0-7) to a Metal blend operation.
+// Indices 5-7 are treated the same as 0 (add). Any other value panics, with add as the fallback return.
+inline MTL::BlendOperation toMTLBlendOperation(u8 op) {
+    switch (op) {
+    case 0: return MTL::BlendOperationAdd;
+    case 1: return MTL::BlendOperationSubtract;
+    case 2: return MTL::BlendOperationReverseSubtract;
+    case 3: return MTL::BlendOperationMin;
+    case 4: return MTL::BlendOperationMax;
+    case 5: return MTL::BlendOperationAdd; // Unused (same as 0)
+    case 6: return MTL::BlendOperationAdd; // Unused (same as 0)
+    case 7: return MTL::BlendOperationAdd; // Unused (same as 0)
+    // Qualified explicitly so the project's panic helper is the one being called
+    default: Helpers::panic("Unknown blend operation %u", op);
+    }
+
+    return MTL::BlendOperationAdd;
+}
+
+// Maps a PICA blend factor index (0-15) to a Metal blend factor.
+// Index 15 is mapped to "one". Any other value panics, with "one" as the fallback return.
+inline MTL::BlendFactor toMTLBlendFactor(u8 factor) {
+    switch (factor) {
+    case 0: return MTL::BlendFactorZero;
+    case 1: return MTL::BlendFactorOne;
+    case 2: return MTL::BlendFactorSourceColor;
+    case 3: return MTL::BlendFactorOneMinusSourceColor;
+    case 4: return MTL::BlendFactorDestinationColor;
+    case 5: return MTL::BlendFactorOneMinusDestinationColor;
+    case 6: return MTL::BlendFactorSourceAlpha;
+    case 7: return MTL::BlendFactorOneMinusSourceAlpha;
+    case 8: return MTL::BlendFactorDestinationAlpha;
+    case 9: return MTL::BlendFactorOneMinusDestinationAlpha;
+    case 10: return MTL::BlendFactorBlendColor;
+    case 11: return MTL::BlendFactorOneMinusBlendColor;
+    case 12: return MTL::BlendFactorBlendAlpha;
+    case 13: return MTL::BlendFactorOneMinusBlendAlpha;
+    case 14: return MTL::BlendFactorSourceAlphaSaturated;
+    case 15: return MTL::BlendFactorOne; // Undocumented
+    // Qualified explicitly so the project's panic helper is the one being called
+    default: Helpers::panic("Unknown blend factor %u", factor);
+    }
+
+    return MTL::BlendFactorOne;
+}
+
+// Maps a PICA stencil operation index (0-7) to the matching Metal stencil operation.
+// Any other value panics; MTL::StencilOperationKeep is returned as a fallback should panic return.
+inline MTL::StencilOperation toMTLStencilOperation(u8 op) {
+    switch (op) {
+    case 0: return MTL::StencilOperationKeep;
+    case 1: return MTL::StencilOperationZero;
+    case 2: return MTL::StencilOperationReplace;
+    case 3: return MTL::StencilOperationIncrementClamp;
+    case 4: return MTL::StencilOperationDecrementClamp;
+    case 5: return MTL::StencilOperationInvert;
+    case 6: return MTL::StencilOperationIncrementWrap;
+    case 7: return MTL::StencilOperationDecrementWrap;
+    // Qualified explicitly so the project's panic helper is the one being called
+    default: Helpers::panic("Unknown stencil operation %u", op);
+    }
+
+    return MTL::StencilOperationKeep;
+}
+
+// Maps a PICA primitive type to the Metal primitive type used to draw it.
+// Triangle fans and geometry primitives have no direct mapping here and are drawn as plain triangles.
+inline MTL::PrimitiveType toMTLPrimitiveType(PrimType primType) {
+    switch (primType) {
+    case PrimType::TriangleList: return MTL::PrimitiveTypeTriangle;
+    case PrimType::TriangleStrip: return MTL::PrimitiveTypeTriangleStrip;
+    case PrimType::TriangleFan:
+        Helpers::warn("Triangle fans are not supported on Metal, using triangles instead");
+        return MTL::PrimitiveTypeTriangle;
+    case PrimType::GeometryPrimitive:
+        // Geometry primitives are not supported yet; triangles are used instead (intentionally without a warning)
+        return MTL::PrimitiveTypeTriangle;
+    }
+
+    // Only reachable for values outside the enum; avoids falling off the end of a non-void function
+    Helpers::panic("Unknown primitive type %u", static_cast<unsigned>(primType));
+    return MTL::PrimitiveTypeTriangle;
+}
+
+// Maps a 3-bit PICA texture wrap mode (0-7) to a Metal sampler address mode.
+// Any other value panics; clamp-to-edge is returned as a fallback should panic return.
+inline MTL::SamplerAddressMode toMTLSamplerAddressMode(u8 addrMode) {
+    switch (addrMode) {
+    case 0: return MTL::SamplerAddressModeClampToEdge;
+    case 1: return MTL::SamplerAddressModeClampToBorderColor;
+    case 2: return MTL::SamplerAddressModeRepeat;
+    case 3: return MTL::SamplerAddressModeMirrorRepeat;
+    case 4: return MTL::SamplerAddressModeClampToEdge;
+    case 5: return MTL::SamplerAddressModeClampToBorderColor;
+    case 6: return MTL::SamplerAddressModeRepeat;
+    case 7: return MTL::SamplerAddressModeRepeat;
+    // Qualified explicitly so the project's panic helper is the one being called
+    default: Helpers::panic("Unknown sampler address mode %u", addrMode);
+    }
+
+    return MTL::SamplerAddressModeClampToEdge;
+}
+
+} // namespace PICA
diff --git a/include/renderer_mtl/renderer_mtl.hpp b/include/renderer_mtl/renderer_mtl.hpp
new file mode 100644
index 00000000..9ba0937a
--- /dev/null
+++ b/include/renderer_mtl/renderer_mtl.hpp
@@ -0,0 +1,189 @@
+#include <Metal/Metal.hpp>
+#include <QuartzCore/QuartzCore.hpp>
+
+#include "renderer.hpp"
+#include "mtl_texture.hpp"
+#include "mtl_render_target.hpp"
+#include "mtl_blit_pipeline_cache.hpp"
+#include "mtl_draw_pipeline_cache.hpp"
+#include "mtl_depth_stencil_cache.hpp"
+#include "mtl_vertex_buffer_cache.hpp"
+// HACK: use the OpenGL cache
+#include "../renderer_gl/surface_cache.hpp"
+
+class GPU;
+
+// Four-component floating point colour (used as the clear colour of colour attachments)
+struct Color4 {
+    float r;
+    float g;
+    float b;
+    float a;
+};
+
+// Metal implementation of the Renderer interface.
+// Besides the overridden Renderer entry points, this class keeps the Metal objects, the resource caches and the
+// state of the command buffer / render pass that is currently being recorded.
+class RendererMTL final : public Renderer {
+  public:
+	RendererMTL(GPU& gpu, const std::array<u32, regNum>& internalRegs, const std::array<u32, extRegNum>& externalRegs);
+	~RendererMTL() override;
+
+	void reset() override;
+	void display() override;
+	void initGraphicsContext(SDL_Window* window) override;
+	void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) override;
+	void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) override;
+	void textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) override;
+	void drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) override;
+	void screenshot(const std::string& name) override;
+	void deinitGraphicsContext() override;
+
+#ifdef PANDA3DS_FRONTEND_QT
+	virtual void initGraphicsContext([[maybe_unused]] GL::Context* context) override {}
+#endif
+
+  private:
+	// The constructor does not set any of the raw pointers below, so they all default to nullptr
+	// (like the "active state" pointers further down) rather than holding indeterminate values
+	CA::MetalLayer* metalLayer = nullptr;
+
+	MTL::Device* device = nullptr;
+	MTL::CommandQueue* commandQueue = nullptr;
+
+	// Libraries
+	MTL::Library* library = nullptr;
+
+	// Caches
+	SurfaceCache<Metal::ColorRenderTarget, 16, true> colorRenderTargetCache;
+	SurfaceCache<Metal::DepthStencilRenderTarget, 16, true> depthStencilRenderTargetCache;
+	SurfaceCache<Metal::Texture, 256, true> textureCache;
+	Metal::BlitPipelineCache blitPipelineCache;
+	Metal::DrawPipelineCache drawPipelineCache;
+	Metal::DepthStencilCache depthStencilCache;
+	Metal::VertexBufferCache vertexBufferCache;
+
+	// Objects
+	MTL::SamplerState* nearestSampler = nullptr;
+	MTL::SamplerState* linearSampler = nullptr;
+	MTL::Texture* lutTexture = nullptr;
+	MTL::DepthStencilState* defaultDepthStencilState = nullptr;
+
+	// Pipelines
+	MTL::RenderPipelineState* displayPipeline = nullptr;
+	MTL::RenderPipelineState* copyToLutTexturePipeline = nullptr;
+
+	// Clears: clear values keyed by texture. An entry is looked up and erased the next time the texture
+	// is set up as an attachment through clearColor / clearDepth / clearStencil
+	std::map<MTL::Texture*, Color4> colorClearOps;
+	std::map<MTL::Texture*, float> depthClearOps;
+	std::map<MTL::Texture*, u8> stencilClearOps;
+
+	// Active state
+	MTL::CommandBuffer* commandBuffer = nullptr;
+	MTL::RenderCommandEncoder* renderCommandEncoder = nullptr;
+	MTL::Texture* lastColorTexture = nullptr;
+	MTL::Texture* lastDepthTexture = nullptr;
+
+	// Debug: label given to the next render command encoder that gets created
+	std::string nextRenderPassName;
+
+	// Requests a command buffer from the queue unless one is already being recorded
+	void createCommandBufferIfNeeded() {
+		if (!commandBuffer) {
+			commandBuffer = commandQueue->commandBuffer();
+		}
+	}
+
+	// Ends encoding on the active render command encoder (if any) and forgets it
+	void endRenderPass() {
+		if (renderCommandEncoder) {
+			renderCommandEncoder->endEncoding();
+			renderCommandEncoder = nullptr;
+		}
+	}
+
+	// Makes sure a render pass targeting the given textures is active, starting a new one from
+	// `renderPassDescriptor` when required. A new pass is started when:
+	//  - the pass performs clears, or
+	//  - no encoder is active, or
+	//  - the colour texture differs from the one used by the previous pass, or
+	//  - the depth texture differs, unless the only difference is that the previous pass had a depth
+	//    texture and this request has none.
+	// Takes ownership of `renderPassDescriptor`: it is released before returning in every case.
+	void beginRenderPassIfNeeded(MTL::RenderPassDescriptor* renderPassDescriptor, bool doesClears, MTL::Texture* colorTexture, MTL::Texture* depthTexture = nullptr) {
+		createCommandBufferIfNeeded();
+
+		if (doesClears || !renderCommandEncoder || colorTexture != lastColorTexture || (depthTexture != lastDepthTexture && !(lastDepthTexture && !depthTexture))) {
+			endRenderPass();
+
+			renderCommandEncoder = commandBuffer->renderCommandEncoder(renderPassDescriptor);
+			renderCommandEncoder->setLabel(toNSString(nextRenderPassName));
+
+			lastColorTexture = colorTexture;
+			lastDepthTexture = depthTexture;
+		}
+
+		renderPassDescriptor->release();
+	}
+
+	// Ends the active encoder (if any), commits the active command buffer (if any) and clears both pointers.
+	// NOTE(review): this path also calls release() on the encoder and the command buffer, while
+	// endRenderPass() ends the encoder without releasing it - confirm the intended ownership of these objects.
+	void commitCommandBuffer() {
+		if (renderCommandEncoder) {
+			renderCommandEncoder->endEncoding();
+			renderCommandEncoder->release();
+			renderCommandEncoder = nullptr;
+		}
+		if (commandBuffer) {
+			commandBuffer->commit();
+			commandBuffer->release();
+			commandBuffer = nullptr;
+		}
+	}
+
+	// Configures one attachment of a render pass so that `texture` is cleared to `clearData` on load and stored.
+	// `getAttachment` selects the attachment from the descriptor, `setClearData` writes the clear value into it.
+	// When `renderPassDescriptor` is null, a descriptor is created here and a render pass that only performs
+	// this clear is started right away (beginRenderPassIfNeeded releases the descriptor).
+	template<typename AttachmentT, typename ClearDataT, typename GetAttachmentT, typename SetClearDataT>
+	inline void clearAttachment(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* texture, ClearDataT clearData, GetAttachmentT getAttachment, SetClearDataT setClearData) {
+		bool beginRenderPass = (renderPassDescriptor == nullptr);
+		if (!renderPassDescriptor) {
+			renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init();
+		}
+
+		AttachmentT* attachment = getAttachment(renderPassDescriptor);
+		attachment->setTexture(texture);
+		setClearData(attachment, clearData);
+		attachment->setLoadAction(MTL::LoadActionClear);
+		attachment->setStoreAction(MTL::StoreActionStore);
+
+		if (beginRenderPass) {
+			// The attachment type is known at compile time, so pick the argument slot statically:
+			// colour attachments are tracked as the colour texture, everything else as the depth texture
+			if constexpr (std::is_same_v<AttachmentT, MTL::RenderPassColorAttachmentDescriptor>) {
+				beginRenderPassIfNeeded(renderPassDescriptor, true, texture);
+			} else {
+				beginRenderPassIfNeeded(renderPassDescriptor, true, nullptr, texture);
+			}
+		}
+	}
+
+	// Applies the clear registered for `texture` in `clearOps`, if there is one, and removes it from the map.
+	// Otherwise, when a descriptor was supplied, the attachment is set up to load and store the texture contents.
+	// Returns true if a clear was applied, false otherwise.
+	template<typename AttachmentT, typename ClearDataT, typename GetAttachmentT, typename SetClearDataT>
+	inline bool clearAttachment(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* texture, std::map<MTL::Texture*, ClearDataT>& clearOps, GetAttachmentT getAttachment, SetClearDataT setClearData) {
+		auto it = clearOps.find(texture);
+		if (it != clearOps.end()) {
+			clearAttachment<AttachmentT>(renderPassDescriptor, texture, it->second, getAttachment, setClearData);
+			clearOps.erase(it);
+			return true;
+		}
+
+		if (renderPassDescriptor) {
+			AttachmentT* attachment = getAttachment(renderPassDescriptor);
+			attachment->setTexture(texture);
+			attachment->setLoadAction(MTL::LoadActionLoad);
+			attachment->setStoreAction(MTL::StoreActionStore);
+		}
+
+		return false;
+	}
+
+	// clearAttachment for colour attachment 0, using the values stored in colorClearOps
+	bool clearColor(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* texture) {
+		return clearAttachment<MTL::RenderPassColorAttachmentDescriptor, Color4>(renderPassDescriptor, texture, colorClearOps, [](MTL::RenderPassDescriptor* renderPassDescriptor) { return renderPassDescriptor->colorAttachments()->object(0); }, [](auto attachment, auto& color) {
+			attachment->setClearColor(MTL::ClearColor(color.r, color.g, color.b, color.a));
+		});
+	}
+
+	// clearAttachment for the depth attachment, using the values stored in depthClearOps
+	bool clearDepth(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* texture) {
+		return clearAttachment<MTL::RenderPassDepthAttachmentDescriptor, float>(renderPassDescriptor, texture, depthClearOps, [](MTL::RenderPassDescriptor* renderPassDescriptor) { return renderPassDescriptor->depthAttachment(); }, [](auto attachment, auto& depth) {
+			attachment->setClearDepth(depth);
+		});
+	}
+
+	// clearAttachment for the stencil attachment, using the values stored in stencilClearOps
+	bool clearStencil(MTL::RenderPassDescriptor* renderPassDescriptor, MTL::Texture* texture) {
+		return clearAttachment<MTL::RenderPassStencilAttachmentDescriptor, u8>(renderPassDescriptor, texture, stencilClearOps, [](MTL::RenderPassDescriptor* renderPassDescriptor) { return renderPassDescriptor->stencilAttachment(); }, [](auto attachment, auto& stencil) {
+			attachment->setClearStencil(stencil);
+		});
+	}
+
+	std::optional<Metal::ColorRenderTarget> getColorRenderTarget(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true);
+	Metal::DepthStencilRenderTarget& getDepthRenderTarget();
+	Metal::Texture& getTexture(Metal::Texture& tex);
+	void setupTextureEnvState(MTL::RenderCommandEncoder* encoder);
+	void bindTexturesToSlots(MTL::RenderCommandEncoder* encoder);
+	void updateLightingLUT(MTL::RenderCommandEncoder* encoder);
+	void updateFogLUT(MTL::RenderCommandEncoder* encoder);
+	void textureCopyImpl(Metal::ColorRenderTarget& srcFramebuffer, Metal::ColorRenderTarget& destFramebuffer, const Math::Rect<u32>& srcRect, const Math::Rect<u32>& destRect);
+};
diff --git a/src/core/renderer_mtl/metal_cpp_impl.cpp b/src/core/renderer_mtl/metal_cpp_impl.cpp
new file mode 100644
index 00000000..7fa7137b
--- /dev/null
+++ b/src/core/renderer_mtl/metal_cpp_impl.cpp
@@ -0,0 +1,6 @@
+#define NS_PRIVATE_IMPLEMENTATION
+#define CA_PRIVATE_IMPLEMENTATION
+#define MTL_PRIVATE_IMPLEMENTATION
+#include <Foundation/Foundation.hpp>
+#include <Metal/Metal.hpp>
+#include <QuartzCore/QuartzCore.hpp>
diff --git a/src/core/renderer_mtl/mtl_etc1.cpp b/src/core/renderer_mtl/mtl_etc1.cpp
new file mode 100644
index 00000000..a414df3c
--- /dev/null
+++ b/src/core/renderer_mtl/mtl_etc1.cpp
@@ -0,0 +1,124 @@
+#include <algorithm>
+#include "colour.hpp"
+#include "renderer_mtl/renderer_mtl.hpp"
+#include "renderer_mtl/mtl_texture.hpp"
+
+using namespace Helpers;
+
+namespace Metal {
+
+// Sign-extends the 3-bit two's complement value held in the low bits of `val` to 32 bits
+static constexpr u32 signExtend3To32(u32 val) {
+    const s32 shiftedUp = s32(val) << 29; // Moves bit 2 (the sign bit of the 3-bit value) up to bit 31
+    return (u32)(shiftedUp >> 29);        // Shifting the signed value back down replicates the sign bit
+}
+
+// Fetches texel (u, v) of an ETC1 (hasAlpha == false) or ETC1A4 (hasAlpha == true) texture of the given width.
+// Locates the 4x4 block containing the texel inside `data`, reads its alpha (0xff when there is no alpha data)
+// and hands the 64-bit colour word of the block to decodeETC.
+u32 Texture::getTexelETC(bool hasAlpha, u32 u, u32 v, u32 width, std::span<const u8> data) {
+    // Pixel offset of the 8x8 tile the texel belongs to, based on u, v and the texture width
+    const u32 tilePixelOffset = ((u & ~7) * 8) + ((v & ~7) * width);
+    // Without alpha data the pixel offset is halved to obtain the offset into the data
+    u32 blockOffset = hasAlpha ? tilePixelOffset : (tilePixelOffset >> 1);
+
+    // Coordinates of the texel inside its 8x8 tile
+    const u32 tileU = u & 7;
+    const u32 tileV = v & 7;
+
+    // Each 8x8 tile is made of four 4x4 subtiles. A subtile takes 8 bytes for ETC1 and 16 bytes for ETC1A4,
+    // since ETC1A4 additionally stores 4 alpha bits per pixel
+    const u32 subTileIndex = (tileU / 4) + 2 * (tileV / 4);
+    blockOffset += (hasAlpha ? 16 : 8) * subTileIndex;
+
+    // Coordinates of the texel inside its 4x4 subtile
+    const u32 blockU = tileU & 3;
+    const u32 blockV = tileV & 3;
+
+    const u64* block = reinterpret_cast<const u64*>(data.data() + blockOffset);
+    u32 alpha = 0xff; // ETC1 without alpha uses ff for every pixel
+
+    if (hasAlpha) {
+        // The first 64 bits of the subtile hold the alpha data, 4 bits per texel
+        const u64 alphaData = block[0];
+        const u32 alphaShift = 4 * (blockU * 4 + blockV);
+        alpha = Colour::convert4To8Bit((alphaData >> alphaShift) & 0xf);
+
+        // The colour data follows the alpha data
+        block++;
+    }
+
+    return decodeETC(alpha, blockU, blockV, *block);
+}
+
+// Decodes texel (u, v) of a 4x4 ETC1 block described by the 64-bit word `colourData`.
+// `alpha` is placed in the top byte of the result; the returned value is laid out as (alpha << 24) | (b << 16) | (g << 8) | r.
+u32 Texture::decodeETC(u32 alpha, u32 u, u32 v, u64 colourData) {
+    // Modifier magnitudes, selected by [table index][per-texel subindex bit]
+    static constexpr u32 modifiers[8][2] = {
+        { 2, 8 },
+        { 5, 17 },
+        { 9, 29 },
+        { 13, 42 },
+        { 18, 60 },
+        { 24, 80 },
+        { 33, 106 },
+        { 47, 183 },
+    };
+
+    // Index of the texel in the block; selects this texel's bit in the two 16-bit fields below
+    const u32 texelIndex = u * 4 + v;
+
+    // Per-texel bit fields
+    const u32 subindices = getBits<0, 16, u32>(colourData);
+    const u32 negationFlags = getBits<16, 16, u32>(colourData);
+
+    // Block-wide flags
+    const bool flip = getBit<32>(colourData);
+    const bool diffMode = getBit<33>(colourData);
+
+    // Note: index1 is indeed stored on the higher bits, with index2 in the lower bits
+    const u32 tableIndex1 = getBits<37, 3, u32>(colourData);
+    const u32 tableIndex2 = getBits<34, 3, u32>(colourData);
+
+    // The block is split in two halves along one axis; the flip bit selects which coordinate decides the half
+    const u32 splitCoord = flip ? v : u;
+    const bool secondHalf = splitCoord >= 2;
+
+    s32 r, g, b;
+    if (!diffMode) {
+        // Individual mode: each half has its own 4-bit base colour
+        if (secondHalf) {
+            r = getBits<56, 4, s32>(colourData);
+            g = getBits<48, 4, s32>(colourData);
+            b = getBits<40, 4, s32>(colourData);
+        } else {
+            r = getBits<60, 4, s32>(colourData);
+            g = getBits<52, 4, s32>(colourData);
+            b = getBits<44, 4, s32>(colourData);
+        }
+
+        // Expand from 4 to 8 bits per channel
+        r = Colour::convert4To8Bit(r);
+        g = Colour::convert4To8Bit(g);
+        b = Colour::convert4To8Bit(b);
+    } else {
+        // Differential mode: a 5-bit base colour, plus a signed 3-bit delta for the second half
+        r = getBits<59, 5, s32>(colourData);
+        g = getBits<51, 5, s32>(colourData);
+        b = getBits<43, 5, s32>(colourData);
+
+        if (secondHalf) {
+            r += signExtend3To32(getBits<56, 3, u32>(colourData));
+            g += signExtend3To32(getBits<48, 3, u32>(colourData));
+            b += signExtend3To32(getBits<40, 3, u32>(colourData));
+        }
+
+        // Expand from 5 to 8 bits per channel
+        r = Colour::convert5To8Bit(r);
+        g = Colour::convert5To8Bit(g);
+        b = Colour::convert5To8Bit(b);
+    }
+
+    // Pick the modifier for this texel and apply its sign
+    const u32 tableIndex = secondHalf ? tableIndex2 : tableIndex1;
+    const s32 magnitude = modifiers[tableIndex][(subindices >> texelIndex) & 1];
+    const bool negate = ((negationFlags >> texelIndex) & 1) != 0;
+    const s32 modifier = negate ? -magnitude : magnitude;
+
+    r = std::clamp(r + modifier, 0, 255);
+    g = std::clamp(g + modifier, 0, 255);
+    b = std::clamp(b + modifier, 0, 255);
+
+    return (alpha << 24) | (u32(b) << 16) | (u32(g) << 8) | u32(r);
+}
+
+} // namespace Metal
diff --git a/src/core/renderer_mtl/mtl_texture.cpp b/src/core/renderer_mtl/mtl_texture.cpp
new file mode 100644
index 00000000..b61c5502
--- /dev/null
+++ b/src/core/renderer_mtl/mtl_texture.cpp
@@ -0,0 +1,312 @@
+#include "renderer_mtl/mtl_texture.hpp"
+#include "renderer_mtl/objc_helper.hpp"
+#include "colour.hpp"
+#include <array>
+
+using namespace Helpers;
+
+namespace Metal {
+
+// Creates the Metal texture for this texture's format and size, labels it, and then builds its sampler
+// from the current config via setNewConfig
+void Texture::allocate() {
+    formatInfo = PICA::getPixelFormatInfo(format);
+
+    // Debug label of the form "Texture <format> <width>x<height>"
+    const std::string label =
+        "Texture " + std::string(PICA::textureFormatToString(format)) + " " + std::to_string(size.u()) + "x" + std::to_string(size.v());
+
+    MTL::TextureDescriptor* textureDescriptor = MTL::TextureDescriptor::alloc()->init();
+    textureDescriptor->setTextureType(MTL::TextureType2D);
+    textureDescriptor->setPixelFormat(formatInfo.pixelFormat);
+    textureDescriptor->setWidth(size.u());
+    textureDescriptor->setHeight(size.v());
+    textureDescriptor->setUsage(MTL::TextureUsageShaderRead);
+    textureDescriptor->setStorageMode(MTL::StorageModeShared); // TODO: use private + staging buffers?
+
+    texture = device->newTexture(textureDescriptor);
+    textureDescriptor->release();
+    texture->setLabel(toNSString(label));
+
+    setNewConfig(config);
+}
+
+// Stores the texture configuration word and rebuilds the sampler state from it.
+// The word provides the min/mag filters (bits 2 and 1) and the wrap modes for T (bits 8-10) and S (bits 12-14).
+void Texture::setNewConfig(u32 cfg) {
+    config = cfg;
+
+    // Drop the sampler built for the previous configuration
+    if (sampler) {
+        sampler->release();
+    }
+
+    const bool linearMag = (cfg & 0x2) != 0;
+    const bool linearMin = (cfg & 0x4) != 0;
+    const auto addressModeT = PICA::toMTLSamplerAddressMode(getBits<8, 3>(cfg));
+    const auto addressModeS = PICA::toMTLSamplerAddressMode(getBits<12, 3>(cfg));
+
+    MTL::SamplerDescriptor* descriptor = MTL::SamplerDescriptor::alloc()->init();
+    descriptor->setMinFilter(linearMin ? MTL::SamplerMinMagFilterLinear : MTL::SamplerMinMagFilterNearest);
+    descriptor->setMagFilter(linearMag ? MTL::SamplerMinMagFilterLinear : MTL::SamplerMinMagFilterNearest);
+    descriptor->setSAddressMode(addressModeS);
+    descriptor->setTAddressMode(addressModeT);
+    descriptor->setLabel(toNSString("Sampler"));
+
+    sampler = device->newSamplerState(descriptor);
+    descriptor->release();
+}
+
+// Marks the texture as invalid and releases its Metal texture and sampler.
+// Both pointers are reset to nullptr afterwards so that a later free() or setNewConfig() does not
+// release an object that has already been released.
+void Texture::free() {
+    valid = false;
+
+    if (texture) {
+        texture->release();
+        texture = nullptr;
+    }
+    if (sampler) {
+        sampler->release();
+        sampler = nullptr;
+    }
+}
+
+// Returns the size in bytes of the texture data in its PICA format, computed from the texture dimensions.
+// Panics for formats that are not handled.
+u64 Texture::sizeInBytes() {
+    const u64 texelCount = u64(size.x()) * u64(size.y());
+
+    switch (format) {
+    // 4 bytes per pixel
+    case PICA::TextureFmt::RGBA8:
+        return texelCount * 4;
+
+    // 3 bytes per pixel
+    case PICA::TextureFmt::RGB8:
+        return texelCount * 3;
+
+    // 2 bytes per pixel
+    case PICA::TextureFmt::RGBA5551:
+    case PICA::TextureFmt::RGB565:
+    case PICA::TextureFmt::RGBA4:
+    case PICA::TextureFmt::RG8:
+    case PICA::TextureFmt::IA8:
+        return texelCount * 2;
+
+    // 1 byte per pixel
+    case PICA::TextureFmt::A8:
+    case PICA::TextureFmt::I8:
+    case PICA::TextureFmt::IA4:
+        return texelCount;
+
+    // 4 bits per pixel
+    case PICA::TextureFmt::I4:
+    case PICA::TextureFmt::A4:
+        return texelCount / 2;
+
+    // Compressed formats, stored as 4x4 tiles: 8 bytes per tile for ETC1...
+    case PICA::TextureFmt::ETC1:
+        return (texelCount / 16) * 8;
+
+    // ...and 16 bytes per tile for ETC1A4
+    case PICA::TextureFmt::ETC1A4:
+        return (texelCount / 16) * 16;
+
+    default:
+        Helpers::panic("[PICA] Attempted to get size of invalid texture type");
+    }
+}
+
+// Returns the offset of texel (u, v) inside its 8x8 tile.
+// Texture data is stored interleaved in Morton order, i.e. along a Z-order curve
+// (https://en.wikipedia.org/wiki/Z-order_curve), and textures are split into 8x8 tiles.
+// The in-tile offset is the sum of two terms, one selected by u % 8 and the other by v % 8, as illustrated in
+// https://en.wikipedia.org/wiki/File:Moser%E2%80%93de_Bruijn_addition.svg
+u32 Texture::mortonInterleave(u32 u, u32 v) {
+    static constexpr u32 uTerms[8] = { 0, 1, 4, 5, 16, 17, 20, 21 };
+    static constexpr u32 vTerms[8] = { 0, 2, 8, 10, 32, 34, 40, 42 };
+
+    const u32 tileU = u & 7;
+    const u32 tileV = v & 7;
+
+    return uTerms[tileU] + vTerms[tileV];
+}
+
+// Returns the byte offset of texel (u, v) in a swizzled texture of the given width and bytes per pixel
+u32 Texture::getSwizzledOffset(u32 u, u32 v, u32 width, u32 bytesPerPixel) {
+    // Offset of the 8x8 tile the texel belongs to
+    const u32 tileBase = ((u & ~7) * 8) + ((v & ~7) * width);
+    // Index of the texel once its position inside the tile is added
+    const u32 texelIndex = tileBase + mortonInterleave(u, v);
+
+    return texelIndex * bytesPerPixel;
+}
+
+// Returns the byte offset of texel (u, v) in a swizzled texture with 4 bits per pixel.
+// Two texels share one byte, so the texel index is halved instead of being multiplied by a byte count.
+u32 Texture::getSwizzledOffset_4bpp(u32 u, u32 v, u32 width) {
+    // Offset of the 8x8 tile the texel belongs to
+    const u32 tileBase = ((u & ~7) * 8) + ((v & ~7) * width);
+    // Index of the texel once its position inside the tile is added
+    const u32 texelIndex = tileBase + mortonInterleave(u, v);
+
+    return texelIndex / 2;
+}
+
+// Decodes texel (u, v) of a texture whose decoded texels are one byte each (A4 and A8), returning an 8-bit alpha value.
+// Panics for any other format.
+u8 Texture::decodeTexelU8(u32 u, u32 v, PICA::TextureFmt fmt, std::span<const u8> data) {
+    switch (fmt) {
+        case PICA::TextureFmt::A4: {
+            const u32 offset = getSwizzledOffset_4bpp(u, v, size.u());
+            const u8 packed = data[offset];
+
+            // Two texels share a byte: odd U coordinates use the top 4 bits, even ones the low 4 bits
+            const u8 nibble = (u % 2) ? (packed >> 4) : (packed & 0xf);
+
+            // Expanded to A8
+            return Colour::convert4To8Bit(nibble);
+        }
+
+        case PICA::TextureFmt::A8: {
+            // Already A8: the stored byte is returned as is
+            return data[getSwizzledOffset(u, v, size.u(), 1)];
+        }
+
+        default:
+            Helpers::panic("[Texture::DecodeTexel] Unimplemented format = %d", static_cast<int>(fmt));
+    }
+}
+
+// Decodes texel (u, v) of a texture whose decoded texels are two bytes each and returns the packed 16-bit value.
+// The comment above each return names the packed layout being produced. Panics for formats not handled here.
+u16 Texture::decodeTexelU16(u32 u, u32 v, PICA::TextureFmt fmt, std::span<const u8> data) {
+    switch (fmt) {
+        case PICA::TextureFmt::RG8: {
+            u32 offset = getSwizzledOffset(u, v, size.u(), 2);
+            const u8 g = data[offset];
+            const u8 r = data[offset + 1];
+
+            // RG8
+            return (g << 8) | r;
+        }
+
+        case PICA::TextureFmt::RGBA4: {
+            u32 offset = getSwizzledOffset(u, v, size.u(), 2);
+            u16 texel = u16(data[offset]) | (u16(data[offset + 1]) << 8);
+
+            u8 alpha = getBits<0, 4, u8>(texel);
+            u8 b = getBits<4, 4, u8>(texel);
+            u8 g = getBits<8, 4, u8>(texel);
+            u8 r = getBits<12, 4, u8>(texel);
+
+            // ABGR4
+            return (r << 12) | (g << 8) | (b << 4) | alpha;
+        }
+
+        case PICA::TextureFmt::RGBA5551: {
+            const u32 offset = getSwizzledOffset(u, v, size.u(), 2);
+            const u16 texel = u16(data[offset]) | (u16(data[offset + 1]) << 8);
+
+            // Only bit 15 of (alpha << 15) survives the conversion to u16, so 0xff acts as a single set bit
+            u8 alpha = getBit<0>(texel) ? 0xff : 0;
+            u8 b = getBits<1, 5, u8>(texel);
+            u8 g = getBits<6, 5, u8>(texel);
+            u8 r = getBits<11, 5, u8>(texel);
+
+            // BGR5A1
+            return (alpha << 15) | (r << 10) | (g << 5) | b;
+        }
+
+        case PICA::TextureFmt::RGB565: {
+            const u32 offset = getSwizzledOffset(u, v, size.u(), 2);
+            const u16 texel = u16(data[offset]) | (u16(data[offset + 1]) << 8);
+
+            const u8 b = getBits<0, 5, u8>(texel);
+            const u8 g = getBits<5, 6, u8>(texel);
+            const u8 r = getBits<11, 5, u8>(texel);
+
+            // B5G6R5
+            return (r << 11) | (g << 5) | b;
+        }
+
+        case PICA::TextureFmt::IA4: {
+            const u32 offset = getSwizzledOffset(u, v, size.u(), 1);
+            const u8 texel = data[offset];
+            const u8 alpha = texel & 0xf;
+            const u8 intensity = texel >> 4;
+
+            // ABGR4
+            return (intensity << 12) | (intensity << 8) | (intensity << 4) | alpha;
+        }
+
+        case PICA::TextureFmt::I4: {
+            u32 offset = getSwizzledOffset_4bpp(u, v, size.u());
+
+            // For odd U coordinates, grab the top 4 bits, and the low 4 bits for even coordinates
+            u8 intensity = data[offset] >> ((u % 2) ? 4 : 0);
+            intensity = getBits<0, 4>(intensity);
+
+            // ABGR4 with full alpha. The alpha field is only 4 bits wide, so it must be 0xf:
+            // a wider constant would spill into the neighbouring 4-bit channel
+            return (intensity << 12) | (intensity << 8) | (intensity << 4) | 0xf;
+        }
+
+        default:
+            Helpers::panic("[Texture::DecodeTexel] Unimplemented format = %d", static_cast<int>(fmt));
+    }
+}
+
+// Decodes texel (u, v) of a texture whose decoded texels are four bytes each and returns it packed as
+// (alpha << 24) | (b << 16) | (g << 8) | r. ETC1 and ETC1A4 are forwarded to getTexelETC.
+// Panics for formats not handled here.
+u32 Texture::decodeTexelU32(u32 u, u32 v, PICA::TextureFmt fmt, std::span<const u8> data) {
+    // Packs four 8-bit channels into the RGBA8 layout described above
+    const auto packRGBA8 = [](u8 r, u8 g, u8 b, u8 alpha) -> u32 { return (alpha << 24) | (b << 16) | (g << 8) | r; };
+    const u32 width = size.u();
+
+    switch (fmt) {
+        case PICA::TextureFmt::RGB8: {
+            // Stored as B, G, R; alpha is fully opaque
+            const u32 offset = getSwizzledOffset(u, v, width, 3);
+            return packRGBA8(data[offset + 2], data[offset + 1], data[offset], 0xff);
+        }
+
+        case PICA::TextureFmt::RGBA8: {
+            // Stored as A, B, G, R
+            const u32 offset = getSwizzledOffset(u, v, width, 4);
+            return packRGBA8(data[offset + 3], data[offset + 2], data[offset + 1], data[offset]);
+        }
+
+        case PICA::TextureFmt::I8: {
+            // The intensity is replicated into all three colour channels; alpha is fully opaque
+            const u8 intensity = data[getSwizzledOffset(u, v, width, 1)];
+            return packRGBA8(intensity, intensity, intensity, 0xff);
+        }
+
+        case PICA::TextureFmt::IA8: {
+            // Same as I8 except each pixel gets its own alpha value too (alpha first, then intensity)
+            const u32 offset = getSwizzledOffset(u, v, width, 2);
+            const u8 alpha = data[offset];
+            const u8 intensity = data[offset + 1];
+            return packRGBA8(intensity, intensity, intensity, alpha);
+        }
+
+        case PICA::TextureFmt::ETC1: return getTexelETC(false, u, v, width, data);
+        case PICA::TextureFmt::ETC1A4: return getTexelETC(true, u, v, width, data);
+
+        default:
+            Helpers::panic("[Texture::DecodeTexel] Unimplemented format = %d", static_cast<int>(fmt));
+    }
+}
+
+// Decodes the whole texture from `data` into a linear, row-major buffer of formatInfo.bytesPerTexel bytes
+// per texel (written least significant byte first) and uploads it to the Metal texture.
+// Panics if bytesPerTexel is not 1, 2 or 4.
+void Texture::decodeTexture(std::span<const u8> data) {
+    const size_t bytesPerTexel = formatInfo.bytesPerTexel;
+
+    std::vector<u8> decoded;
+    decoded.reserve(u64(size.u()) * u64(size.v()) * bytesPerTexel);
+
+    // Decode texels line by line
+    for (u32 v = 0; v < size.v(); v++) {
+        for (u32 u = 0; u < size.u(); u++) {
+            if (bytesPerTexel == 1) {
+                u8 texel = decodeTexelU8(u, v, format, data);
+                decoded.push_back(texel);
+            } else if (bytesPerTexel == 2) {
+                u16 texel = decodeTexelU16(u, v, format, data);
+                decoded.push_back((texel & 0x00ff) >> 0);
+                decoded.push_back((texel & 0xff00) >> 8);
+            } else if (bytesPerTexel == 4) {
+                u32 texel = decodeTexelU32(u, v, format, data);
+                decoded.push_back((texel & 0x000000ff) >> 0);
+                decoded.push_back((texel & 0x0000ff00) >> 8);
+                decoded.push_back((texel & 0x00ff0000) >> 16);
+                decoded.push_back((texel & 0xff000000) >> 24);
+            } else {
+                // bytesPerTexel is a size_t, so it has to be printed with %zu rather than %u
+                Helpers::panic("[Texture::decodeTexture] Unimplemented bytesPerTexel (%zu)", bytesPerTexel);
+            }
+        }
+    }
+
+    // Upload the decoded rows to mip level 0 / slice 0; each row is bytesPerTexel * width bytes long
+    texture->replaceRegion(MTL::Region(0, 0, size.u(), size.v()), 0, 0, decoded.data(), bytesPerTexel * size.u(), 0);
+}
+
+} // namespace Metal
diff --git a/src/core/renderer_mtl/objc_helper.mm b/src/core/renderer_mtl/objc_helper.mm
new file mode 100644
index 00000000..eeea56a0
--- /dev/null
+++ b/src/core/renderer_mtl/objc_helper.mm
@@ -0,0 +1,12 @@
+#include "renderer_mtl/objc_helper.hpp"
+
+// TODO: change the include
+#import <Metal/Metal.h>
+
+namespace Metal {
+
+// Wraps `size` bytes at `data` in a dispatch data object.
+// The destructor block is empty, so nothing is done to `data` when the dispatch object is destroyed.
+// NOTE(review): with a custom destructor libdispatch presumably references `data` instead of copying it,
+// so the buffer would have to outlive the returned object - confirm against the dispatch_data_create docs.
+dispatch_data_t createDispatchData(const void* data, size_t size) {
+    dispatch_queue_t destructorQueue = dispatch_get_global_queue(0, 0);
+    dispatch_block_t noopDestructor = ^{};
+
+    return dispatch_data_create(data, size, destructorQueue, noopDestructor);
+}
+
+} // namespace Metal
diff --git a/src/core/renderer_mtl/renderer_mtl.cpp b/src/core/renderer_mtl/renderer_mtl.cpp
new file mode 100644
index 00000000..10bca5dd
--- /dev/null
+++ b/src/core/renderer_mtl/renderer_mtl.cpp
@@ -0,0 +1,774 @@
+#include "PICA/gpu.hpp"
+#include "renderer_mtl/renderer_mtl.hpp"
+#include "renderer_mtl/objc_helper.hpp"
+
+#include <cmrc/cmrc.hpp>
+#include <cstddef>
+
+#include "SDL_metal.h"
+
+using namespace PICA;
+
+CMRC_DECLARE(RendererMTL);
+
+const u16 LIGHT_LUT_TEXTURE_WIDTH = 256;
+
+// HACK: redefinition...
+// Converts the raw colour format index taken from display-transfer flags into a PICA::ColorFmt.
+// Indices 2 and 3 are remapped explicitly; every other index is cast through unchanged.
+PICA::ColorFmt ToColorFormat(u32 format) {
+	if (format == 2) {
+		return PICA::ColorFmt::RGB565;
+	}
+	if (format == 3) {
+		return PICA::ColorFmt::RGBA5551;
+	}
+
+	return static_cast<PICA::ColorFmt>(format);
+}
+
+// Creates an MTL::Library on `device` from a shader library blob embedded through cmrc.
+// Panics with Metal's error description if an error is reported; otherwise returns the new
+// library, which the caller is expected to release.
+MTL::Library* loadLibrary(MTL::Device* device, const cmrc::file& shaderSource) {
+	NS::Error* loadError = nullptr;
+	const auto shaderBlob = Metal::createDispatchData(shaderSource.begin(), shaderSource.size());
+	MTL::Library* loadedLibrary = device->newLibrary(shaderBlob, &loadError);
+
+	if (loadError != nullptr) {
+		Helpers::panic("Error loading shaders: %s", loadError->description()->cString(NS::ASCIIStringEncoding));
+	}
+
+	return loadedLibrary;
+}
+
+// Forwards the GPU and both register files to the base Renderer. No Metal objects are created
+// here; that happens in initGraphicsContext().
+RendererMTL::RendererMTL(GPU& gpu, const std::array<u32, regNum>& internalRegs, const std::array<u32, extRegNum>& externalRegs)
+	: Renderer(gpu, internalRegs, externalRegs) {
+}
+
+// Nothing to tear down here; Metal objects are released in deinitGraphicsContext().
+RendererMTL::~RendererMTL() = default;
+
+// Resets every cache owned by the renderer.
+void RendererMTL::reset() {
+	// Buffer and pipeline-state caches
+	vertexBufferCache.reset();
+	depthStencilCache.reset();
+	drawPipelineCache.reset();
+	blitPipelineCache.reset();
+
+	// Texture and render target caches
+	textureCache.reset();
+	depthStencilRenderTargetCache.reset();
+	colorRenderTargetCache.reset();
+}
+
+// Presents the emulated screens.
+// Looks up the currently selected top and bottom screen framebuffers in the colour render target
+// cache, draws whichever of them exist into the next drawable of the Metal layer, presents that
+// drawable, commits the command buffer and notifies the vertex buffer cache that the frame ended.
+// Returns early without drawing anything if no drawable is available.
+void RendererMTL::display() {
+	CA::MetalDrawable* drawable = metalLayer->nextDrawable();
+	if (!drawable) {
+        return;
+	}
+
+	using namespace PICA::ExternalRegs;
+
+	// Top screen
+	// Bit 0 of the select register picks between the first and second framebuffer address
+	const u32 topActiveFb = externalRegs[Framebuffer0Select] & 1;
+	const u32 topScreenAddr = externalRegs[topActiveFb == 0 ? Framebuffer0AFirstAddr : Framebuffer0ASecondAddr];
+	auto topScreen = colorRenderTargetCache.findFromAddress(topScreenAddr);
+
+	if (topScreen) {
+		// NOTE(review): presumably applies any clear queued by clearBuffer() before sampling - confirm against clearColor()
+	    clearColor(nullptr, topScreen->get().texture);
+	}
+
+	// Bottom screen
+	const u32 bottomActiveFb = externalRegs[Framebuffer1Select] & 1;
+	const u32 bottomScreenAddr = externalRegs[bottomActiveFb == 0 ? Framebuffer1AFirstAddr : Framebuffer1ASecondAddr];
+	auto bottomScreen = colorRenderTargetCache.findFromAddress(bottomScreenAddr);
+
+	if (bottomScreen) {
+        clearColor(nullptr, bottomScreen->get().texture);
+	}
+
+	// -------- Draw --------
+	commandBuffer->pushDebugGroup(toNSString("Display"));
+
+	// The drawable is cleared to opaque black and its contents are stored after the pass
+	MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init();
+	MTL::RenderPassColorAttachmentDescriptor* colorAttachment = renderPassDescriptor->colorAttachments()->object(0);
+	colorAttachment->setTexture(drawable->texture());
+	colorAttachment->setLoadAction(MTL::LoadActionClear);
+	colorAttachment->setClearColor(MTL::ClearColor{0.0f, 0.0f, 0.0f, 1.0f});
+	colorAttachment->setStoreAction(MTL::StoreActionStore);
+
+	nextRenderPassName = "Display";
+	beginRenderPassIfNeeded(renderPassDescriptor, false, drawable->texture());
+	renderCommandEncoder->setRenderPipelineState(displayPipeline);
+	renderCommandEncoder->setFragmentSamplerState(nearestSampler, 0);
+
+	// Top screen
+	// Drawn as a 4-vertex triangle strip into a 400x240 viewport at the drawable's origin
+	if (topScreen) {
+		renderCommandEncoder->setViewport(MTL::Viewport{0, 0, 400, 240, 0.0f, 1.0f});
+		renderCommandEncoder->setFragmentTexture(topScreen->get().texture, 0);
+		renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangleStrip, NS::UInteger(0), NS::UInteger(4));
+	}
+
+	// Bottom screen
+	// Drawn into a 320x240 viewport placed at (40, 240), i.e. below the top screen and inset by 40 pixels
+	if (bottomScreen) {
+		renderCommandEncoder->setViewport(MTL::Viewport{40, 240, 320, 240, 0.0f, 1.0f});
+		renderCommandEncoder->setFragmentTexture(bottomScreen->get().texture, 0);
+		renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangleStrip, NS::UInteger(0), NS::UInteger(4));
+	}
+
+	endRenderPass();
+
+	commandBuffer->presentDrawable(drawable);
+
+	commandBuffer->popDebugGroup();
+
+	commitCommandBuffer();
+
+	// Inform the vertex buffer cache that the frame ended
+	vertexBufferCache.endFrame();
+
+	// Release
+	// NOTE(review): nextDrawable() is not a new/alloc/copy call - confirm this release is balanced by a retain or pool policy
+	drawable->release();
+}
+
+// Creates every long-lived Metal object the renderer needs for `window`:
+// the Metal layer/device/command queue, the LUT texture, the nearest and linear samplers,
+// the shader libraries, the display / blit / draw / copy-to-LUT pipelines (or the caches that
+// build them), the vertex descriptor matching the Vertex layout, and the default depth stencil state.
+// Panics if a shader function or pipeline state fails to be created.
+void RendererMTL::initGraphicsContext(SDL_Window* window) {
+	// TODO: what should be the type of the view?
+	void* view = SDL_Metal_CreateView(window);
+	metalLayer = (CA::MetalLayer*)SDL_Metal_GetLayer(view);
+	device = MTL::CreateSystemDefaultDevice();
+	metalLayer->setDevice(device);
+	commandQueue = device->newCommandQueue();
+
+	// -------- Objects --------
+
+	// Textures
+	// LUT texture: LIGHT_LUT_TEXTURE_WIDTH texels wide, one row per lighting LUT plus one extra row
+	// (updateFogLUT writes the fog table at row offset Lights::LUT_Count)
+	MTL::TextureDescriptor* textureDescriptor = MTL::TextureDescriptor::alloc()->init();
+	textureDescriptor->setTextureType(MTL::TextureType2D);
+	textureDescriptor->setPixelFormat(MTL::PixelFormatRGBA32Float);
+	textureDescriptor->setWidth(LIGHT_LUT_TEXTURE_WIDTH);
+	textureDescriptor->setHeight(Lights::LUT_Count + 1);
+	textureDescriptor->setUsage(MTL::TextureUsageShaderRead | MTL::TextureUsageShaderWrite);
+	textureDescriptor->setStorageMode(MTL::StorageModePrivate);
+
+	lutTexture = device->newTexture(textureDescriptor);
+	lutTexture->setLabel(toNSString("LUT texture"));
+	textureDescriptor->release();
+
+	// Samplers
+	// The same descriptor is reused: first with its default filters, then switched to linear min/mag filtering
+	MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init();
+	samplerDescriptor->setLabel(toNSString("Sampler (nearest)"));
+	nearestSampler = device->newSamplerState(samplerDescriptor);
+
+	samplerDescriptor->setMinFilter(MTL::SamplerMinMagFilterLinear);
+	samplerDescriptor->setMagFilter(MTL::SamplerMinMagFilterLinear);
+	samplerDescriptor->setLabel(toNSString("Sampler (linear)"));
+	linearSampler = device->newSamplerState(samplerDescriptor);
+
+	samplerDescriptor->release();
+
+	// -------- Pipelines --------
+
+	// Load shaders
+	// Both libraries come from the embedded cmrc filesystem; the copy-to-LUT library is only needed
+	// during initialization and is released at the end of this function
+	auto mtlResources = cmrc::RendererMTL::get_filesystem();
+	library = loadLibrary(device, mtlResources.open("metal_shaders.metallib"));
+	MTL::Library* copyToLutTextureLibrary = loadLibrary(device, mtlResources.open("metal_copy_to_lut_texture.metallib"));
+
+	// Display
+	MTL::Function* vertexDisplayFunction = library->newFunction(NS::String::string("vertexDisplay", NS::ASCIIStringEncoding));
+	MTL::Function* fragmentDisplayFunction = library->newFunction(NS::String::string("fragmentDisplay", NS::ASCIIStringEncoding));
+
+	MTL::RenderPipelineDescriptor* displayPipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init();
+	displayPipelineDescriptor->setVertexFunction(vertexDisplayFunction);
+	displayPipelineDescriptor->setFragmentFunction(fragmentDisplayFunction);
+	auto* displayColorAttachment = displayPipelineDescriptor->colorAttachments()->object(0);
+	// NOTE(review): assumes the layer's drawables are BGRA8Unorm - confirm against the layer's pixel format
+	displayColorAttachment->setPixelFormat(MTL::PixelFormat::PixelFormatBGRA8Unorm);
+
+	NS::Error* error = nullptr;
+	displayPipelineDescriptor->setLabel(toNSString("Display pipeline"));
+	displayPipeline = device->newRenderPipelineState(displayPipelineDescriptor, &error);
+	if (error) {
+		Helpers::panic("Error creating display pipeline state: %s", error->description()->cString(NS::ASCIIStringEncoding));
+	}
+	displayPipelineDescriptor->release();
+	vertexDisplayFunction->release();
+	fragmentDisplayFunction->release();
+
+	// Blit
+	// The blit functions are handed to the blit pipeline cache and are not released here
+	MTL::Function* vertexBlitFunction = library->newFunction(NS::String::string("vertexBlit", NS::ASCIIStringEncoding));
+	MTL::Function* fragmentBlitFunction = library->newFunction(NS::String::string("fragmentBlit", NS::ASCIIStringEncoding));
+
+	blitPipelineCache.set(device, vertexBlitFunction, fragmentBlitFunction);
+
+	// Draw
+	MTL::Function* vertexDrawFunction = library->newFunction(NS::String::string("vertexDraw", NS::ASCIIStringEncoding));
+
+	// -------- Vertex descriptor --------
+	// Attributes 0-7 all read from the single buffer bound at VERTEX_BUFFER_BINDING_INDEX,
+	// at the offsets of the corresponding Vertex members
+	MTL::VertexDescriptor* vertexDescriptor = MTL::VertexDescriptor::alloc()->init();
+
+	// Position
+	MTL::VertexAttributeDescriptor* positionAttribute = vertexDescriptor->attributes()->object(0);
+	positionAttribute->setFormat(MTL::VertexFormatFloat4);
+	positionAttribute->setOffset(offsetof(Vertex, s.positions));
+	positionAttribute->setBufferIndex(VERTEX_BUFFER_BINDING_INDEX);
+
+	// Quaternion
+	MTL::VertexAttributeDescriptor* quaternionAttribute = vertexDescriptor->attributes()->object(1);
+	quaternionAttribute->setFormat(MTL::VertexFormatFloat4);
+	quaternionAttribute->setOffset(offsetof(Vertex, s.quaternion));
+	quaternionAttribute->setBufferIndex(VERTEX_BUFFER_BINDING_INDEX);
+
+	// Color
+	MTL::VertexAttributeDescriptor* colorAttribute = vertexDescriptor->attributes()->object(2);
+	colorAttribute->setFormat(MTL::VertexFormatFloat4);
+	colorAttribute->setOffset(offsetof(Vertex, s.colour));
+	colorAttribute->setBufferIndex(VERTEX_BUFFER_BINDING_INDEX);
+
+	// Texture coordinate 0
+	MTL::VertexAttributeDescriptor* texCoord0Attribute = vertexDescriptor->attributes()->object(3);
+	texCoord0Attribute->setFormat(MTL::VertexFormatFloat2);
+	texCoord0Attribute->setOffset(offsetof(Vertex, s.texcoord0));
+	texCoord0Attribute->setBufferIndex(VERTEX_BUFFER_BINDING_INDEX);
+
+	// Texture coordinate 1
+	MTL::VertexAttributeDescriptor* texCoord1Attribute = vertexDescriptor->attributes()->object(4);
+	texCoord1Attribute->setFormat(MTL::VertexFormatFloat2);
+	texCoord1Attribute->setOffset(offsetof(Vertex, s.texcoord1));
+	texCoord1Attribute->setBufferIndex(VERTEX_BUFFER_BINDING_INDEX);
+
+	// Texture coordinate 0 W
+	MTL::VertexAttributeDescriptor* texCoord0WAttribute = vertexDescriptor->attributes()->object(5);
+	texCoord0WAttribute->setFormat(MTL::VertexFormatFloat);
+	texCoord0WAttribute->setOffset(offsetof(Vertex, s.texcoord0_w));
+	texCoord0WAttribute->setBufferIndex(VERTEX_BUFFER_BINDING_INDEX);
+
+	// View
+	MTL::VertexAttributeDescriptor* viewAttribute = vertexDescriptor->attributes()->object(6);
+	viewAttribute->setFormat(MTL::VertexFormatFloat3);
+	viewAttribute->setOffset(offsetof(Vertex, s.view));
+	viewAttribute->setBufferIndex(VERTEX_BUFFER_BINDING_INDEX);
+
+	// Texture coordinate 2
+	MTL::VertexAttributeDescriptor* texCoord2Attribute = vertexDescriptor->attributes()->object(7);
+	texCoord2Attribute->setFormat(MTL::VertexFormatFloat2);
+	texCoord2Attribute->setOffset(offsetof(Vertex, s.texcoord2));
+	texCoord2Attribute->setBufferIndex(VERTEX_BUFFER_BINDING_INDEX);
+
+	// One tightly strided Vertex per vertex invocation
+	MTL::VertexBufferLayoutDescriptor* vertexBufferLayout = vertexDescriptor->layouts()->object(VERTEX_BUFFER_BINDING_INDEX);
+	vertexBufferLayout->setStride(sizeof(Vertex));
+	vertexBufferLayout->setStepFunction(MTL::VertexStepFunctionPerVertex);
+	vertexBufferLayout->setStepRate(1);
+
+	// The draw pipeline cache receives the library, vertex function and vertex descriptor; none are released here
+	drawPipelineCache.set(device, library, vertexDrawFunction, vertexDescriptor);
+
+	// Copy to LUT texture
+	// Function constant 0 supplies the LUT texture width to the copy shader
+	MTL::FunctionConstantValues* constants = MTL::FunctionConstantValues::alloc()->init();
+    constants->setConstantValue(&LIGHT_LUT_TEXTURE_WIDTH, MTL::DataTypeUShort, NS::UInteger(0));
+
+    error = nullptr;
+    MTL::Function* vertexCopyToLutTextureFunction = copyToLutTextureLibrary->newFunction(NS::String::string("vertexCopyToLutTexture", NS::ASCIIStringEncoding), constants, &error);
+    if (error) {
+        Helpers::panic("Error creating copy_to_lut_texture vertex function: %s", error->description()->cString(NS::ASCIIStringEncoding));
+    }
+    constants->release();
+
+	MTL::RenderPipelineDescriptor* copyToLutTexturePipelineDescriptor = MTL::RenderPipelineDescriptor::alloc()->init();
+	copyToLutTexturePipelineDescriptor->setVertexFunction(vertexCopyToLutTextureFunction);
+	// Disable rasterization
+	// (the pipeline has no fragment function; the vertex shader writes the texture directly)
+	copyToLutTexturePipelineDescriptor->setRasterizationEnabled(false);
+
+	error = nullptr;
+	copyToLutTexturePipelineDescriptor->setLabel(toNSString("Copy to LUT texture pipeline"));
+	copyToLutTexturePipeline = device->newRenderPipelineState(copyToLutTexturePipelineDescriptor, &error);
+	if (error) {
+		Helpers::panic("Error creating copy_to_lut_texture pipeline state: %s", error->description()->cString(NS::ASCIIStringEncoding));
+	}
+	copyToLutTexturePipelineDescriptor->release();
+	vertexCopyToLutTextureFunction->release();
+
+	// Depth stencil cache
+	depthStencilCache.set(device);
+
+	// Vertex buffer cache
+	vertexBufferCache.set(device);
+
+	// -------- Depth stencil state --------
+	// Built from an otherwise unmodified descriptor; used by the LUT copy draws
+	MTL::DepthStencilDescriptor* depthStencilDescriptor = MTL::DepthStencilDescriptor::alloc()->init();
+	depthStencilDescriptor->setLabel(toNSString("Default depth stencil state"));
+	defaultDepthStencilState = device->newDepthStencilState(depthStencilDescriptor);
+	depthStencilDescriptor->release();
+
+	// Release
+	copyToLutTextureLibrary->release();
+}
+
+// Queues a clear for the render target that contains `startAddress`.
+// Colour targets are checked first, then depth/stencil targets; the clear value is only recorded
+// in the matching *ClearOps map here. A warning is emitted when no target contains the address.
+// `endAddress` and `control` are not used.
+void RendererMTL::clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) {
+	if (const auto colorTarget = colorRenderTargetCache.findFromAddress(startAddress); colorTarget) {
+		// The fill value holds red in its most significant byte and alpha in its least significant byte
+		const float red = Helpers::getBits<24, 8>(value) / 255.0f;
+		const float green = Helpers::getBits<16, 8>(value) / 255.0f;
+		const float blue = Helpers::getBits<8, 8>(value) / 255.0f;
+		const float alpha = (value & 0xff) / 255.0f;
+
+		colorClearOps[colorTarget->get().texture] = {red, green, blue, alpha};
+		return;
+	}
+
+	if (const auto depthTarget = depthStencilRenderTargetCache.findFromAddress(startAddress); depthTarget) {
+		const auto depthFormat = depthTarget->get().format;
+
+		// Normalize the depth bits: 16 bits for Depth16, the low 24 bits for every other format
+		const float depthVal = (depthFormat == DepthFmt::Depth16) ? (value & 0xffff) / 65535.0f : (value & 0xffffff) / 16777215.0f;
+		depthClearOps[depthTarget->get().texture] = depthVal;
+
+		// Only Depth24Stencil8 has stencil bits; they live in the top byte of the fill value
+		if (depthFormat == DepthFmt::Depth24Stencil8) {
+			stencilClearOps[depthTarget->get().texture] = static_cast<u8>(value >> 24);
+		}
+		return;
+	}
+
+	Helpers::warn("[RendererMTL::ClearBuffer] No buffer found!\n");
+}
+
+// Emulates a display transfer by blitting between two colour render targets.
+// `inputSize`/`outputSize` pack the width in the low 16 bits and the height in the high 16 bits.
+// `flags` supplies the vertical flip bit (bit 0), the input and output colour formats and the
+// scaling mode. The source rectangle uses the unscaled output dimensions; the destination extent
+// is halved according to the scaling mode.
+void RendererMTL::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) {
+	// Decode the transfer flags
+	const bool flipVertically = (flags & 1) != 0;
+	const auto srcFormat = ToColorFormat(Helpers::getBits<8, 3>(flags));
+	const auto dstFormat = ToColorFormat(Helpers::getBits<12, 3>(flags));
+	const auto scaleMode = static_cast<PICA::Scaling>(Helpers::getBits<24, 2>(flags));
+
+	const u32 srcWidth = inputSize & 0xffff;
+	u32 dstWidth = outputSize & 0xffff;
+	u32 dstHeight = outputSize >> 16;
+
+	// Source surface: looked up with the input width and the unscaled output height
+	auto source = getColorRenderTarget(inputAddr, srcFormat, srcWidth, dstHeight);
+	nextRenderPassName = "Clear before display transfer";
+	clearColor(nullptr, source->texture);
+	Math::Rect<u32> sourceRect = source->getSubRect(inputAddr, dstWidth, dstHeight);
+
+	// A vertical flip is expressed by exchanging the source rectangle's vertical edges
+	if (flipVertically) {
+		std::swap(sourceRect.bottom, sourceRect.top);
+	}
+
+	// Apply scaling for the destination rectangle: X halves the width, XY halves both dimensions
+	switch (scaleMode) {
+		case PICA::Scaling::XY: dstHeight >>= 1; [[fallthrough]];
+		case PICA::Scaling::X: dstWidth >>= 1; break;
+		default: break;
+	}
+
+	auto destination = getColorRenderTarget(outputAddr, dstFormat, dstWidth, dstHeight);
+	// TODO: clear if not blitting to the whole framebuffer
+	Math::Rect<u32> destinationRect = destination->getSubRect(outputAddr, dstWidth, dstHeight);
+
+	// NOTE: strided transfers (input width != output width) get no special handling
+	textureCopyImpl(*source, *destination, sourceRect, destinationRect);
+}
+
+// Accelerates a PICA texture copy (a raw memory copy) as a blit between colour render targets.
+// `totalBytes` is rounded down to a multiple of 16; `inputSize`/`outputSize` pack the line width
+// (low 16 bits) and the gap between lines (high 16 bits), both in 16-byte units.
+// The copy is skipped with a warning when it is smaller than 16 bytes, when the input and output
+// widths differ, or when no existing render target contains `inputAddr`.
+// The output gap is not taken into account. `flags` is not used.
+void RendererMTL::textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) {
+	// Texture copy size is aligned to 16 byte units
+	const u32 copySize = totalBytes & ~0xf;
+	if (copySize == 0) {
+		Helpers::warn("TextureCopy total bytes less than 16!\n");
+		return;
+	}
+
+	// The width and gap are provided in 16-byte units.
+	const u32 inputWidth = (inputSize & 0xffff) << 4;
+	const u32 inputGap = (inputSize >> 16) << 4;
+	const u32 outputWidth = (outputSize & 0xffff) << 4;
+
+	if (inputWidth != outputWidth) {
+		Helpers::warn("Input width does not match output width, cannot accelerate texture copy!");
+		return;
+	}
+
+	// Texture copy is a raw data copy in PICA, which means no format or tiling information is provided to the engine.
+	// Depending if the target surface is linear or tiled, games set inputWidth to either the width of the texture or
+	// the width multiplied by eight (because tiles are stored linearly in memory).
+	// To properly accelerate this we must examine each surface individually. For now we assume the most common case
+	// of tiled surface with RGBA8 format. If our assumption does not hold true, we abort the texture copy as inserting
+	// that surface is not correct.
+
+	// We assume the source surface is tiled and RGBA8. inputWidth is in bytes so divide it
+	// by eight * sizePerPixel(RGBA8) to convert it to a useable width.
+	const u32 bpp = sizePerPixel(PICA::ColorFmt::RGBA8);
+	const u32 copyStride = (inputWidth + inputGap) / (8 * bpp);
+	const u32 copyWidth = inputWidth / (8 * bpp);
+
+	// inputHeight/outputHeight are typically set to zero so they cannot be used to get the height of the copy region
+	// in contrast to display transfer. Compute height manually by dividing the copy size with the copy width. The result
+	// is the number of vertical tiles so multiply that by eight to get the actual copy height.
+	u32 copyHeight;
+	if (inputWidth != 0) [[likely]] {
+		copyHeight = (copySize / inputWidth) * 8;
+	} else {
+		copyHeight = 0;
+	}
+
+	// Find the source surface. It is never created here: copying from a surface we have not rendered to is not useful.
+	auto srcFramebuffer = getColorRenderTarget(inputAddr, PICA::ColorFmt::RGBA8, copyStride, copyHeight, false);
+	if (!srcFramebuffer) {
+		// The message names this backend so the log points at the right renderer
+		Helpers::warn("RendererMTL::TextureCopy failed to locate src framebuffer!\n");
+		return;
+	}
+	nextRenderPassName = "Clear before texture copy";
+	clearColor(nullptr, srcFramebuffer->texture);
+
+	Math::Rect<u32> srcRect = srcFramebuffer->getSubRect(inputAddr, copyWidth, copyHeight);
+
+	// Assume the destination surface has the same format. Unless the surfaces have the same block width,
+	// texture copy does not make sense.
+	auto destFramebuffer = getColorRenderTarget(outputAddr, srcFramebuffer->format, copyWidth, copyHeight);
+	// TODO: clear if not blitting to the whole framebuffer
+	Math::Rect<u32> destRect = destFramebuffer->getSubRect(outputAddr, copyWidth, copyHeight);
+
+	textureCopyImpl(*srcFramebuffer, *destFramebuffer, srcRect, destRect);
+}
+
+// Encodes one draw call for `vertices` using the current PICA register state.
+// Steps: resolve the colour (and, if needed, depth/stencil) render target, build the hashes used to
+// fetch the pipeline and depth stencil states from their caches, begin a render pass (applying any
+// pending clears), refresh the lighting/fog LUTs when dirty, then bind vertex data, viewport, blend
+// colour, stencil reference, textures and register uniforms before issuing the draw.
+void RendererMTL::drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) {
+	// Color
+	auto colorRenderTarget = getColorRenderTarget(colourBufferLoc, colourBufferFormat, fbSize[0], fbSize[1]);
+
+	// Depth stencil
+	const u32 depthControl = regs[PICA::InternalRegs::DepthAndColorMask];
+	const bool depthStencilWrite = regs[PICA::InternalRegs::DepthBufferWrite];
+	const bool depthEnable = depthControl & 0x1;
+	const bool depthWriteEnable = Helpers::getBit<12>(depthControl);
+	const u8 depthFunc = Helpers::getBits<4, 3>(depthControl);
+	const u8 colorMask = Helpers::getBits<8, 4>(depthControl);
+
+	Metal::DepthStencilHash depthStencilHash{false, 1};
+	depthStencilHash.stencilConfig = regs[PICA::InternalRegs::StencilTest];
+	depthStencilHash.stencilOpConfig = regs[PICA::InternalRegs::StencilOp];
+	const bool stencilEnable = Helpers::getBit<0>(depthStencilHash.stencilConfig);
+
+	// A depth/stencil target is only fetched when depth testing, depth writing or stencil testing needs it
+	std::optional<Metal::DepthStencilRenderTarget> depthStencilRenderTarget = std::nullopt;
+	if (depthEnable) {
+		depthStencilHash.depthStencilWrite = depthWriteEnable && depthStencilWrite;
+		depthStencilHash.depthFunc = depthFunc;
+		depthStencilRenderTarget = getDepthRenderTarget();
+	} else {
+		if (depthWriteEnable) {
+			depthStencilHash.depthStencilWrite = true;
+			depthStencilRenderTarget = getDepthRenderTarget();
+		} else if (stencilEnable) {
+			depthStencilRenderTarget = getDepthRenderTarget();
+		}
+	}
+
+	// Depth uniforms
+	// Passed to the vertex stage at buffer index 2 further down; scale and offset are decoded from 24-bit floats
+	struct {
+        float depthScale;
+       	float depthOffset;
+       	bool depthMapEnable;
+	} depthUniforms;
+	depthUniforms.depthScale = Floats::f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
+   	depthUniforms.depthOffset = Floats::f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
+   	depthUniforms.depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1;
+
+	// -------- Pipeline --------
+	// The depth format stays DepthFmt::Unknown1 when no depth/stencil target is bound
+	Metal::DrawPipelineHash pipelineHash{colorRenderTarget->format, DepthFmt::Unknown1};
+	if (depthStencilRenderTarget) {
+        pipelineHash.depthFmt = depthStencilRenderTarget->format;
+    }
+    pipelineHash.fragHash.lightingEnabled = regs[0x008F] & 1;
+    pipelineHash.fragHash.lightingNumLights = regs[0x01C2] & 0x7;
+    pipelineHash.fragHash.lightingConfig1 = regs[0x01C4u];
+    pipelineHash.fragHash.alphaControl = regs[0x104];
+
+	// Blending and logic op
+	pipelineHash.blendEnabled = (regs[PICA::InternalRegs::ColourOperation] & (1 << 8)) != 0;
+	pipelineHash.colorWriteMask = colorMask;
+
+	// The logic op is only read from the registers when blending is disabled
+	u8 logicOp = 3; // Copy, which doesn't do anything
+	if (pipelineHash.blendEnabled) {
+    	pipelineHash.blendControl = regs[PICA::InternalRegs::BlendFunc];
+	} else {
+	    logicOp = Helpers::getBits<0, 4>(regs[PICA::InternalRegs::LogicOp]);
+	}
+
+	MTL::RenderPipelineState* pipeline = drawPipelineCache.get(pipelineHash);
+
+	// Depth stencil state
+	MTL::DepthStencilState* depthStencilState = depthStencilCache.get(depthStencilHash);
+
+	// -------- Render --------
+	// doesClear records whether any of the clear helpers reported a clear for this pass
+	MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init();
+	bool doesClear = clearColor(renderPassDescriptor, colorRenderTarget->texture);
+    if (depthStencilRenderTarget) {
+        if (clearDepth(renderPassDescriptor, depthStencilRenderTarget->texture))
+            doesClear = true;
+        if (depthStencilRenderTarget->format == DepthFmt::Depth24Stencil8) {
+            if (clearStencil(renderPassDescriptor, depthStencilRenderTarget->texture))
+                doesClear = true;
+        }
+    }
+
+    nextRenderPassName = "Draw vertices";
+	beginRenderPassIfNeeded(renderPassDescriptor, doesClear, colorRenderTarget->texture, (depthStencilRenderTarget ? depthStencilRenderTarget->texture : nullptr));
+
+	// Update the LUT texture if necessary
+	// These helpers bind their own pipeline and depth stencil state, so they run before the draw state is set below
+	if (gpu.lightingLUTDirty) {
+		updateLightingLUT(renderCommandEncoder);
+	}
+	if (gpu.fogLUTDirty) {
+        updateFogLUT(renderCommandEncoder);
+    }
+
+	renderCommandEncoder->setRenderPipelineState(pipeline);
+	renderCommandEncoder->setDepthStencilState(depthStencilState);
+	// If size is < 4KB, use inline vertex data, otherwise use a buffer
+	if (vertices.size_bytes() < 4 * 1024) {
+		renderCommandEncoder->setVertexBytes(vertices.data(), vertices.size_bytes(), VERTEX_BUFFER_BINDING_INDEX);
+	} else {
+	    Metal::BufferHandle buffer = vertexBufferCache.get(vertices.data(), vertices.size_bytes());
+		renderCommandEncoder->setVertexBuffer(buffer.buffer, buffer.offset, VERTEX_BUFFER_BINDING_INDEX);
+	}
+
+	// Viewport
+	// The width/height registers are 24-bit floats and are doubled to obtain the viewport extent
+	const u32 viewportX = regs[PICA::InternalRegs::ViewportXY] & 0x3ff;
+	const u32 viewportY = (regs[PICA::InternalRegs::ViewportXY] >> 16) & 0x3ff;
+	const u32 viewportWidth = Floats::f24::fromRaw(regs[PICA::InternalRegs::ViewportWidth] & 0xffffff).toFloat32() * 2.0f;
+	const u32 viewportHeight = Floats::f24::fromRaw(regs[PICA::InternalRegs::ViewportHeight] & 0xffffff).toFloat32() * 2.0f;
+	const auto rect = colorRenderTarget->getSubRect(colourBufferLoc, fbSize[0], fbSize[1]);
+	MTL::Viewport viewport{double(rect.left + viewportX), double(rect.bottom + viewportY), double(viewportWidth), double(viewportHeight), 0.0, 1.0};
+	renderCommandEncoder->setViewport(viewport);
+
+	// Blend color
+	// Red is in the least significant byte of the register, alpha in the most significant byte
+	if (pipelineHash.blendEnabled) {
+       	u32 constantColor = regs[PICA::InternalRegs::BlendColour];
+    	const u8 r = constantColor & 0xff;
+    	const u8 g = Helpers::getBits<8, 8>(constantColor);
+    	const u8 b = Helpers::getBits<16, 8>(constantColor);
+    	const u8 a = Helpers::getBits<24, 8>(constantColor);
+
+        renderCommandEncoder->setBlendColor(r / 255.0f, g / 255.0f, b / 255.0f, a / 255.0f);
+	}
+
+	// Stencil reference
+	if (stencilEnable) {
+	    const s8 reference = s8(Helpers::getBits<16, 8>(depthStencilHash.stencilConfig)); // Signed reference value
+        renderCommandEncoder->setStencilReferenceValue(reference);
+    }
+
+	// Bind resources
+	// Registers 0x48..0x1FF are exposed to both stages at buffer index 0
+	setupTextureEnvState(renderCommandEncoder);
+	bindTexturesToSlots(renderCommandEncoder);
+	renderCommandEncoder->setVertexBytes(&regs[0x48], (0x200 - 0x48) * sizeof(regs[0]), 0);
+	renderCommandEncoder->setFragmentBytes(&regs[0x48], (0x200 - 0x48) * sizeof(regs[0]), 0);
+	renderCommandEncoder->setVertexBytes(&depthUniforms, sizeof(depthUniforms), 2);
+	renderCommandEncoder->setFragmentBytes(&logicOp, sizeof(logicOp), 2);
+
+	renderCommandEncoder->drawPrimitives(toMTLPrimitiveType(primType), NS::UInteger(0), NS::UInteger(vertices.size()));
+}
+
+// Screenshots are not supported by this backend yet: only a warning is emitted and `name` is ignored.
+void RendererMTL::screenshot(const std::string& name) {
+	static_cast<void>(name);  // TODO: implement
+
+	Helpers::warn("RendererMTL::screenshot not implemented");
+}
+
+// Tears down the graphics context: resets all caches, then releases the Metal objects created in
+// initGraphicsContext(), ending with the command queue and the device.
+void RendererMTL::deinitGraphicsContext() {
+	// Reset the caches first, while the device is still alive
+	reset();
+
+	// Release
+	copyToLutTexturePipeline->release();
+	displayPipeline->release();
+	defaultDepthStencilState->release();
+	lutTexture->release();
+	linearSampler->release();
+	nearestSampler->release();
+	library->release();
+	commandQueue->release();
+	device->release();
+}
+
+// Returns the colour render target containing `addr`.
+// An existing cached target is returned regardless of `format`, `width` and `height`. When none
+// exists, a new one is created and cached, unless `createIfnotFound` is false, in which case
+// std::nullopt is returned.
+std::optional<Metal::ColorRenderTarget> RendererMTL::getColorRenderTarget(
+	u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound
+) {
+	// The lookup is by address only: display transfers and texture copies may refer to a sub-rectangle
+	// of a surface, and a texture copy does not know the surface's format.
+	if (auto cached = colorRenderTargetCache.findFromAddress(addr)) {
+		return cached->get();
+	}
+
+	if (createIfnotFound) {
+		// Nothing cached at this address: create a new target and cache it
+		Metal::ColorRenderTarget freshTarget(device, addr, format, width, height);
+		return colorRenderTargetCache.add(freshTarget);
+	}
+
+	return std::nullopt;
+}
+
+// Returns the cached depth/stencil render target matching the current depth buffer location,
+// format and framebuffer size, adding it to the cache first if it is not there yet.
+Metal::DepthStencilRenderTarget& RendererMTL::getDepthRenderTarget() {
+	Metal::DepthStencilRenderTarget wanted(device, depthBufferLoc, depthBufferFormat, fbSize[0], fbSize[1]);
+
+	if (auto cached = depthStencilRenderTargetCache.find(wanted)) {
+		return cached->get();
+	}
+
+	return depthStencilRenderTargetCache.add(wanted);
+}
+
+// Returns the cached texture matching `tex`. On a cache miss the texture is added to the cache and
+// decoded from the emulated memory at `tex.location` before being returned.
+Metal::Texture& RendererMTL::getTexture(Metal::Texture& tex) {
+	if (auto cached = textureCache.find(tex)) {
+		return cached->get();
+	}
+
+	// Get pointer to the texture data in 3DS memory
+	const auto sourceBytes = std::span{gpu.getPointerPhys<u8>(tex.location), tex.sizeInBytes()};
+
+	Metal::Texture& inserted = textureCache.add(tex);
+	inserted.decodeTexture(sourceBytes);
+	return inserted;
+}
+
+// Uploads the six texture environment stages' registers to the shaders.
+// The colour registers are bound to the vertex stage at buffer index 1; the source, operand,
+// combiner and scale registers are bound to the fragment stage at buffer index 1.
+void RendererMTL::setupTextureEnvState(MTL::RenderCommandEncoder* encoder) {
+	static constexpr std::array<u32, 6> stageBases = {
+		PICA::InternalRegs::TexEnv0Source, PICA::InternalRegs::TexEnv1Source, PICA::InternalRegs::TexEnv2Source,
+		PICA::InternalRegs::TexEnv3Source, PICA::InternalRegs::TexEnv4Source, PICA::InternalRegs::TexEnv5Source,
+	};
+
+	// Fragment-stage block: four register arrays, one entry per stage
+	struct {
+		u32 textureEnvSourceRegs[6];
+		u32 textureEnvOperandRegs[6];
+		u32 textureEnvCombinerRegs[6];
+		u32 textureEnvScaleRegs[6];
+	} fragmentState;
+
+	// Vertex-stage block: the per-stage colour registers
+	u32 stageColours[6];
+
+	// Each stage's registers are consecutive: source, operand, combiner, colour, scale
+	int stage = 0;
+	for (const u32 base : stageBases) {
+		fragmentState.textureEnvSourceRegs[stage] = regs[base];
+		fragmentState.textureEnvOperandRegs[stage] = regs[base + 1];
+		fragmentState.textureEnvCombinerRegs[stage] = regs[base + 2];
+		stageColours[stage] = regs[base + 3];
+		fragmentState.textureEnvScaleRegs[stage] = regs[base + 4];
+		stage++;
+	}
+
+	encoder->setVertexBytes(&stageColours, sizeof(stageColours), 1);
+	encoder->setFragmentBytes(&fragmentState, sizeof(fragmentState), 1);
+}
+
+// Binds the textures of the enabled texture units (0-2) and the LUT texture (slot 3) to the
+// fragment stage of `encoder`.
+// A unit is skipped when its enable bit in TexUnitCfg is clear or when its address is zero.
+// Each bound texture uses its own sampler when it has one, otherwise the nearest sampler.
+void RendererMTL::bindTexturesToSlots(MTL::RenderCommandEncoder* encoder) {
+	static constexpr std::array<u32, 3> ioBases = {
+		PICA::InternalRegs::Tex0BorderColor,
+		PICA::InternalRegs::Tex1BorderColor,
+		PICA::InternalRegs::Tex2BorderColor,
+	};
+
+	for (int i = 0; i < 3; i++) {
+		if ((regs[PICA::InternalRegs::TexUnitCfg] & (1 << i)) == 0) {
+			continue;
+		}
+
+		const size_t ioBase = ioBases[i];
+
+		// Registers are addressed relative to the unit's border colour register
+		const u32 dim = regs[ioBase + 1];
+		const u32 config = regs[ioBase + 2];
+		const u32 height = dim & 0x7ff;
+		const u32 width = Helpers::getBits<16, 11>(dim);
+		const u32 addr = (regs[ioBase + 4] & 0x0FFFFFFF) << 3;
+		// The format register sits at a different offset for unit 0 than for units 1 and 2
+		const u32 format = regs[ioBase + (i == 0 ? 13 : 5)] & 0xF;
+
+		if (addr != 0) [[likely]] {
+			Metal::Texture targetTex(device, addr, static_cast<PICA::TextureFmt>(format), width, height, config);
+			// Bind by reference: the cache owns the texture, so copying it on every draw is unnecessary
+			Metal::Texture& tex = getTexture(targetTex);
+			encoder->setFragmentTexture(tex.texture, i);
+			encoder->setFragmentSamplerState(tex.sampler ? tex.sampler : nearestSampler, i);
+		}
+		// TODO: bind a dummy texture when the address is zero?
+	}
+
+	// LUT texture
+	encoder->setFragmentTexture(lutTexture, 3);
+	encoder->setFragmentSamplerState(linearSampler, 3);
+}
+
+// Uploads the lighting LUT into the LUT texture, starting at row 0, and clears the dirty flag.
+// The data is laid out as one float2 per entry (value in .x, .y left at zero), matching what the
+// copy-to-LUT vertex shader reads; one vertex is drawn per entry.
+// Note: the commands are recorded on the renderer's current render command encoder; the
+// `encoder` parameter is not used.
+void RendererMTL::updateLightingLUT(MTL::RenderCommandEncoder* encoder) {
+	gpu.lightingLUTDirty = false;
+	std::array<float, GPU::LightingLutSize * 2> lightingLut = {0.0f};
+
+	// Walk the interleaved output array (two floats per entry) so that every source entry is
+	// converted. Bounding this loop by the source size while stepping by two would stop after
+	// half of the entries.
+	for (size_t i = 0; i < lightingLut.size(); i += 2) {
+		// Each entry is a 12-bit value, scaled to 16 bits and normalized
+		uint64_t value = gpu.lightingLUT[i >> 1] & 0xFFF;
+		lightingLut[i] = (float)(value << 4) / 65535.0f;
+	}
+
+	renderCommandEncoder->setRenderPipelineState(copyToLutTexturePipeline);
+	renderCommandEncoder->setDepthStencilState(defaultDepthStencilState);
+	renderCommandEncoder->setVertexTexture(lutTexture, 0);
+	Metal::BufferHandle buffer = vertexBufferCache.get(lightingLut.data(), sizeof(lightingLut));
+	renderCommandEncoder->setVertexBuffer(buffer.buffer, buffer.offset, 0);
+	// Row offset inside the LUT texture: the lighting LUTs start at row 0
+	u32 arrayOffset = 0;
+	renderCommandEncoder->setVertexBytes(&arrayOffset, sizeof(u32), 1);
+
+	renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangleStrip, NS::UInteger(0), GPU::LightingLutSize);
+}
+
+// Uploads the 128-entry fog LUT into the LUT texture at row offset Lights::LUT_Count and clears
+// the dirty flag. Each entry becomes a float2: the fog value followed by the fog difference.
+// The commands are recorded on the renderer's current render command encoder.
+void RendererMTL::updateFogLUT(MTL::RenderCommandEncoder* encoder) {
+	gpu.fogLUTDirty = false;
+
+	std::array<float, 128 * 2> fogLut = {0.0f};
+	const int entryCount = static_cast<int>(fogLut.size() / 2);
+
+	for (int entry = 0; entry < entryCount; entry++) {
+		const uint32_t raw = gpu.fogLUT[entry];
+
+		// Low 13 bits: signed difference. Sign extend the 13-bit value to 32 bits.
+		int32_t difference = raw & 0x1fff;
+		difference = (difference << 19) >> 19;
+
+		// Next 11 bits: unsigned fog value. Both fields are divided by 2048.
+		fogLut[entry * 2] = float((raw >> 13) & 0x7ff) / 2048.0f;
+		fogLut[entry * 2 + 1] = float(difference) / 2048.0f;
+	}
+
+	renderCommandEncoder->setRenderPipelineState(copyToLutTexturePipeline);
+	renderCommandEncoder->setDepthStencilState(defaultDepthStencilState);
+	renderCommandEncoder->setVertexTexture(lutTexture, 0);
+	// The table is passed inline rather than through the vertex buffer cache
+	renderCommandEncoder->setVertexBytes(fogLut.data(), sizeof(fogLut), 0);
+
+	// The fog table is written to the row right after the lighting LUT rows
+	u32 rowOffset = (u32)Lights::LUT_Count;
+	renderCommandEncoder->setVertexBytes(&rowOffset, sizeof(u32), 1);
+
+	renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangleStrip, NS::UInteger(0), NS::UInteger(128));
+}
+
+// Blits `srcRect` of `srcFramebuffer` into `destRect` of `destFramebuffer` with the blit pipeline.
+// The destination rectangle becomes the viewport; the source rectangle is handed to the vertex
+// shader as a normalized {offset.x, offset.y, scale.x, scale.y} block at buffer index 0.
+// A source rectangle whose top and bottom (or left and right) are swapped yields a negative scale,
+// i.e. a flipped copy.
+void RendererMTL::textureCopyImpl(Metal::ColorRenderTarget& srcFramebuffer, Metal::ColorRenderTarget& destFramebuffer, const Math::Rect<u32>& srcRect, const Math::Rect<u32>& destRect) {
+	nextRenderPassName = "Texture copy";
+	MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init();
+	// TODO: clearColor sets the load action to load if it didn't find any clear, but that is unnecessary if we are doing a copy to the whole texture
+	bool doesClear = clearColor(renderPassDescriptor, destFramebuffer.texture);
+	beginRenderPassIfNeeded(renderPassDescriptor, doesClear, destFramebuffer.texture);
+
+	// Pipeline
+	Metal::BlitPipelineHash hash{destFramebuffer.format, DepthFmt::Unknown1};
+	auto blitPipeline = blitPipelineCache.get(hash);
+
+	renderCommandEncoder->setRenderPipelineState(blitPipeline);
+
+	// Viewport
+	renderCommandEncoder->setViewport(MTL::Viewport{double(destRect.left), double(destRect.bottom), double(destRect.right - destRect.left), double(destRect.top - destRect.bottom), 0.0, 1.0});
+
+	// Source rectangle in normalized texture coordinates.
+	// The extents are computed in floating point: the rectangle members are unsigned, so for a
+	// vertically flipped rectangle (top < bottom) an integer subtraction would wrap around to a huge
+	// positive value instead of producing the intended negative scale.
+	const float srcTexWidth = (float)srcFramebuffer.size.u();
+	const float srcTexHeight = (float)srcFramebuffer.size.v();
+	float srcRectNDC[4] = {
+		(float)srcRect.left / srcTexWidth,
+		(float)srcRect.bottom / srcTexHeight,
+		((float)srcRect.right - (float)srcRect.left) / srcTexWidth,
+		((float)srcRect.top - (float)srcRect.bottom) / srcTexHeight,
+	};
+
+	// Bind resources
+	renderCommandEncoder->setVertexBytes(&srcRectNDC, sizeof(srcRectNDC), 0);
+	renderCommandEncoder->setFragmentTexture(srcFramebuffer.texture, 0);
+	renderCommandEncoder->setFragmentSamplerState(nearestSampler, 0);
+
+	renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangleStrip, NS::UInteger(0), NS::UInteger(4));
+}
diff --git a/src/host_shaders/metal_copy_to_lut_texture.metal b/src/host_shaders/metal_copy_to_lut_texture.metal
new file mode 100644
index 00000000..40a7f50d
--- /dev/null
+++ b/src/host_shaders/metal_copy_to_lut_texture.metal
@@ -0,0 +1,9 @@
+#include <metal_stdlib>
+using namespace metal;
+
+constant ushort lutTextureWidth [[function_constant(0)]];  // Row length used to wrap vertex ids into 2D texel coordinates
+// The copy is done in a vertex shader instead of a compute kernel, since dispatching compute would require ending the render pass
+vertex void vertexCopyToLutTexture(uint vid [[vertex_id]], texture2d<float, access::write> out [[texture(0)]], constant float2* data [[buffer(0)]], constant uint& arrayOffset [[buffer(1)]]) {
+    const uint2 texel = uint2(vid % lutTextureWidth, arrayOffset + vid / lutTextureWidth);
+    out.write(float4(data[vid], 0.0, 0.0), texel);
+}
diff --git a/src/host_shaders/metal_shaders.metal b/src/host_shaders/metal_shaders.metal
new file mode 100644
index 00000000..95f417c7
--- /dev/null
+++ b/src/host_shaders/metal_shaders.metal
@@ -0,0 +1,782 @@
+#include <metal_stdlib>
+using namespace metal;
+
+struct BasicVertexOut {  // Vertex-to-fragment interface shared by the display and blit shaders
+	float4 position [[position]];  // Clip-space position
+	float2 uv;  // Texture coordinate; sampled by fragmentDisplay / fragmentBlit
+};
+
+constant float4 displayPositions[4] = {  // Clip-space corners of the quad emitted by vertexDisplay, indexed by vertex id
+    float4(-1.0, -1.0, 0.0, 1.0),
+    float4( 1.0, -1.0, 0.0, 1.0),
+    float4(-1.0,  1.0, 0.0, 1.0),
+    float4( 1.0,  1.0, 0.0, 1.0)
+};
+
+constant float2 displayTexCoord[4] = {  // UV of each corner above; note that u changes along the quad's Y axis and v along its X axis
+    float2(0.0, 1.0),
+    float2(0.0, 0.0),
+    float2(1.0, 1.0),
+    float2(1.0, 0.0)
+};
+
+vertex BasicVertexOut vertexDisplay(uint vid [[vertex_id]]) {
+	// Looks up corner `vid` of the display quad in the displayPositions / displayTexCoord tables
+	BasicVertexOut corner;
+	corner.uv = displayTexCoord[vid];
+	corner.position = displayPositions[vid];
+	return corner;
+}
+
+fragment float4 fragmentDisplay(BasicVertexOut in [[stage_in]], texture2d<float> tex [[texture(0)]], sampler samplr [[sampler(0)]]) {
+	return tex.sample(samplr, in.uv);  // Output the texel at the interpolated UV, unmodified
+}
+
+struct NDCViewport {  // Source-rectangle transform that vertexBlit applies to its UVs (uv * scale + offset)
+    float2 offset;  // Added after scaling
+    float2 scale;  // Multiplies the generated UV
+};
+
+vertex BasicVertexOut vertexBlit(uint vid [[vertex_id]], constant NDCViewport& viewport [[buffer(0)]]) {
+	// Derives a corner in {0, 2} x {0, 2} from the vertex id, maps it to clip space with Y flipped, then remaps the UV into the source rectangle
+	const float2 corner = float2((vid << 1) & 2, vid & 2);
+	const float2 clipXY = corner * 2.0 - 1.0;
+	BasicVertexOut result;
+	result.position = float4(clipXY.x, -clipXY.y, 0.0, 1.0);
+	result.uv = corner * viewport.scale + viewport.offset;
+	return result;
+}
+
+fragment float4 fragmentBlit(BasicVertexOut in [[stage_in]], texture2d<float> tex [[texture(0)]], sampler samplr [[sampler(0)]]) {
+	return tex.sample(samplr, in.uv);  // Output the source texel at the remapped UV, unmodified
+}
+
+struct PicaRegs {
+    uint regs[0x200 - 0x48];  // Storage starts at register 0x48, so array indices are biased by 0x48
+    // Returns the register with absolute index `reg`. No bounds check is performed: `reg` must lie in [0x48, 0x200)
+    uint read(uint reg) constant {
+        return regs[reg - 0x48];
+    }
+};
+
+struct VertTEV {  // Vertex-stage TEV data (buffer 1 of vertexDraw)
+    uint textureEnvColor[6];  // One packed colour per TEV stage; unpacked with abgr8888ToFloat4 in vertexDraw
+};
+
+float4 abgr8888ToFloat4(uint abgr) {
+	// Unpack the four bytes, lowest first, into x, y, z, w and normalize them from [0, 255] to [0.0, 1.0]
+	const uint4 bytes = uint4(abgr & 0xffu, (abgr >> 8) & 0xffu, (abgr >> 16) & 0xffu, abgr >> 24);
+	return (1.0 / 255.0) * float4(bytes);
+}
+
+struct DrawVertexIn {  // Per-vertex attributes consumed by vertexDraw
+	float4 position [[attribute(0)]];  // Output position before the Y flip and depth transform
+	float4 quaternion [[attribute(1)]];  // Rotates the surface-local basis into the normal / tangent / bitangent
+	float4 color [[attribute(2)]];  // Vertex colour; vertexDraw takes abs() and caps it at 1.0
+	float2 texCoord0 [[attribute(3)]];
+	float2 texCoord1 [[attribute(4)]];
+	float texCoord0W [[attribute(5)]];  // Forwarded as the third component of the output texCoord0
+	float3 view [[attribute(6)]];  // View vector used by the lighting code
+	float2 texCoord2 [[attribute(7)]];
+};
+
+// Metal cannot return arrays from vertex functions, this is an ugly workaround:
+// the six colours are plain members and operator[] makes them addressable by index.
+struct EnvColor {
+    float4 c0;
+    float4 c1;
+    float4 c2;
+    float4 c3;
+    float4 c4;
+    float4 c5;
+
+    // Maps i in 1..5 to c1..c5.
+    // Every other value of i, 0 included, selects c0.
+    thread float4& operator[](int i) {
+        if (i == 1) return c1;
+        if (i == 2) return c2;
+        if (i == 3) return c3;
+        if (i == 4) return c4;
+        if (i == 5) return c5;
+        return c0;
+    }
+};
+
+float3 rotateFloat3ByQuaternion(float3 v, float4 q) {
+	// With u = q.xyz and s = q.w, evaluates 2(u.v)u + (s*s - u.u)v + 2s(u x v)
+	const float3 alongAxis = 2.0 * dot(q.xyz, v) * q.xyz;
+	const float3 scaled = (q.w * q.w - dot(q.xyz, q.xyz)) * v;
+	return alongAxis + scaled + 2.0 * q.w * cross(q.xyz, v);
+}
+
+// Convert an arbitrary-width floating point literal to an f32.
+// From the top bit down, `hex` holds 1 sign bit, E exponent bits and M mantissa bits.
+float decodeFP(uint hex, uint E, uint M) {
+	const uint exponentMask = (1u << E) - 1u;
+	const uint signBit = (hex >> (E + M)) << 31u;
+
+	// Everything below the sign bit is zero: the result is a signed zero
+	const uint magnitudeMask = (1u << (M + E)) - 1u;
+	if ((hex & magnitudeMask) == 0u) {
+		return as_type<float>(signBit);
+	}
+
+	const uint fraction = hex & ((1u << M) - 1u);
+	const uint storedExponent = (hex >> M) & exponentMask;
+	// An all-ones exponent becomes the all-ones f32 exponent; any other value is shifted by the bias difference
+	const uint rebias = 128u - (1u << (E - 1u));
+	const uint exponent32 = storedExponent == exponentMask ? 255u : storedExponent + rebias;
+
+	return as_type<float>(signBit | (fraction << (23u - M)) | (exponent32 << 23u));
+}
+
+struct DepthUniforms {  // Depth parameters read by transformZ
+    float depthScale;  // Multiplies z / w
+   	float depthOffset;  // Added after scaling
+   	bool depthMapEnable;  // When false, transformZ multiplies the result by w one extra time
+};
+
+struct DrawVertexOut {  // Interpolants passed from vertexDraw to fragmentDraw
+	float4 position [[position]];
+	float4 quaternion;  // Copied from the input; fragment lighting derives its normal from it
+	float4 color;
+	float3 texCoord0;  // xy = texture coordinate 0, z = texCoord0W
+	float2 texCoord1;
+	float2 texCoord2;
+	float3 view;
+	float3 normal;  // Quaternion-rotated (0, 0, 1), normalized
+	float3 tangent;  // Quaternion-rotated (1, 0, 0), normalized
+	float3 bitangent;  // Quaternion-rotated (0, 1, 0), normalized
+	EnvColor textureEnvColor [[flat]];  // Unpacked constant colour of each TEV stage (not interpolated)
+	float4 textureEnvBufferColor [[flat]];  // Unpacked register 0xFD (not interpolated)
+};
+
+struct DrawVertexOutWithClip {  // Return type of vertexDraw: the interpolants plus the clip distances
+    DrawVertexOut out;
+    float clipDistance [[clip_distance]] [2];  // [0] = -position.z, [1] = dot(clip plane registers, position), as set by vertexDraw
+};
+
+// TODO: check this
+// Scales and offsets z / w with the depth uniforms and returns the result multiplied by w,
+// or by w twice when depthMapEnable is false.
+float transformZ(float z, float w, constant DepthUniforms& depthUniforms) {
+    const float depth = z / w * depthUniforms.depthScale + depthUniforms.depthOffset;
+    if (depthUniforms.depthMapEnable) {
+        return depth * w;
+    }
+    return depth * w * w;
+}
+
+vertex DrawVertexOutWithClip vertexDraw(DrawVertexIn in [[stage_in]], constant PicaRegs& picaRegs [[buffer(0)]], constant VertTEV& tev [[buffer(1)]], constant DepthUniforms& depthUniforms [[buffer(2)]]) {
+	// Vertex stage for PICA draws. Forwards the vertex attributes to the fragment stage with Y and the texture V
+	// coordinates mirrored, runs Z through transformZ, unpacks the TEV colours to float4 and fills in the two
+	// clip distances.
+	DrawVertexOutWithClip result;
+	thread DrawVertexOut& vtx = result.out;
+
+	// Position: Y flipped, Z adjusted by the depth uniforms
+	const float4 pos = in.position;
+	vtx.position = float4(pos.x, -pos.y, transformZ(pos.z, pos.w, depthUniforms), pos.w);
+
+	// Colour: absolute value, capped at 1.0
+	vtx.color = min(abs(in.color), 1.0);
+
+	// Texture coordinates with V mirrored (v -> 1 - v);
+	// texCoord0 additionally carries texCoord0W in its third component
+	vtx.texCoord0 = float3(in.texCoord0.x, 1.0 - in.texCoord0.y, in.texCoord0W);
+	vtx.texCoord1 = float2(in.texCoord1.x, 1.0 - in.texCoord1.y);
+	vtx.texCoord2 = float2(in.texCoord2.x, 1.0 - in.texCoord2.y);
+
+	// View vector and quaternion pass through unchanged
+	vtx.view = in.view;
+	vtx.quaternion = in.quaternion;
+
+	// TBN: surface-local Z, X and Y axes rotated by the quaternion, then normalized
+	const float4 q = in.quaternion;
+	vtx.normal = normalize(rotateFloat3ByQuaternion(float3(0.0, 0.0, 1.0), q));
+	vtx.tangent = normalize(rotateFloat3ByQuaternion(float3(1.0, 0.0, 0.0), q));
+	vtx.bitangent = normalize(rotateFloat3ByQuaternion(float3(0.0, 1.0, 0.0), q));
+
+	// Environment: per-stage TEV constant colours
+	for (int stage = 0; stage < 6; stage++) {
+		vtx.textureEnvColor[stage] = abgr8888ToFloat4(tev.textureEnvColor[stage]);
+	}
+
+	// TEV buffer colour lives in register 0xFD
+	vtx.textureEnvBufferColor = abgr8888ToFloat4(picaRegs.read(0xFDu));
+
+	// Parse clipping plane registers 0x48-0x4B: the low 24 bits of each are decoded as a float
+	// with 7 exponent and 16 mantissa bits
+	const float clipA = decodeFP(picaRegs.read(0x48u) & 0xffffffu, 7u, 16u);
+	const float clipB = decodeFP(picaRegs.read(0x49u) & 0xffffffu, 7u, 16u);
+	const float clipC = decodeFP(picaRegs.read(0x4Au) & 0xffffffu, 7u, 16u);
+	const float clipD = decodeFP(picaRegs.read(0x4Bu) & 0xffffffu, 7u, 16u);
+	const float4 clipPlane = float4(clipA, clipB, clipC, clipD);
+
+	// There's also another, always-on clipping plane based on vertex z
+	// TODO: transform
+	result.clipDistance[0] = -in.position.z;
+	// User clip plane, evaluated on the untransformed input position
+	result.clipDistance[1] = dot(clipPlane, in.position);
+
+	return result;
+}
+
+constant bool lightingEnabled [[function_constant(0)]];  // fragmentDraw runs calcLighting only when this is true
+constant uint8_t lightingNumLights [[function_constant(1)]];  // Light count minus one: calcLighting iterates lightingNumLights + 1 times
+constant uint32_t lightingConfig1 [[function_constant(2)]];  // Disable bits: 8+light (spotlight LUT), 16+lut (other LUTs), 24+light (distance attenuation)
+constant uint16_t alphaControl [[function_constant(3)]];  // Alpha test: bit 0 = enable, bits 4-6 = compare function, bits 8-15 = reference value
+
+// Per-fragment scratch state shared by the TEV and lighting helpers (passed around as `thread Globals&`).
+struct Globals {
+    // Raised by the lighting code when it hits an unimplemented LUT id or LUT input.
+    // Initialized like tevUnimplementedSourceFlag so that it never holds an indeterminate value.
+    bool error_unimpl = false;
+
+    float4 tevSources[16];         // TEV inputs indexed by source id (see fragmentDraw for the slots that get filled)
+    float4 tevNextPreviousBuffer;  // Value that becomes the "previous buffer" source (id 13) after the current stage
+    bool tevUnimplementedSourceFlag = false;  // Raised by FragTEV::fetchSource for source ids 6..12
+
+    // Lighting registers cached by calcLighting for use in lightLutLookup
+    uint GPUREG_LIGHTING_LUTINPUT_SCALE;
+	uint GPUREG_LIGHTING_LUTINPUT_ABS;
+	uint GPUREG_LIGHTING_LUTINPUT_SELECT;
+	uint GPUREG_LIGHTi_CONFIG;  // Config register of the light currently being processed
+
+	// HACK: lightingEnabled, lightingNumLights, lightingConfig1 and alphaControl are not members here;
+	// they are supplied as function constants instead.
+    float3 normal;  // Surface normal computed by calcLighting
+};
+
+// See docs/lighting.md
+constant uint samplerEnabledBitfields[2] = {0x7170e645u, 0x7f013fefu};
+// Returns bit (7 * environment_id + lut_id) of the table above, i.e. whether that LUT is sampled in that lighting environment.
+bool isSamplerEnabled(uint environment_id, uint lut_id) {
+	uint index = 7 * environment_id + lut_id;
+	if (index >= 64u) return false;  // The table only holds 2 x 32 bits; larger environment ids would read out of bounds
+	return (samplerEnabledBitfields[index >> 5] & (1u << (index & 31u))) != 0u;
+}
+
+struct FragTEV {
+    uint textureEnvSource[6];    // Per stage: colour source ids in bits 0-11, alpha source ids in bits 16-27 (4 bits each)
+    uint textureEnvOperand[6];   // Per stage: colour operands in bits 0-11 (4 bits each), alpha operands from bit 12 (3 bits used, 4-bit stride)
+    uint textureEnvCombiner[6];  // Per stage: colour combine mode in bits 0-3, alpha combine mode in bits 16-19
+    uint textureEnvScale[6];     // Per stage: colour scale shift in bits 0-1, alpha scale shift in bits 16-17
+    // Returns TEV source `src_id` from globals.tevSources and raises tevUnimplementedSourceFlag for the ids 6..12.
+    float4 fetchSource(thread Globals& globals, uint src_id) constant {
+    	if (src_id >= 6u && src_id < 13u) {
+    		globals.tevUnimplementedSourceFlag = true;
+    	}
+
+    	return globals.tevSources[src_id];
+    }
+    // Builds input `src_id` (0..2) of stage `tev_id`: fetches its colour and alpha sources and applies the selected operands.
+    float4 getColorAndAlphaSource(thread Globals& globals, int tev_id, int src_id) constant {
+    	float4 result = float4(1.0);  // Defined fallback for the operand encodings not handled below (same default as calculateCombiner)
+
+    	float4 colorSource = fetchSource(globals, (textureEnvSource[tev_id] >> (src_id * 4)) & 15u);
+    	float4 alphaSource = fetchSource(globals, (textureEnvSource[tev_id] >> (src_id * 4 + 16)) & 15u);
+
+    	uint colorOperand = (textureEnvOperand[tev_id] >> (src_id * 4)) & 15u;
+    	uint alphaOperand = (textureEnvOperand[tev_id] >> (12 + src_id * 4)) & 7u;
+
+    	// TODO: figure out what the undocumented values do
+    	switch (colorOperand) {
+    		case 0u: result.rgb = colorSource.rgb; break;             // Source color
+    		case 1u: result.rgb = 1.0 - colorSource.rgb; break;       // One minus source color
+    		case 2u: result.rgb = float3(colorSource.a); break;         // Source alpha
+    		case 3u: result.rgb = float3(1.0 - colorSource.a); break;   // One minus source alpha
+    		case 4u: result.rgb = float3(colorSource.r); break;         // Source red
+    		case 5u: result.rgb = float3(1.0 - colorSource.r); break;   // One minus source red
+    		case 8u: result.rgb = float3(colorSource.g); break;         // Source green
+    		case 9u: result.rgb = float3(1.0 - colorSource.g); break;   // One minus source green
+    		case 12u: result.rgb = float3(colorSource.b); break;        // Source blue
+    		case 13u: result.rgb = float3(1.0 - colorSource.b); break;  // One minus source blue
+    		default: break;
+    	}
+
+    	// TODO: figure out what the undocumented values do
+    	switch (alphaOperand) {
+    		case 0u: result.a = alphaSource.a; break;        // Source alpha
+    		case 1u: result.a = 1.0 - alphaSource.a; break;  // One minus source alpha
+    		case 2u: result.a = alphaSource.r; break;        // Source red
+    		case 3u: result.a = 1.0 - alphaSource.r; break;  // One minus source red
+    		case 4u: result.a = alphaSource.g; break;        // Source green
+    		case 5u: result.a = 1.0 - alphaSource.g; break;  // One minus source green
+    		case 6u: result.a = alphaSource.b; break;        // Source blue
+    		case 7u: result.a = 1.0 - alphaSource.b; break;  // One minus source blue
+    		default: break;
+    	}
+
+    	return result;
+    }
+    // Evaluates stage `tev_id`: combines its three inputs with the colour / alpha combine modes, then applies the per-channel scale.
+    float4 calculateCombiner(thread Globals& globals, int tev_id) constant {
+    	float4 source0 = getColorAndAlphaSource(globals, tev_id, 0);
+    	float4 source1 = getColorAndAlphaSource(globals, tev_id, 1);
+    	float4 source2 = getColorAndAlphaSource(globals, tev_id, 2);
+
+    	uint colorCombine = textureEnvCombiner[tev_id] & 15u;
+    	uint alphaCombine = (textureEnvCombiner[tev_id] >> 16) & 15u;
+
+    	float4 result = float4(1.0);
+
+    	// TODO: figure out what the undocumented values do
+    	switch (colorCombine) {
+    		case 0u: result.rgb = source0.rgb; break;                                            // Replace
+    		case 1u: result.rgb = source0.rgb * source1.rgb; break;                              // Modulate
+    		case 2u: result.rgb = min(float3(1.0), source0.rgb + source1.rgb); break;              // Add
+    		case 3u: result.rgb = clamp(source0.rgb + source1.rgb - 0.5, 0.0, 1.0); break;       // Add signed
+    		case 4u: result.rgb = mix(source1.rgb, source0.rgb, source2.rgb); break;             // Interpolate
+    		case 5u: result.rgb = max(source0.rgb - source1.rgb, 0.0); break;                    // Subtract
+    		case 6u: result.rgb = float3(4.0 * dot(source0.rgb - 0.5, source1.rgb - 0.5)); break;  // Dot3 RGB
+    		case 7u: result = float4(4.0 * dot(source0.rgb - 0.5, source1.rgb - 0.5)); break;      // Dot3 RGBA
+    		case 8u: result.rgb = min(source0.rgb * source1.rgb + source2.rgb, 1.0); break;      // Multiply then add
+    		case 9u: result.rgb = min((source0.rgb + source1.rgb), 1.0) * source2.rgb; break;    // Add then multiply
+    		default: break;
+    	}
+
+    	if (colorCombine != 7u) {  // The color combiner also writes the alpha channel in the "Dot3 RGBA" mode.
+    		// TODO: figure out what the undocumented values do
+    		// TODO: test if the alpha combiner supports all the same modes as the color combiner.
+    		switch (alphaCombine) {
+    			case 0u: result.a = source0.a; break;                                      // Replace
+    			case 1u: result.a = source0.a * source1.a; break;                          // Modulate
+    			case 2u: result.a = min(1.0, source0.a + source1.a); break;                // Add
+    			case 3u: result.a = clamp(source0.a + source1.a - 0.5, 0.0, 1.0); break;   // Add signed
+    			case 4u: result.a = mix(source1.a, source0.a, source2.a); break;           // Interpolate
+    			case 5u: result.a = max(0.0, source0.a - source1.a); break;                // Subtract
+    			case 8u: result.a = min(source0.a * source1.a + source2.a, 1.0); break;    // Multiply then add
+    			case 9u: result.a = min(source0.a + source1.a, 1.0) * source2.a; break;  // Add then multiply
+    			default: break;
+    		}
+    	}
+
+    	result.rgb *= float(1 << (textureEnvScale[tev_id] & 3u));
+    	result.a *= float(1 << ((textureEnvScale[tev_id] >> 16) & 3u));
+
+    	return result;
+    }
+};
+
+enum class LogicOp : uint8_t {  // Bitwise operation applied by performLogicOpU to source s and destination d
+    Clear = 0,          // Bit pattern of 0.0
+    And = 1,            // s & d
+    AndReverse = 2,     // s & ~d
+    Copy = 3,           // s
+    Set = 4,            // Bit pattern of 1.0
+    CopyInverted = 5,   // ~s
+    NoOp = 6,           // d
+    Invert = 7,         // ~d
+    Nand = 8,           // ~(s & d)
+    Or = 9,             // s | d
+    Nor = 10,           // ~(s | d)
+    Xor = 11,           // s ^ d
+    Equiv = 12,         // ~(s ^ d)
+    AndInverted = 13,   // ~s & d
+    OrReverse = 14,     // s | ~d
+    OrInverted = 15     // ~s | d
+};
+
+uint4 performLogicOpU(LogicOp logicOp, uint4 s, uint4 d) {  // Applies logicOp bitwise to source s and destination d
+    switch (logicOp) {  // Clear and Set yield the bit patterns of the floats 0.0 and 1.0, since callers reinterpret the result as float4
+    case LogicOp::Clear: return as_type<uint4>(float4(0.0));
+    case LogicOp::And: return s & d;
+    case LogicOp::AndReverse: return s & ~d;
+    case LogicOp::Copy: default: return s;  // Encodings outside 0..15 behave like Copy instead of falling off the end of the function
+    case LogicOp::Set: return as_type<uint4>(float4(1.0));
+    case LogicOp::CopyInverted: return ~s;
+    case LogicOp::NoOp: return d;
+    case LogicOp::Invert: return ~d;
+    case LogicOp::Nand: return ~(s & d);
+    case LogicOp::Or: return s | d;
+    case LogicOp::Nor: return ~(s | d);
+    case LogicOp::Xor: return s ^ d;
+    case LogicOp::Equiv: return ~(s ^ d);
+    case LogicOp::AndInverted: return ~s & d;
+    case LogicOp::OrReverse: return s | ~d;
+    case LogicOp::OrInverted: return ~s | d;
+    }
+}
+
+#define D0_LUT 0u  // Specular 0 distribution (see calcLighting)
+#define D1_LUT 1u  // Specular 1 distribution
+#define SP_LUT 2u  // Spotlight attenuation; stored per light in rows 8 + light id of the LUT texture
+#define FR_LUT 3u  // Fresnel factor
+#define RB_LUT 4u  // Reflection, blue channel
+#define RG_LUT 5u  // Reflection, green channel
+#define RR_LUT 6u  // Reflection, red channel
+
+#define FOG_INDEX 24  // Row of the LUT texture that fragmentDraw reads the fog table from
+
+float lutLookup(texture2d<float> texLut, uint lut, uint index) {
+	return texLut.read(uint2(index, lut)).r;  // Row `lut`, column `index`; only the red channel is used
+}
+
+float lightLutLookup(thread Globals& globals, thread DrawVertexOut& in, constant PicaRegs& picaRegs, texture2d<float> texLut, uint environment_id, uint lut_id, uint light_id, float3 light_vector, float3 half_vector) {
+	// Samples lighting LUT `lut_id` for light `light_id` and returns the scaled value, or 1.0 if the LUT is disabled or unsupported.
+	uint lut_index = 0u;
+	int bit_in_config1 = 0;
+	if (lut_id == SP_LUT) {
+		// These are the spotlight attenuation LUTs
+		bit_in_config1 = 8 + int(light_id & 7u);
+		lut_index = 8u + light_id;
+	} else if (lut_id <= 6) {
+		bit_in_config1 = 16 + int(lut_id);
+		lut_index = lut_id;
+	} else {
+		globals.error_unimpl = true;
+		return 1.0;  // Unknown LUT id: bail out with the neutral factor rather than indexing with unset values
+	}
+	bool current_sampler_enabled = isSamplerEnabled(environment_id, lut_id); // 7 luts per environment
+
+	if (!current_sampler_enabled || (extract_bits(lightingConfig1, bit_in_config1, 1) != 0u)) {
+		return 1.0;
+	}
+	uint scale_id = extract_bits(globals.GPUREG_LIGHTING_LUTINPUT_SCALE, int(lut_id) << 2, 3);
+	float scale = float(1u << scale_id);
+	if (scale_id >= 6u) scale /= 256.0;
+
+	float delta = 1.0;
+	uint input_id = extract_bits(globals.GPUREG_LIGHTING_LUTINPUT_SELECT, int(lut_id) << 2, 3);
+	switch (input_id) {
+		case 0u: {
+			delta = dot(globals.normal, normalize(half_vector));
+			break;
+		}
+		case 1u: {
+			delta = dot(normalize(in.view), normalize(half_vector));
+			break;
+		}
+		case 2u: {
+			delta = dot(globals.normal, normalize(in.view));
+			break;
+		}
+		case 3u: {
+			delta = dot(light_vector, globals.normal);
+			break;
+		}
+		case 4u: {
+			int GPUREG_LIGHTi_SPOTDIR_LOW = int(picaRegs.read(0x0146u + (light_id << 4u)));
+			int GPUREG_LIGHTi_SPOTDIR_HIGH = int(picaRegs.read(0x0147u + (light_id << 4u)));
+
+			// The fields are 13-bit signed values. extract_bits() on a signed int already sign-extends the field,
+			// so the explicit ORs below are a redundant safeguard carried over from the GLSL version of this code
+			int se_x = extract_bits(GPUREG_LIGHTi_SPOTDIR_LOW, 0, 13);
+			int se_y = extract_bits(GPUREG_LIGHTi_SPOTDIR_LOW, 16, 13);
+			int se_z = extract_bits(GPUREG_LIGHTi_SPOTDIR_HIGH, 0, 13);
+
+			if ((se_x & 0x1000) == 0x1000) se_x |= 0xffffe000;
+			if ((se_y & 0x1000) == 0x1000) se_y |= 0xffffe000;
+			if ((se_z & 0x1000) == 0x1000) se_z |= 0xffffe000;
+
+			// These are fixed point 1.1.11 values, so we need to convert them to float
+			float x = float(se_x) / 2047.0;
+			float y = float(se_y) / 2047.0;
+			float z = float(se_z) / 2047.0;
+			float3 spotlight_vector = float3(x, y, z);
+			delta = dot(light_vector, spotlight_vector); // spotlight direction is negated so we don't negate light_vector
+			break;
+		}
+		case 5u: {
+			delta = 1.0;  // TODO: cos <greek symbol> (aka CP);
+			globals.error_unimpl = true;
+			break;
+		}
+		default: {
+			delta = 1.0;
+			globals.error_unimpl = true;
+			break;
+		}
+	}
+
+	// 0 = enabled
+	if (extract_bits(globals.GPUREG_LIGHTING_LUTINPUT_ABS, 1 + (int(lut_id) << 2), 1) == 0u) {
+		// Two sided diffuse
+		if (extract_bits(globals.GPUREG_LIGHTi_CONFIG, 1, 1) == 0u) {
+			delta = max(delta, 0.0);
+		} else {
+			delta = abs(delta);
+		}
+		int index = int(clamp(floor(delta * 255.0), 0.f, 255.f));
+		return lutLookup(texLut, lut_index, index) * scale;
+	} else {
+		// Range is [-1, 1] so we need to map it to [0, 1]
+		int index = int(clamp(floor(delta * 128.0), -128.f, 127.f));
+		if (index < 0) index += 256;
+		return lutLookup(texLut, lut_index, index) * scale;
+	}
+}
+
+float3 regToColor(uint reg) {
+	// The red, green and blue channels are the 8-bit fields starting at bits 20, 10 and 0 respectively
+	const uint3 channels = uint3(extract_bits(reg, 20, 8), extract_bits(reg, 10, 8), extract_bits(reg, 0, 8));
+	// Normalization scale to convert from [0...255] to [0.0...1.0]
+	return (1.0 / 255.0) * float3(channels);
+}
+
+// Implements the following algorithm: https://mathb.in/26766
+void calcLighting(thread Globals& globals, thread DrawVertexOut& in, constant PicaRegs& picaRegs, texture2d<float> texLut, sampler linearSampler, thread float4& primaryColor, thread float4& secondaryColor) {
+	// Quaternions describe a transformation from surface-local space to eye space.
+	// In surface-local space, by definition (and up to permutation) the normal vector is (0,0,1),
+	// the tangent vector is (1,0,0), and the bitangent vector is (0,1,0).
+	// Outputs: primaryColor = clamp(global ambient + summed diffuse), secondaryColor = clamp(summed specular).
+	// The lighting registers read here are also cached in `globals` for lightLutLookup.
+	// texLut rows 0-6 and 8 + lightId are read through lightLutLookup; rows 16 + lightId hold the distance attenuation LUTs.
+	// linearSampler is currently unused.
+
+	uint GPUREG_LIGHTING_LIGHT_PERMUTATION = picaRegs.read(0x01D9u);
+
+	primaryColor = float4(0.0, 0.0, 0.0, 1.0);
+	secondaryColor = float4(0.0, 0.0, 0.0, 1.0);
+
+	uint GPUREG_LIGHTING_CONFIG0 = picaRegs.read(0x01C3u);
+	globals.GPUREG_LIGHTING_LUTINPUT_SCALE = picaRegs.read(0x01D2u);
+	globals.GPUREG_LIGHTING_LUTINPUT_ABS = picaRegs.read(0x01D0u);
+	globals.GPUREG_LIGHTING_LUTINPUT_SELECT = picaRegs.read(0x01D1u);
+
+	uint bumpMode = extract_bits(GPUREG_LIGHTING_CONFIG0, 28, 2);
+
+	// Bump mode is ignored for now because it breaks some games ie. Toad Treasure Tracker
+	switch (bumpMode) {
+		default: {
+			globals.normal = rotateFloat3ByQuaternion(float3(0.0, 0.0, 1.0), in.quaternion);
+			break;
+		}
+	}
+
+	float4 diffuseSum = float4(0.0, 0.0, 0.0, 1.0);
+	float4 specularSum = float4(0.0, 0.0, 0.0, 1.0);
+
+	uint environmentId = extract_bits(GPUREG_LIGHTING_CONFIG0, 4, 4);
+	bool clampHighlights = extract_bits(GPUREG_LIGHTING_CONFIG0, 27, 1) == 1u;
+
+	uint lightId = 0u;
+	float3 lightVector = float3(0.0);
+	float3 halfVector = float3(0.0);
+
+	for (uint i = 0u; i < lightingNumLights + 1; i++) {
+		lightId = extract_bits(GPUREG_LIGHTING_LIGHT_PERMUTATION, int(i) << 2, 3);
+
+		uint GPUREG_LIGHTi_SPECULAR0 = picaRegs.read(0x0140u + (lightId << 4u));
+		uint GPUREG_LIGHTi_SPECULAR1 = picaRegs.read(0x0141u + (lightId << 4u));
+		uint GPUREG_LIGHTi_DIFFUSE = picaRegs.read(0x0142u + (lightId << 4u));
+		uint GPUREG_LIGHTi_AMBIENT = picaRegs.read(0x0143u + (lightId << 4u));
+		uint GPUREG_LIGHTi_VECTOR_LOW = picaRegs.read(0x0144u + (lightId << 4u));
+		uint GPUREG_LIGHTi_VECTOR_HIGH = picaRegs.read(0x0145u + (lightId << 4u));
+		globals.GPUREG_LIGHTi_CONFIG = picaRegs.read(0x0149u + (lightId << 4u));
+
+		// Keep the raw (un-normalized) light vector: a positional light needs it to build the light direction and distance
+		float3 lightPosition = float3(
+			decodeFP(extract_bits(GPUREG_LIGHTi_VECTOR_LOW, 0, 16), 5u, 10u), decodeFP(extract_bits(GPUREG_LIGHTi_VECTOR_LOW, 16, 16), 5u, 10u),
+			decodeFP(extract_bits(GPUREG_LIGHTi_VECTOR_HIGH, 0, 16), 5u, 10u)
+		);
+
+		// Positional Light
+		if (extract_bits(globals.GPUREG_LIGHTi_CONFIG, 0, 1) == 0u) {
+			// error_unimpl = true;
+			lightVector = lightPosition + in.view;
+		}
+
+		// Directional light
+		else {
+			lightVector = lightPosition;
+		}
+
+		float lightDistance = length(lightVector);
+		lightVector = normalize(lightVector);
+		halfVector = lightVector + normalize(in.view);
+
+		float NdotL = dot(globals.normal, lightVector);  // N dot Li
+
+		// Two sided diffuse
+		if (extract_bits(globals.GPUREG_LIGHTi_CONFIG, 1, 1) == 0u)
+			NdotL = max(0.0, NdotL);
+		else
+			NdotL = abs(NdotL);
+
+		float geometricFactor = 1.0;  // Only replaced (and only used) when one of the geometric factor bits is set
+		bool useGeo0 = extract_bits(globals.GPUREG_LIGHTi_CONFIG, 2, 1) == 1u;
+		bool useGeo1 = extract_bits(globals.GPUREG_LIGHTi_CONFIG, 3, 1) == 1u;
+		if (useGeo0 || useGeo1) {
+			geometricFactor = dot(halfVector, halfVector);
+			geometricFactor = geometricFactor == 0.0 ? 0.0 : min(NdotL / geometricFactor, 1.0);
+		}
+
+		float distanceAttenuation = 1.0;
+		if (extract_bits(lightingConfig1, 24 + int(lightId), 1) == 0u) {
+			uint GPUREG_LIGHTi_ATTENUATION_BIAS = extract_bits(picaRegs.read(0x014Au + (lightId << 4u)), 0, 20);
+			uint GPUREG_LIGHTi_ATTENUATION_SCALE = extract_bits(picaRegs.read(0x014Bu + (lightId << 4u)), 0, 20);
+
+			float distanceAttenuationBias = decodeFP(GPUREG_LIGHTi_ATTENUATION_BIAS, 7u, 12u);
+			float distanceAttenuationScale = decodeFP(GPUREG_LIGHTi_ATTENUATION_SCALE, 7u, 12u);
+
+			float delta = lightDistance * distanceAttenuationScale + distanceAttenuationBias;
+			delta = clamp(delta, 0.0, 1.0);
+			int index = int(clamp(floor(delta * 255.0), 0.0, 255.0));
+			distanceAttenuation = lutLookup(texLut, 16u + lightId, index);
+		}
+
+		float spotlightAttenuation = lightLutLookup(globals, in, picaRegs, texLut, environmentId, SP_LUT, lightId, lightVector, halfVector);
+		float specular0Distribution = lightLutLookup(globals, in, picaRegs, texLut, environmentId, D0_LUT, lightId, lightVector, halfVector);
+		float specular1Distribution = lightLutLookup(globals, in, picaRegs, texLut, environmentId, D1_LUT, lightId, lightVector, halfVector);
+		float3 reflectedColor;
+		reflectedColor.r = lightLutLookup(globals, in, picaRegs, texLut, environmentId, RR_LUT, lightId, lightVector, halfVector);
+
+		if (isSamplerEnabled(environmentId, RG_LUT)) {
+			reflectedColor.g = lightLutLookup(globals, in, picaRegs, texLut, environmentId, RG_LUT, lightId, lightVector, halfVector);
+		} else {
+			reflectedColor.g = reflectedColor.r;
+		}
+
+		if (isSamplerEnabled(environmentId, RB_LUT)) {
+			reflectedColor.b = lightLutLookup(globals, in, picaRegs, texLut, environmentId, RB_LUT, lightId, lightVector, halfVector);
+		} else {
+			reflectedColor.b = reflectedColor.r;
+		}
+
+		float3 specular0 = regToColor(GPUREG_LIGHTi_SPECULAR0) * specular0Distribution;
+		float3 specular1 = regToColor(GPUREG_LIGHTi_SPECULAR1) * specular1Distribution * reflectedColor;
+
+		specular0 *= useGeo0 ? geometricFactor : 1.0;
+		specular1 *= useGeo1 ? geometricFactor : 1.0;
+
+		float clampFactor = 1.0;
+		if (clampHighlights && NdotL == 0.0) {
+			clampFactor = 0.0;
+		}
+
+		float lightFactor = distanceAttenuation * spotlightAttenuation;
+		diffuseSum.rgb += lightFactor * (regToColor(GPUREG_LIGHTi_AMBIENT) + regToColor(GPUREG_LIGHTi_DIFFUSE) * NdotL);
+		specularSum.rgb += lightFactor * clampFactor * (specular0 + specular1);
+	}
+	uint fresnelOutput1 = extract_bits(GPUREG_LIGHTING_CONFIG0, 2, 1);
+	uint fresnelOutput2 = extract_bits(GPUREG_LIGHTING_CONFIG0, 3, 1);
+
+	float fresnelFactor = 1.0;  // Only replaced (and only used) when a fresnel output is enabled
+
+	if (fresnelOutput1 == 1u || fresnelOutput2 == 1u) {
+		fresnelFactor = lightLutLookup(globals, in, picaRegs, texLut, environmentId, FR_LUT, lightId, lightVector, halfVector);
+	}
+
+	if (fresnelOutput1 == 1u) {
+		diffuseSum.a = fresnelFactor;
+	}
+
+	if (fresnelOutput2 == 1u) {
+		specularSum.a = fresnelFactor;
+	}
+
+	uint GPUREG_LIGHTING_AMBIENT = picaRegs.read(0x01C0u);
+	float4 globalAmbient = float4(regToColor(GPUREG_LIGHTING_AMBIENT), 1.0);
+	primaryColor = clamp(globalAmbient + diffuseSum, 0.0, 1.0);
+	secondaryColor = clamp(specularSum, 0.0, 1.0);
+}
+
+float4 performLogicOp(LogicOp logicOp, float4 s, float4 d) {
+    return as_type<float4>(performLogicOpU(logicOp, as_type<uint4>(s), as_type<uint4>(d)));  // Operates on the raw bit patterns of the float channels
+}
+
+fragment float4 fragmentDraw(DrawVertexOut in [[stage_in]], float4 prevColor [[color(0)]], constant PicaRegs& picaRegs [[buffer(0)]], constant FragTEV& tev [[buffer(1)]], constant LogicOp& logicOp [[buffer(2)]],
+                             texture2d<float> tex0 [[texture(0)]], texture2d<float> tex1 [[texture(1)]], texture2d<float> tex2 [[texture(2)]], texture2d<float> texLut [[texture(3)]],
+                             sampler samplr0 [[sampler(0)]], sampler samplr1 [[sampler(1)]], sampler samplr2 [[sampler(2)]], sampler linearSampler [[sampler(3)]]) {
+    Globals globals;
+    // Fragment stage for PICA draws: lighting, texture fetches, the six TEV stages, fog, alpha test and the logic op.
+    // Start every TEV source from a defined value: disabled textures and the unimplemented source ids are otherwise never written.
+    globals.error_unimpl = false;
+    for (int i = 0; i < 16; i++) {
+        globals.tevSources[i] = float4(0.0);
+    }
+
+    globals.tevSources[0] = in.color;
+    if (lightingEnabled) {
+        calcLighting(globals, in, picaRegs, texLut, linearSampler, globals.tevSources[1], globals.tevSources[2]);
+    } else {
+        globals.tevSources[1] = float4(0.0);
+        globals.tevSources[2] = float4(0.0);
+    }
+
+	uint textureConfig = picaRegs.read(0x80u);
+	float2 texCoord2 = (textureConfig & (1u << 13)) != 0u ? in.texCoord1 : in.texCoord2;
+
+	if ((textureConfig & 1u) != 0u) globals.tevSources[3] = tex0.sample(samplr0, in.texCoord0.xy);
+	if ((textureConfig & 2u) != 0u) globals.tevSources[4] = tex1.sample(samplr1, in.texCoord1);
+	if ((textureConfig & 4u) != 0u) globals.tevSources[5] = tex2.sample(samplr2, texCoord2);
+	globals.tevSources[13] = float4(0.0);  // Previous buffer
+	globals.tevSources[15] = in.color;     // Previous combiner
+
+	globals.tevNextPreviousBuffer = in.textureEnvBufferColor;
+	uint textureEnvUpdateBuffer = picaRegs.read(0xE0u);
+
+	for (int i = 0; i < 6; i++) {
+		globals.tevSources[14] = in.textureEnvColor[i];  // Constant color
+		globals.tevSources[15] = tev.calculateCombiner(globals, i);
+		globals.tevSources[13] = globals.tevNextPreviousBuffer;
+
+		if (i < 4) {
+			if ((textureEnvUpdateBuffer & (0x100u << i)) != 0u) {
+				globals.tevNextPreviousBuffer.rgb = globals.tevSources[15].rgb;
+			}
+
+			if ((textureEnvUpdateBuffer & (0x1000u << i)) != 0u) {
+				globals.tevNextPreviousBuffer.a = globals.tevSources[15].a;
+			}
+		}
+	}
+
+	float4 color = globals.tevSources[15];
+
+	// Fog
+	bool enable_fog = (textureEnvUpdateBuffer & 7u) == 5u;
+
+	if (enable_fog) {
+		bool flip_depth = (textureEnvUpdateBuffer & (1u << 16)) != 0u;
+		float fog_index = flip_depth ? 1.0 - in.position.z : in.position.z;
+		fog_index *= 128.0;
+		float clamped_index = clamp(floor(fog_index), 0.0, 127.0);
+		float delta = fog_index - clamped_index;
+		float2 value = texLut.read(uint2(clamped_index, FOG_INDEX)).rg;
+		float fog_factor = clamp(value.r + value.g * delta, 0.0, 1.0);
+
+		uint GPUREG_FOG_COLOR = picaRegs.read(0x00E1u);
+
+		// Annoyingly color is not encoded in the same way as light color
+		float r = (GPUREG_FOG_COLOR & 0xFFu) / 255.0;
+		float g = ((GPUREG_FOG_COLOR >> 8) & 0xFFu) / 255.0;
+		float b = ((GPUREG_FOG_COLOR >> 16) & 0xFFu) / 255.0;
+		float3 fog_color = float3(r, g, b);
+
+		color.rgb = mix(fog_color, color.rgb, fog_factor);
+	}
+
+	// Perform alpha test
+	if ((alphaControl & 1u) != 0u) {  // Check if alpha test is on
+		uint func = (alphaControl >> 4u) & 7u;
+		float reference = float((alphaControl >> 8u) & 0xffu) / 255.0;
+		float alpha = color.a;
+
+		switch (func) {
+			case 0u: discard_fragment(); break;  // Never pass alpha test
+			case 1u: break;    // Always pass alpha test
+			case 2u:           // Pass if equal
+				if (alpha != reference) discard_fragment();
+				break;
+			case 3u:  // Pass if not equal
+				if (alpha == reference) discard_fragment();
+				break;
+			case 4u:  // Pass if less than
+				if (alpha >= reference) discard_fragment();
+				break;
+			case 5u:  // Pass if less than or equal
+				if (alpha > reference) discard_fragment();
+				break;
+			case 6u:  // Pass if greater than
+				if (alpha <= reference) discard_fragment();
+				break;
+			case 7u:  // Pass if greater than or equal
+				if (alpha < reference) discard_fragment();
+				break;
+		}
+	}
+
+	return performLogicOp(logicOp, color, prevColor);
+}