diff --git a/include/PICA/dynapica/shader_rec_emitter_x64.hpp b/include/PICA/dynapica/shader_rec_emitter_x64.hpp
index 99517bd1..5c3c403c 100644
--- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp
+++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp
@@ -5,6 +5,7 @@
 #include "helpers.hpp"
 #include "PICA/shader.hpp"
 #include "xbyak/xbyak.h"
+#include "xbyak/xbyak_util.h"
 #include "x64_regs.hpp"
 
 #include <vector>
@@ -14,13 +15,20 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 	// Allocate some extra space as padding for security purposes in the extremely unlikely occasion we manage to overflow the above size
 	static constexpr size_t allocSize = executableMemorySize + 0x1000;
 
+	// If the swizzle field is this value then the swizzle pattern is .xyzw so we don't need a shuffle
+	static constexpr uint noSwizzle = 0x1B;
+
+	using f24 = Floats::f24;
+	using vec4f = OpenGL::Vector<f24, 4>;
+
 	// An array of labels (incl pointers) to each compiled (to x64) PICA instruction
 	std::array<Xbyak::Label, PICAShader::maxInstructionCount> instructionLabels;
 	// A vector of PCs that can potentially return based on the state of the PICA callstack.
 	// Filled before compiling a shader by scanning the code for call instructions
 	std::vector<u32> returnPCs;
 
-	u32 recompilerPC; // PC the recompiler is currently recompiling @
+	u32 recompilerPC = 0; // PC the recompiler is currently recompiling @
+	bool haveSSE4_1 = false;  // Shows if the CPU supports SSE4.1
 
 	// Compile all instructions from [current recompiler PC, end)
 	void compileUntil(const PICAShader& shaderUnit, u32 endPC);
@@ -35,7 +43,12 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 	void scanForCalls(const PICAShader& shaderUnit);
 
 	// Load register with number "srcReg" indexed by index "idx" into the xmm register "reg"
-	void loadRegister(Xmm dest, const PICAShader& shader, u32 srcReg, u32 idx);
+	template <int sourceIndex>
+	void loadRegister(Xmm dest, const PICAShader& shader, u32 src, u32 idx, u32 operandDescriptor);
+	void storeRegister(Xmm source, const PICAShader& shader, u32 dest, u32 operandDescriptor);
+
+	const vec4f& getSourceRef(const PICAShader& shader, u32 src);
+	const vec4f& getDestRef(const PICAShader& shader, u32 dest);
 
 	// Instruction recompilation functions
 	void recMOV(const PICAShader& shader, u32 instruction);
@@ -44,15 +57,22 @@ public:
 	using InstructionCallback = const void(*)(PICAShader& shaderUnit); // Callback type used for instructions
 	// Callback type used for the JIT prologue. This is what the caller will call
 	using PrologueCallback = const void(*)(PICAShader& shaderUnit, InstructionCallback cb);
-	PrologueCallback prologueCb;
+	PrologueCallback prologueCb = nullptr;
 
 	// Initialize our emitter with "allocSize" bytes of RWX memory
-	ShaderEmitter() : Xbyak::CodeGenerator(allocSize) {}
+	ShaderEmitter() : Xbyak::CodeGenerator(allocSize) {
+		// Probe the host CPU once at construction and cache whether it supports SSE4.1
+		const Xbyak::util::Cpu hostCpu;
+		haveSSE4_1 = hostCpu.has(Xbyak::util::Cpu::tSSE41);
+	}
+	
 	void compile(const PICAShader& shaderUnit);
 
 	// PC must be a valid entrypoint here. It doesn't have that much overhead in this case, so we use std::array<>::at() to assert it does
 	InstructionCallback getInstructionCallback(u32 pc) {
-		return reinterpret_cast<InstructionCallback>(instructionLabels.at(pc).getAddress());
+		// Drop the constness first because casting to a function pointer is hard otherwise. Legal as long as we never write through the pointer
+		uint8_t* const entry = const_cast<uint8_t*>(instructionLabels.at(pc).getAddress());
+		return reinterpret_cast<InstructionCallback>(entry);
 	}
 
 	PrologueCallback getPrologueCallback() {
diff --git a/include/PICA/shader.hpp b/include/PICA/shader.hpp
index a2f830b1..284438bf 100644
--- a/include/PICA/shader.hpp
+++ b/include/PICA/shader.hpp
@@ -42,6 +42,7 @@ namespace ShaderOpcodes {
 	};
 }
 
+// Note: All PICA f24 vec4 registers must have the alignas(16) specifier to make them easier to access in SSE/NEON code in the JIT
 class PICAShader {
 	using f24 = Floats::f24;
 	using vec4f = OpenGL::Vector<f24, 4>;
@@ -74,7 +75,7 @@ class PICAShader {
 
 protected:
 	std::array<u32, 128> operandDescriptors;
-	std::array<vec4f, 16> tempRegisters; // General purpose registers the shader can use for temp values
+	alignas(16) std::array<vec4f, 16> tempRegisters; // General purpose registers the shader can use for temp values
 	OpenGL::Vector<s32, 2> addrRegister; // Address register
 	bool cmpRegister[2]; // Comparison registers where the result of CMP is stored in
 	u32 loopCounter;
@@ -104,10 +105,10 @@ protected:
 	friend class ShaderJIT;
 	friend class ShaderEmitter;
 
-private:
 	vec4f getSource(u32 source);
 	vec4f& getDest(u32 dest);
 
+private:
 	// Interpreter functions for the various shader functions
 	void add(u32 instruction);
 	void call(u32 instruction);
@@ -193,11 +194,11 @@ public:
 	u32 entrypoint = 0; // Initial shader PC
 	u32 boolUniform;
 	std::array<OpenGL::Vector<u8, 4>, 4> intUniforms;
-	std::array<vec4f, 96> floatUniforms;
+	alignas(16) std::array<vec4f, 96> floatUniforms;
 
-	std::array<vec4f, 16> fixedAttributes; // Fixed vertex attributes
-	std::array<vec4f, 16> inputs; // Attributes passed to the shader
-	std::array<vec4f, 16> outputs;
+	alignas(16) std::array<vec4f, 16> fixedAttributes; // Fixed vertex attributes
+	alignas(16) std::array<vec4f, 16> inputs; // Attributes passed to the shader
+	alignas(16) std::array<vec4f, 16> outputs;
 
 	PICAShader(ShaderType type) : type(type) {}
 
diff --git a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
index bc6bd916..d06b9db2 100644
--- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
+++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
@@ -73,27 +73,117 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
 	switch (opcode) {
 		case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break;
 		default:
-			Helpers::panic("ShaderJIT: Unimplemented PICA opcode %X", opcode);
+			Helpers::panic("Shader JIT: Unimplemented PICA opcode %X", opcode);
 	}
 }
 
-void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 srcReg, u32 index) {
+// Returns the storage inside "shader" backing PICA source register "src": 0x00-0x0F inputs, 0x10-0x1F temps, 0x20-0x7F float uniforms
+const ShaderEmitter::vec4f& ShaderEmitter::getSourceRef(const PICAShader& shader, u32 src) {
 
+	if (src < 0x10) {
+		return shader.inputs[src];
+	} else if (src < 0x20) {
+		return shader.tempRegisters[src - 0x10];
+	} else if (src <= 0x7f) {
+		return shader.floatUniforms[src - 0x20];
+	}
+
+	// The JIT turns the returned reference into an offset from "shader", so a dummy vector living outside of it can't serve as a fallback
+	Helpers::panic("[Shader JIT] Unimplemented source value: %X", src);
+}
+
+// Returns the storage inside "shader" backing PICA destination register "dest": 0x00-0x0F are outputs, 0x10-0x1F are temp registers
+const ShaderEmitter::vec4f& ShaderEmitter::getDestRef(const PICAShader& shader, u32 dest) {
+	if (dest >= 0x20) {
+		Helpers::panic("[Shader JIT] Unimplemented dest: %X", dest);
+	}
+	const bool isOutput = dest < 0x10;
+	return isOutput ? shader.outputs[dest] : shader.tempRegisters[dest - 0x10];
+}
+
+// Emits code that loads PICA source register "src" into the xmm register "dest", applying the source's swizzle from the operand descriptor
+// sourceIndex (1-3) selects which of the descriptor's negate/swizzle fields to decode. See shader.hpp header for docs on how they work
+template <int sourceIndex>
+void ShaderEmitter::loadRegister(Xmm dest, const PICAShader& shader, u32 src, u32 index, u32 operandDescriptor) {
+	static_assert(sourceIndex >= 1 && sourceIndex <= 3, "PICA instructions only have sources 1, 2 and 3");
+
+	u32 compSwizzle; // Component swizzle pattern for the register
+	bool negate;     // If true, negate all lanes of the register
+
+	if constexpr (sourceIndex == 1) { // SRC1
+		negate = ((operandDescriptor >> 4) & 1) != 0;
+		compSwizzle = (operandDescriptor >> 5) & 0xff;
+	}
+	else if constexpr (sourceIndex == 2) { // SRC2
+		negate = ((operandDescriptor >> 13) & 1) != 0;
+		compSwizzle = (operandDescriptor >> 14) & 0xff;
+	}
+	else { // SRC3
+		negate = ((operandDescriptor >> 22) & 1) != 0;
+		compSwizzle = (operandDescriptor >> 23) & 0xff;
+	}
+
+	// Neither negation nor address-register indexing is implemented yet. Check for them up front, before any code is emitted,
+	// so that an unsupported operand can never be silently loaded without its negation/indexing applied
+	if (negate) {
+		Helpers::panic("[ShaderJIT] Unimplemented register negation");
+	}
+	if (index != 0) {
+		Helpers::panic("[ShaderJIT]: Unimplemented source index type");
+	}
+
+	// PICA has the swizzle descriptor inverted in comparison to x86. For the PICA, the descriptor is (lowest to highest bits) wzyx while it's xyzw for x86
+	const u32 convertedSwizzle = ((compSwizzle >> 6) & 0b11) | (((compSwizzle >> 4) & 0b11) << 2) | (((compSwizzle >> 2) & 0b11) << 4) | ((compSwizzle & 0b11) << 6);
+
+	// Index 0: keep src as is, no need to offset it
+	const vec4f& srcRef = getSourceRef(shader, src);
+	const uintptr_t offset = uintptr_t(&srcRef) - uintptr_t(&shader); // Calculate offset of register from start of the state struct
+
+	if (compSwizzle == noSwizzle) { // Avoid emitting swizzle if not necessary
+		movaps(dest, xword[statePointer + offset]);
+	} else { // Swizzle is not trivial so we need to emit a shuffle instruction
+		pshufd(dest, xword[statePointer + offset], convertedSwizzle);
+	}
+}
+
+// Emits code that writes the xmm register "source" to PICA destination register "dest", touching only the lanes enabled in the
+// operand descriptor's write mask. "source" is always read before a scratch register is overwritten, so it may alias scratch1/scratch2
+void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest, u32 operandDescriptor) {
+	const vec4f& destRef = getDestRef(shader, dest);
+	const uintptr_t offset = uintptr_t(&destRef) - uintptr_t(&shader); // Calculate offset of register from start of the state struct
+
+	// Mask of which lanes to write
+	const u32 writeMask = operandDescriptor & 0xf;
+	if (writeMask == 0xf) { // No lanes are masked, just movaps
+		movaps(xword[statePointer + offset], source);
+	} else if (haveSSE4_1) {
+		// Bit reverse the write mask because that is what blendps expects, then invert it to get the lanes that keep their old value
+		const u32 adjustedMask = ((writeMask >> 3) & 0b1) | ((writeMask >> 1) & 0b10) | ((writeMask << 1) & 0b100) | ((writeMask << 3) & 0b1000);
+		const u32 keepMask = ~adjustedMask & 0xf;
+		movaps(scratch2, source);                                   // Copy source first, in case it aliases a scratch register
+		blendps(scratch2, xword[statePointer + offset], keepMask);  // Pull the lanes that are not written from the current value of dest
+		movaps(xword[statePointer + offset], scratch2);             // Write back
+	} else {
+		// Blend algo referenced from Citra
+		const u8 selector = (((writeMask & 0b1000) ? 1 : 0) << 0) | (((writeMask & 0b0100) ? 3 : 2) << 2) |
+			(((writeMask & 0b0010) ? 0 : 1) << 4) | (((writeMask & 0b0001) ? 2 : 3) << 6);
+		movaps(scratch2, source);                          // Copy source first, in case it aliases scratch1
+		movaps(scratch1, xword[statePointer + offset]);    // Read current value of dest
+		unpcklps(scratch1, scratch2);                      // scratch1 = { dest.x, source.x, dest.y, source.y }
+		unpckhps(scratch2, xword[statePointer + offset]);  // scratch2 = { source.z, dest.z, source.w, dest.w }
+		shufps(scratch1, scratch2, selector);              // "merge-shuffle" dest and source using selector
+		movaps(xword[statePointer + offset], scratch1);    // Write back
+	}
 }
 
 void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) {
-    /*
 	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
 	u32 src = (instruction >> 12) & 0x7f;
 	const u32 idx = (instruction >> 19) & 3;
 	const u32 dest = (instruction >> 21) & 0x1f;
 
-	src = getIndexedSource(src, idx);
-	vec4f srcVector = getSourceSwizzled<1>(src, operandDescriptor);
-	vec4f& destVector = getDest(dest);
-
-	u32 componentMask = operandDescriptor & 0xf;
-    */
+	loadRegister<1>(scratch1, shader, src, idx, operandDescriptor); // Load source 1 (swizzled per the operand descriptor) into scratch1
+	storeRegister(scratch1, shader, dest, operandDescriptor); // NOTE(review): the value lives in scratch1 here - verify storeRegister reads its source before overwriting scratch1
 }
 
 #endif
\ No newline at end of file