diff --git a/include/PICA/dynapica/shader_rec_emitter_arm64.hpp b/include/PICA/dynapica/shader_rec_emitter_arm64.hpp
index 4dd21174..671ae120 100644
--- a/include/PICA/dynapica/shader_rec_emitter_arm64.hpp
+++ b/include/PICA/dynapica/shader_rec_emitter_arm64.hpp
@@ -27,6 +27,9 @@ class ShaderEmitter : private oaknut::CodeBlock, public oaknut::CodeGenerator {
 	// Filled before compiling a shader by scanning the code for call instructions
 	std::vector<u32> returnPCs;
 
+	// An array of 128-bit masks for blending registers together to perform masked writes.
+	// Eg for writing only the x and y components, the mask is 0x00000000'00000000'FFFFFFFF'FFFFFFFF
+	oaknut::Label blendMasks;
 	// Vector value of (1.0, 1.0, 1.0, 1.0) for SLT(i)/SGE(i)
 	oaknut::Label onesVector;
 
diff --git a/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp
index 750adc81..43121d86 100644
--- a/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp
+++ b/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp
@@ -15,9 +15,37 @@ static constexpr QReg src1_vec = Q2;
 static constexpr QReg src2_vec = Q3;
 static constexpr QReg src3_vec = Q4;
 
+static constexpr XReg arg1 = X0;
+static constexpr XReg arg2 = X1;
 static constexpr XReg statePointer = X9;
 
 void ShaderEmitter::compile(const PICAShader& shaderUnit) {
+	oaknut::CodeBlock::unprotect();  // Unprotect the memory before writing to it
+
+	// Constants
+	align(16);
+	// Generate blending masks for doing masked writes to registers
+	l(blendMasks);
+	for (int i = 0; i < 16; i++) {
+		dw((i & 0x8) ? 0xFFFFFFFF : 0);  // Mask for x component
+		dw((i & 0x4) ? 0xFFFFFFFF : 0);  // Mask for y component
+		dw((i & 0x2) ? 0xFFFFFFFF : 0);  // Mask for z component
+		dw((i & 0x1) ? 0xFFFFFFFF : 0);  // Mask for w component
+	}
+
+	// Emit prologue first
+	oaknut::Label prologueLabel;
+	align(16);
+
+	l(prologueLabel);
+	prologueCb = prologueLabel.ptr<PrologueCallback>();
+
+	// Set state pointer to the proper pointer
+	// state pointer is volatile, no need to preserve it
+	MOV(statePointer, arg1);
+	// Jump to code with a tail call
+	BR(arg2);
+
 	// Scan the code for call, exp2, log2, etc instructions which need some special care
 	// After that, emit exp2 and log2 functions if the corresponding instructions are present
 	scanCode(shaderUnit);
@@ -30,6 +58,10 @@ void ShaderEmitter::compile(const PICAShader& shaderUnit) {
 	recompilerPC = 0;
 	loopLevel = 0;
 	compileUntil(shaderUnit, PICAShader::maxInstructionCount);
+
+	// Protect the memory and invalidate icache before executing the code
+	oaknut::CodeBlock::protect();
+	oaknut::CodeBlock::invalidate_all();
 }
 
 void ShaderEmitter::scanCode(const PICAShader& shaderUnit) {
@@ -76,17 +108,17 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
 	const u32 opcode = instruction >> 26;
 
 	switch (opcode) {
-		// case ShaderOpcodes::ADD: recADD(shaderUnit, instruction); break;
+		case ShaderOpcodes::ADD: recADD(shaderUnit, instruction); break;
 		// case ShaderOpcodes::CALL: recCALL(shaderUnit, instruction); break;
 		// case ShaderOpcodes::CALLC: recCALLC(shaderUnit, instruction); break;
 		// case ShaderOpcodes::CALLU: recCALLU(shaderUnit, instruction); break;
 		// case ShaderOpcodes::CMP1:
 		// case ShaderOpcodes::CMP2: recCMP(shaderUnit, instruction); break;
-		// case ShaderOpcodes::DP3: recDP3(shaderUnit, instruction); break;
-		// case ShaderOpcodes::DP4: recDP4(shaderUnit, instruction); break;
+		case ShaderOpcodes::DP3: recDP3(shaderUnit, instruction); break;
+		case ShaderOpcodes::DP4: recDP4(shaderUnit, instruction); break;
 		// case ShaderOpcodes::DPH:
 		// case ShaderOpcodes::DPHI: recDPH(shaderUnit, instruction); break;
-		// case ShaderOpcodes::END: recEND(shaderUnit, instruction); break;
+		case ShaderOpcodes::END: recEND(shaderUnit, instruction); break;
 		// case ShaderOpcodes::EX2: recEX2(shaderUnit, instruction); break;
 		// case ShaderOpcodes::FLR: recFLR(shaderUnit, instruction); break;
 		// case ShaderOpcodes::IFC: recIFC(shaderUnit, instruction); break;
@@ -97,42 +129,43 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
 		// case ShaderOpcodes::LOOP: recLOOP(shaderUnit, instruction); break;
 		case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break;
 		// case ShaderOpcodes::MOVA: recMOVA(shaderUnit, instruction); break;
-		// case ShaderOpcodes::MAX: recMAX(shaderUnit, instruction); break;
-		// case ShaderOpcodes::MIN: recMIN(shaderUnit, instruction); break;
-		// case ShaderOpcodes::MUL: recMUL(shaderUnit, instruction); break;
-		case ShaderOpcodes::NOP:
+		case ShaderOpcodes::MAX: recMAX(shaderUnit, instruction); break;
+		case ShaderOpcodes::MIN: recMIN(shaderUnit, instruction); break;
+		case ShaderOpcodes::MUL: recMUL(shaderUnit, instruction); break;
+		case ShaderOpcodes::NOP: break;
+		// case ShaderOpcodes::RCP: recRCP(shaderUnit, instruction); break;
+		case ShaderOpcodes::RSQ: recRSQ(shaderUnit, instruction); break;
+
+		// Unimplemented opcodes that don't seem to actually be used but exist in the binary
+		// EMIT/SETEMIT are used in geometry shaders, however are sometimes found in vertex shaders?
+		// case ShaderOpcodes::EMIT:
+		// case ShaderOpcodes::SETEMIT:
+		//	log("[ShaderJIT] Unknown PICA opcode: %02X\n", opcode);
+		//	emitPrintLog(shaderUnit);
+		//	break;
+
+		// case ShaderOpcodes::BREAK:
+		// case ShaderOpcodes::BREAKC: Helpers::warn("[Shader JIT] Unimplemented BREAK(C) instruction!"); break;
+
+		// We consider both MAD and MADI to be the same instruction and decode which one we actually have in recMAD
+		case 0x30:
+		case 0x31:
+		case 0x32:
+		case 0x33:
+		case 0x34:
+		case 0x35:
+		case 0x36:
+		case 0x37:
+		case 0x38:
+		case 0x39:
+		case 0x3A:
+		case 0x3B:
+		case 0x3C:
+		case 0x3D:
+		case 0x3E:
+		case 0x3F:
+			recMAD(shaderUnit, instruction);
 			break;
-			// case ShaderOpcodes::RCP: recRCP(shaderUnit, instruction); break;
-			// case ShaderOpcodes::RSQ: recRSQ(shaderUnit, instruction); break;
-
-			// Unimplemented opcodes that don't seem to actually be used but exist in the binary
-			// EMIT/SETEMIT are used in geometry shaders, however are sometimes found in vertex shaders?
-			// case ShaderOpcodes::EMIT:
-			// case ShaderOpcodes::SETEMIT:
-			//	log("[ShaderJIT] Unknown PICA opcode: %02X\n", opcode);
-			//	emitPrintLog(shaderUnit);
-			//	break;
-
-			// case ShaderOpcodes::BREAK:
-			// case ShaderOpcodes::BREAKC: Helpers::warn("[Shader JIT] Unimplemented BREAK(C) instruction!"); break;
-
-			// We consider both MAD and MADI to be the same instruction and decode which one we actually have in recMAD
-			// case 0x30:
-			// case 0x31:
-			// case 0x32:
-			// case 0x33:
-			// case 0x34:
-			// case 0x35:
-			// case 0x36:
-			// case 0x37:
-			// case 0x38:
-			// case 0x39:
-			// case 0x3A:
-			// case 0x3B:
-			// case 0x3C:
-			// case 0x3D:
-			// case 0x3E:
-			// case 0x3F: recMAD(shaderUnit, instruction); break;
 
 			// case ShaderOpcodes::SLT:
 			// case ShaderOpcodes::SLTI: recSLT(shaderUnit, instruction); break;
@@ -221,9 +254,165 @@ void ShaderEmitter::storeRegister(QReg source, const PICAShader& shader, u32 des
 	if (writeMask == 0xf) {  // No lanes are masked, just use STR
 		STR(source, statePointer, offset);
 	} else {
-        LDR(scratch1, statePointer, offset); // Load current source
-        Helpers::panic("Unimplemented: Storing to register with blending");
-    }
+		LDR(scratch1, statePointer, offset);                    // Load current value
+		LDR(scratch2, blendMasks.ptr<u8*>() + writeMask * 16);  // Load write mask for blending
+
+		BSL(scratch2.B16(), source.B16(), scratch1.B16());  // Scratch2 = (Source & mask) | (original & ~mask)
+		STR(scratch2, statePointer, offset);                // Write it back
+	}
+}
+
+void ShaderEmitter::recDP3(const PICAShader& shader, u32 instruction) {  // DP3: dest = dot(src1.xyz, src2.xyz), replicated to all lanes
+	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
+	const u32 src1 = getBits<12, 7>(instruction);
+	const u32 src2 = getBits<7, 5>(instruction);
+	const u32 idx = getBits<19, 2>(instruction);
+	const u32 dest = getBits<21, 5>(instruction);
+	const u32 writeMask = getBits<0, 4>(operandDescriptor);
+
+	// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
+	loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
+	loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
+
+	// Multiply first and only then clear the w product. Clearing src1.w before the multiply would let an Inf/NaN in src2.w
+	// produce a NaN product (0 * Inf = NaN under IEEE rules) that poisons the sum, even though a dp3 must ignore w entirely
+	FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());   // Do a piecewise multiplication of the vectors first
+	INS(src1_vec.Selem()[3], WZR);                       // Zero the w product so the reduction below only sums x, y and z
+	FADDP(src1_vec.S4(), src1_vec.S4(), src1_vec.S4());  // Now add the adjacent components together
+	FADDP(src1_vec.toS(), src1_vec.toD().S2());          // Again for the bottom 2 lanes. Now the bottom lane contains the dot product
+
+	if (writeMask != 0x8) {                       // Copy bottom lane to all lanes if we're not simply writing back x
+		DUP(src1_vec.S4(), src1_vec.Selem()[0]);  // src1_vec = src1_vec.xxxx
+	}
+
+	storeRegister(src1_vec, shader, dest, operandDescriptor);
+}
+
+void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {  // DP4: dest = dot(src1, src2) over all 4 lanes, replicated to all lanes
+	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
+	const u32 src1 = getBits<12, 7>(instruction);
+	const u32 src2 = getBits<7, 5>(instruction);
+	const u32 idx = getBits<19, 2>(instruction);  // Only forwarded to the src1 load below; src2 is loaded with index 0
+	const u32 dest = getBits<21, 5>(instruction);
+	const u32 writeMask = getBits<0, 4>(operandDescriptor);
+
+	// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
+	loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
+	loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
+
+	FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());   // Do a piecewise multiplication of the vectors first
+	FADDP(src1_vec.S4(), src1_vec.S4(), src1_vec.S4());  // Now add the adjacent components together
+	FADDP(src1_vec.toS(), src1_vec.toD().S2());          // Again for the bottom 2 lanes. Now the bottom lane contains the dot product
+
+	if (writeMask != 0x8) {                       // Copy bottom lane to all lanes if we're not simply writing back x
+		DUP(src1_vec.S4(), src1_vec.Selem()[0]);  // src1_vec = src1_vec.xxxx
+	}
+
+	storeRegister(src1_vec, shader, dest, operandDescriptor);
+}
+
+void ShaderEmitter::recADD(const PICAShader& shader, u32 instruction) {  // ADD: dest = src1 + src2, lane by lane
+	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
+	const u32 src1 = getBits<12, 7>(instruction);
+	const u32 src2 = getBits<7, 5>(instruction);
+	const u32 idx = getBits<19, 2>(instruction);  // Only forwarded to the src1 load below; src2 is loaded with index 0
+	const u32 dest = getBits<21, 5>(instruction);
+
+	loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
+	loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
+	FADD(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());  // src1_vec = src1_vec + src2_vec
+	storeRegister(src1_vec, shader, dest, operandDescriptor);
+}
+
+void ShaderEmitter::recMAX(const PICAShader& shader, u32 instruction) {  // MAX: dest = max(src1, src2), lane by lane
+	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
+	const u32 src1 = getBits<12, 7>(instruction);
+	const u32 src2 = getBits<7, 5>(instruction);
+	const u32 idx = getBits<19, 2>(instruction);  // Only forwarded to the src1 load below; src2 is loaded with index 0
+	const u32 dest = getBits<21, 5>(instruction);
+
+	loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
+	loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
+	FMAX(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());  // NOTE(review): FMAX yields NaN if either lane is NaN - confirm this matches PICA MAX
+	storeRegister(src1_vec, shader, dest, operandDescriptor);
+}
+
+void ShaderEmitter::recMIN(const PICAShader& shader, u32 instruction) {  // MIN: dest = min(src1, src2), lane by lane
+	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
+	const u32 src1 = getBits<12, 7>(instruction);
+	const u32 src2 = getBits<7, 5>(instruction);
+	const u32 idx = getBits<19, 2>(instruction);  // Only forwarded to the src1 load below; src2 is loaded with index 0
+	const u32 dest = getBits<21, 5>(instruction);
+
+	loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
+	loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
+	FMIN(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());  // NOTE(review): FMIN yields NaN if either lane is NaN - confirm this matches PICA MIN
+	storeRegister(src1_vec, shader, dest, operandDescriptor);
+}
+
+void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) {  // MUL: dest = src1 * src2, lane by lane
+	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
+	const u32 src1 = getBits<12, 7>(instruction);
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 sits in the lower bits (7-11) of the encoding, below src1 (bits 12-18)
+	const u32 idx = getBits<19, 2>(instruction);
+	const u32 dest = getBits<21, 5>(instruction);
+
+	// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
+	loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
+	loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
+	FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
+	storeRegister(src1_vec, shader, dest, operandDescriptor);
+}
+
+void ShaderEmitter::recRSQ(const PICAShader& shader, u32 instruction) {  // RSQ: dest = 1 / sqrt(src.x), replicated to all lanes
+	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
+	const u32 src = getBits<12, 7>(instruction);
+	const u32 idx = getBits<19, 2>(instruction);
+	const u32 dest = getBits<21, 5>(instruction);
+	const u32 writeMask = operandDescriptor & 0xf;
+	constexpr bool useAccurateRSQ = false;  // Compile-time switch: true selects the fsqrt + fdiv path below
+
+	loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor);  // Load the source operand into src1_vec
+
+	// Compute reciprocal square root approximation
+	// TODO: Should this use frsqrte or fsqrt+div? The former is faster but less accurate
+	// PICA RSQ uses f24 precision though, so it'll be inherently inaccurate, and it's likely using an inaccurate approximation too, seeing as
+	// it doesn't have regular sqrt/div instructions.
+	// For now, we default to inaccurate inverse square root
+	if constexpr (useAccurateRSQ) {
+		FMOV(scratch1.S4(), FImm8(0x70));                      // scratch1 = vec4(1.0f)
+		FSQRT(src1_vec.toS(), src1_vec.toS());                 // src1 = sqrt(src1), scalar
+		FDIV(src1_vec.toS(), scratch1.toS(), src1_vec.toS());  // Now invert src1
+	} else {
+		FRSQRTE(src1_vec.toS(), src1_vec.toS());  // Scalar reciprocal square root estimate in a single instruction
+	}
+
+	// If we only write back the x component to the result, we needn't perform a shuffle to do res = res.xxxx
+	// Otherwise we do
+	if (writeMask != 0x8) {                       // Copy bottom lane to all lanes if we're not simply writing back x
+		DUP(src1_vec.S4(), src1_vec.Selem()[0]);  // src1_vec = src1_vec.xxxx
+	}
+
+	storeRegister(src1_vec, shader, dest, operandDescriptor);
+}
+
+void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) {  // MAD/MADI: dest = src1 * src2 + src3, lane by lane
+	const bool isMADI = getBit<29>(instruction) == 0;  // Bit 29 clear => MADI encoding, set => MAD encoding
+
+	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x1f];  // 5-bit descriptor index here (the 2-source opcodes use 7 bits)
+	const u32 src1 = getBits<17, 5>(instruction);
+	const u32 src2 = isMADI ? getBits<12, 5>(instruction) : getBits<10, 7>(instruction);  // 7 bits wide for MAD, 5 bits for MADI
+	const u32 src3 = isMADI ? getBits<5, 7>(instruction) : getBits<5, 5>(instruction);    // 7 bits wide for MADI, 5 bits for MAD
+	const u32 idx = getBits<22, 2>(instruction);
+	const u32 dest = getBits<24, 5>(instruction);
+
+	loadRegister<1>(src1_vec, shader, src1, 0, operandDescriptor);
+	loadRegister<2>(src2_vec, shader, src2, isMADI ? 0 : idx, operandDescriptor);  // idx goes with the 7-bit source: src2 for MAD...
+	loadRegister<3>(src3_vec, shader, src3, isMADI ? idx : 0, operandDescriptor);  // ...and src3 for MADI
+
+	// TODO: Safe PICA multiplication
+	FMLA(src3_vec.S4(), src1_vec.S4(), src2_vec.S4());  // src3_vec += src1_vec * src2_vec
+	storeRegister(src3_vec, shader, dest, operandDescriptor);
 }
 
 void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) {
@@ -236,4 +425,6 @@ void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) {
 	storeRegister(src1_vec, shader, dest, operandDescriptor);
 }
 
+void ShaderEmitter::recEND(const PICAShader& shader, u32 instruction) { RET(); }  // END: emits a RET; neither parameter is used
+
 #endif
\ No newline at end of file