Merge pull request #98 from Wunkolo/modular-gl

Allow conditional OpenGL rendering backend
2025-07-19 05:22:27 +12:00 · 2023-07-16 03:48:07 +03:00 · 2023-07-16 03:48:07 +03:00 · 786c3e8a5c
commit 786c3e8a5c
parent 8f91b99672 a601686cb1
18 changed files with 545 additions and 407 deletions
--- a/include/PICA/dynapica/shader_rec.hpp
+++ b/include/PICA/dynapica/shader_rec.hpp
@ -21,7 +21,7 @@ class ShaderJIT {
 	ShaderCache cache;
 #endif

-public:
+  public:
 #ifdef PANDA3DS_SHADER_JIT_SUPPORTED
 	// Call this before starting to process a batch of vertices
 	// This will read the PICA config (uploaded shader and shader operand descriptors) and search if we've already compiled this shader
@ -29,9 +29,7 @@ public:
 	// The caller must make sure the entrypoint has been properly set beforehand
 	void prepare(PICAShader& shaderUnit);
 	void reset();
-	void run(PICAShader& shaderUnit) {
-		prologueCallback(shaderUnit, entrypointCallback);
-	}
+	void run(PICAShader& shaderUnit) { prologueCallback(shaderUnit, entrypointCallback); }

 	static constexpr bool isAvailable() { return true; }
 #else
@ -44,7 +42,7 @@ public:
 	}

 	// Define dummy callback. This should never be called if the shader JIT is not supported
-	using Callback = void(*)(PICAShader& shaderUnit);
+	using Callback = void (*)(PICAShader& shaderUnit);
 	Callback activeShaderCallback = nullptr;

 	void reset() {}
--- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp
+++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp
@ -2,17 +2,17 @@

 // Only do anything if we're on an x64 target with JIT support enabled
 #if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_X64_HOST)
-#include "helpers.hpp"
-#include "logger.hpp"
-#include "PICA/shader.hpp"
-#include "xbyak/xbyak.h"
-#include "xbyak/xbyak_util.h"
-#include "x64_regs.hpp"
-
 #include <vector>

+#include "PICA/shader.hpp"
+#include "helpers.hpp"
+#include "logger.hpp"
+#include "x64_regs.hpp"
+#include "xbyak/xbyak.h"
+#include "xbyak/xbyak_util.h"
+
 class ShaderEmitter : public Xbyak::CodeGenerator {
-	static constexpr size_t executableMemorySize = PICAShader::maxInstructionCount * 96; // How much executable memory to alloc for each shader
+	static constexpr size_t executableMemorySize = PICAShader::maxInstructionCount * 96;  // How much executable memory to alloc for each shader
 	// Allocate some extra space as padding for security purposes in the extremely unlikely occasion we manage to overflow the above size
 	static constexpr size_t allocSize = executableMemorySize + 0x1000;

@ -20,7 +20,7 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 	static constexpr uint noSwizzle = 0x1B;

 	using f24 = Floats::f24;
-	using vec4f = OpenGL::Vector<f24, 4>;
+	using vec4f = std::array<f24, 4>;

 	// An array of labels (incl pointers) to each compiled (to x64) PICA instruction
 	std::array<Xbyak::Label, PICAShader::maxInstructionCount> instructionLabels;
@ -33,8 +33,8 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 	// Vector value of (1.0, 1.0, 1.0, 1.0) for SLT(i)/SGE(i)
 	Label onesVector;

-	u32 recompilerPC = 0; // PC the recompiler is currently recompiling @
-	u32 loopLevel = 0;    // The current loop nesting level (0 = not in a loop)
+	u32 recompilerPC = 0;  // PC the recompiler is currently recompiling @
+	u32 loopLevel = 0;     // The current loop nesting level (0 = not in a loop)

 	bool haveSSE4_1 = false;  // Shows if the CPU supports SSE4.1
 	bool haveAVX = false;     // Shows if the CPU supports AVX (NOT AVX2, NOT AVX512. Regular AVX)
@ -116,10 +116,12 @@ class ShaderEmitter : public Xbyak::CodeGenerator {

 	MAKE_LOG_FUNCTION(log, shaderJITLogger)

-public:
-	using InstructionCallback = const void(*)(PICAShader& shaderUnit); // Callback type used for instructions
+  public:
+	// Callback type used for instructions
+	using InstructionCallback = const void (*)(PICAShader& shaderUnit);
 	// Callback type used for the JIT prologue. This is what the caller will call
-	using PrologueCallback = const void(*)(PICAShader& shaderUnit, InstructionCallback cb);
+	using PrologueCallback = const void (*)(PICAShader& shaderUnit, InstructionCallback cb);
+
 	PrologueCallback prologueCb = nullptr;

 	// Initialize our emitter with "allocSize" bytes of RWX memory
@ -134,7 +136,7 @@ public:
 			Helpers::panic("This CPU does not support SSE3. Please use the shader interpreter instead");
 		}
 	}
-	
+
 	void compile(const PICAShader& shaderUnit);

 	// PC must be a valid entrypoint here. It doesn't have that much overhead in this case, so we use std::array<>::at() to assert it does
@ -144,9 +146,7 @@ public:
 		return reinterpret_cast<InstructionCallback>(ptr);
 	}

-	PrologueCallback getPrologueCallback() {
-		return prologueCb;
-	}
+	PrologueCallback getPrologueCallback() { return prologueCb; }
 };

-#endif // x64 recompiler check
+#endif  // x64 recompiler check
--- a/include/PICA/gpu.hpp
+++ b/include/PICA/gpu.hpp
@ -1,39 +1,39 @@
 #pragma once
 #include <array>

+#include "PICA/dynapica/shader_rec.hpp"
+#include "PICA/float_types.hpp"
+#include "PICA/pica_vertex.hpp"
+#include "PICA/regs.hpp"
+#include "PICA/shader_unit.hpp"
 #include "config.hpp"
 #include "helpers.hpp"
 #include "logger.hpp"
 #include "memory.hpp"
-#include "PICA/float_types.hpp"
-#include "PICA/regs.hpp"
-#include "PICA/shader_unit.hpp"
-#include "PICA/dynapica/shader_rec.hpp"
-#include "renderer_gl/renderer_gl.hpp"
-#include "PICA/pica_vertex.hpp"
+#include "renderer.hpp"

 class GPU {
 	static constexpr u32 regNum = 0x300;
-	using vec4f = OpenGL::Vector<Floats::f24, 4>;
+	using vec4f = std::array<Floats::f24, 4>;
 	using Registers = std::array<u32, regNum>;

 	Memory& mem;
 	EmulatorConfig& config;
 	ShaderUnit shaderUnit;
-	ShaderJIT shaderJIT; // Doesn't do anything if JIT is disabled or not supported
+	ShaderJIT shaderJIT;  // Doesn't do anything if JIT is disabled or not supported

 	u8* vram = nullptr;
 	MAKE_LOG_FUNCTION(log, gpuLogger)

-	static constexpr u32 maxAttribCount = 12; // Up to 12 vertex attributes
+	static constexpr u32 maxAttribCount = 12;  // Up to 12 vertex attributes
 	static constexpr u32 vramSize = u32(6_MB);
-	Registers regs; // GPU internal registers
-	std::array<vec4f, 16> currentAttributes; // Vertex attributes before being passed to the shader
+	Registers regs;                           // GPU internal registers
+	std::array<vec4f, 16> currentAttributes;  // Vertex attributes before being passed to the shader

-	std::array<vec4f, 16> immediateModeAttributes; // Vertex attributes uploaded via immediate mode submission
+	std::array<vec4f, 16> immediateModeAttributes;  // Vertex attributes uploaded via immediate mode submission
 	std::array<PICA::Vertex, 3> immediateModeVertices;
 	uint immediateModeVertIndex;
-	uint immediateModeAttrIndex; // Index of the immediate mode attribute we're uploading
+	uint immediateModeAttrIndex;  // Index of the immediate mode attribute we're uploading

 	template <bool indexed, bool useShaderJIT>
 	void drawArrays();
@ -42,35 +42,33 @@ class GPU {
 	void drawArrays(bool indexed);

 	struct AttribInfo {
-		u32 offset = 0; // Offset from base vertex array
-		int size = 0; // Bytes per vertex
+		u32 offset = 0;  // Offset from base vertex array
+		int size = 0;    // Bytes per vertex
 		u32 config1 = 0;
 		u32 config2 = 0;
-		u32 componentCount = 0; // Number of components for the attribute
+		u32 componentCount = 0;  // Number of components for the attribute

-		u64 getConfigFull() {
-			return u64(config1) | (u64(config2) << 32);
-		}
+		u64 getConfigFull() { return u64(config1) | (u64(config2) << 32); }
 	};

 	u64 getVertexShaderInputConfig() {
 		return u64(regs[PICA::InternalRegs::VertexShaderInputCfgLow]) | (u64(regs[PICA::InternalRegs::VertexShaderInputCfgHigh]) << 32);
 	}

-	std::array<AttribInfo, maxAttribCount> attributeInfo; // Info for each of the 12 attributes
-	u32 totalAttribCount = 0; // Number of vertex attributes to send to VS
-	u32 fixedAttribMask = 0; // Which attributes are fixed?
-	
-	u32 fixedAttribIndex = 0; // Which fixed attribute are we writing to ([0, 11] range)
-	u32 fixedAttribCount = 0; // How many attribute components have we written? When we get to 4 the attr will actually get submitted
-	std::array<u32, 3> fixedAttrBuff; // Buffer to hold fixed attributes in until they get submitted
+	std::array<AttribInfo, maxAttribCount> attributeInfo;  // Info for each of the 12 attributes
+	u32 totalAttribCount = 0;                              // Number of vertex attributes to send to VS
+	u32 fixedAttribMask = 0;                               // Which attributes are fixed?
+
+	u32 fixedAttribIndex = 0;          // Which fixed attribute are we writing to ([0, 11] range)
+	u32 fixedAttribCount = 0;          // How many attribute components have we written? When we get to 4 the attr will actually get submitted
+	std::array<u32, 3> fixedAttrBuff;  // Buffer to hold fixed attributes in until they get submitted

 	// Command processor pointers for GPU command lists
 	u32* cmdBuffStart = nullptr;
 	u32* cmdBuffEnd = nullptr;
 	u32* cmdBuffCurr = nullptr;

-	Renderer renderer;
+	std::unique_ptr<Renderer> renderer;
 	PICA::Vertex getImmediateModeVertex();

  public:
@ -84,11 +82,10 @@ class GPU {
 	// Set to false by the renderer when the lighting_lut is uploaded to the GPU
 	bool lightingLUTDirty = false;

-	GPU(Memory& mem, GLStateManager& gl, EmulatorConfig& config);
-	void initGraphicsContext() { renderer.initGraphicsContext(); }
-	void getGraphicsContext() { renderer.getGraphicsContext(); }
-	void display() { renderer.display(); }
-	void screenshot(const std::string& name) { renderer.screenshot(name); }
+	GPU(Memory& mem, EmulatorConfig& config);
+	void initGraphicsContext() { renderer->initGraphicsContext(); }
+	void display() { renderer->display(); }
+	void screenshot(const std::string& name) { renderer->screenshot(name); }

 	void fireDMA(u32 dest, u32 source, u32 size);
 	void reset();
@ -107,13 +104,13 @@ class GPU {
 	// TODO: Emulate the transfer engine & its registers
 	// Then this can be emulated by just writing the appropriate values there
 	void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) {
-		renderer.clearBuffer(startAddress, endAddress, value, control);
+		renderer->clearBuffer(startAddress, endAddress, value, control);
 	}

 	// TODO: Emulate the transfer engine & its registers
 	// Then this can be emulated by just writing the appropriate values there
 	void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) {
-		renderer.displayTransfer(inputAddr, outputAddr, inputSize, outputSize, flags);
+		renderer->displayTransfer(inputAddr, outputAddr, inputSize, outputSize, flags);
 	}

 	// Read a value of type T from physical address paddr
--- a/include/PICA/shader.hpp
+++ b/include/PICA/shader.hpp
@ -2,13 +2,14 @@
 #include <algorithm>
 #include <array>
 #include <cstring>
-#include "helpers.hpp"
-#include "opengl.hpp"
+
 #include "PICA/float_types.hpp"
 #include "PICA/pica_hash.hpp"
+#include "helpers.hpp"

 enum class ShaderType {
-	Vertex, Geometry
+	Vertex,
+	Geometry,
 };

 namespace ShaderOpcodes {
@ -46,66 +47,66 @@ namespace ShaderOpcodes {
 		SETEMIT = 0x2B,
 		JMPC = 0x2C,
 		JMPU = 0x2D,
-		CMP1 = 0x2E, // Both of these instructions are CMP
+		CMP1 = 0x2E,  // Both of these instructions are CMP
 		CMP2 = 0x2F,
-		MAD = 0x38 // Everything between 0x38-0x3F is a MAD but fuck it
+		MAD = 0x38  // Everything between 0x38-0x3F is a MAD but fuck it
 	};
 }

 // Note: All PICA f24 vec4 registers must have the alignas(16) specifier to make them easier to access in SSE/NEON code in the JIT
 class PICAShader {
 	using f24 = Floats::f24;
-	using vec4f = OpenGL::Vector<f24, 4>;
+	using vec4f = std::array<f24, 4>;

 	struct Loop {
-		u32 startingPC; // PC at the start of the loop
-		u32 endingPC;   // PC at the end of the loop
-		u32 iterations; // How many iterations of the loop to run
-		u32 increment;  // How much to increment the loop counter after each iteration
+		u32 startingPC;  // PC at the start of the loop
+		u32 endingPC;    // PC at the end of the loop
+		u32 iterations;  // How many iterations of the loop to run
+		u32 increment;   // How much to increment the loop counter after each iteration
 	};

 	// Info for ifc/ifu stack
 	struct ConditionalInfo {
-		u32 endingPC; // PC at the end of the if block (= DST)
-		u32 newPC; // PC after the if block is done executing (= DST + NUM)
+		u32 endingPC;  // PC at the end of the if block (= DST)
+		u32 newPC;     // PC after the if block is done executing (= DST + NUM)
 	};

 	struct CallInfo {
-		u32 endingPC; // PC at the end of the function
-		u32 returnPC; // PC to return to after the function ends
+		u32 endingPC;  // PC at the end of the function
+		u32 returnPC;  // PC to return to after the function ends
 	};

-	int bufferIndex; // Index of the next instruction to overwrite for shader uploads
-	int opDescriptorIndex; // Index of the next operand descriptor we'll overwrite
-	u32 floatUniformIndex = 0; // Which float uniform are we writing to? ([0, 95] range)
-	u32 floatUniformWordCount = 0; // How many words have we buffered for the current uniform transfer?
-	bool f32UniformTransfer = false; // Are we transferring an f32 uniform or an f24 uniform?
+	int bufferIndex;                  // Index of the next instruction to overwrite for shader uploads
+	int opDescriptorIndex;            // Index of the next operand descriptor we'll overwrite
+	u32 floatUniformIndex = 0;        // Which float uniform are we writing to? ([0, 95] range)
+	u32 floatUniformWordCount = 0;    // How many words have we buffered for the current uniform transfer?
+	bool f32UniformTransfer = false;  // Are we transferring an f32 uniform or an f24 uniform?

-	std::array<u32, 4> floatUniformBuffer; // Buffer for temporarily caching float uniform data
+	std::array<u32, 4> floatUniformBuffer;  // Buffer for temporarily caching float uniform data

-public:
+  public:
 	// These are placed close to the temp registers and co because it helps the JIT generate better code
-	u32 entrypoint = 0; // Initial shader PC
+	u32 entrypoint = 0;  // Initial shader PC
 	u32 boolUniform;
-	std::array<OpenGL::Vector<u8, 4>, 4> intUniforms;
+	std::array<std::array<u8, 4>, 4> intUniforms;
 	alignas(16) std::array<vec4f, 96> floatUniforms;

-	alignas(16) std::array<vec4f, 16> fixedAttributes; // Fixed vertex attributes
-	alignas(16) std::array<vec4f, 16> inputs; // Attributes passed to the shader
+	alignas(16) std::array<vec4f, 16> fixedAttributes;  // Fixed vertex attributes
+	alignas(16) std::array<vec4f, 16> inputs;           // Attributes passed to the shader
 	alignas(16) std::array<vec4f, 16> outputs;
-	alignas(16) vec4f dummy = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() }); // Dummy register used by the JIT
+	alignas(16) vec4f dummy = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()});  // Dummy register used by the JIT

-protected:
+  protected:
 	std::array<u32, 128> operandDescriptors;
-	alignas(16) std::array<vec4f, 16> tempRegisters; // General purpose registers the shader can use for temp values
-	OpenGL::Vector<s32, 2> addrRegister; // Address register
-	bool cmpRegister[2]; // Comparison registers where the result of CMP is stored in
+	alignas(16) std::array<vec4f, 16> tempRegisters;  // General purpose registers the shader can use for temp values
+	std::array<s32, 2> addrRegister;                  // Address register
+	bool cmpRegister[2];                              // Comparison registers where the result of CMP is stored in
 	u32 loopCounter;

-	u32 pc = 0; // Program counter: Index of the next instruction we're going to execute
-	u32 loopIndex = 0; // The index of our loop stack (0 = empty, 4 = full)
-	u32 ifIndex = 0; // The index of our IF stack
-	u32 callIndex = 0; // The index of our CALL stack
+	u32 pc = 0;         // Program counter: Index of the next instruction we're going to execute
+	u32 loopIndex = 0;  // The index of our loop stack (0 = empty, 4 = full)
+	u32 ifIndex = 0;    // The index of our IF stack
+	u32 callIndex = 0;  // The index of our CALL stack

 	std::array<Loop, 4> loopInfo;
 	std::array<ConditionalInfo, 8> conditionalInfo;
@ -117,7 +118,7 @@ protected:
 	// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
 	using Hash = PICAHash::HashType;

-	Hash lastCodeHash = 0; // Last hash computed for the shader code (Used for the JIT caching mechanism)
+	Hash lastCodeHash = 0;    // Last hash computed for the shader code (Used for the JIT caching mechanism)
 	Hash lastOpdescHash = 0;  // Last hash computed for the operand descriptors (Also used for the JIT)

 	bool codeHashDirty = false;
@ -130,7 +131,7 @@ protected:
 	vec4f getSource(u32 source);
 	vec4f& getDest(u32 dest);

-private:
+  private:
 	// Interpreter functions for the various shader functions
 	void add(u32 instruction);
 	void call(u32 instruction);
@ -171,13 +172,13 @@ private:
 		bool negate;

 		using namespace Helpers;
-		if constexpr (sourceIndex == 1) { // SRC1
+		if constexpr (sourceIndex == 1) {  // SRC1
 			negate = (getBit<4>(opDescriptor)) != 0;
 			compSwizzle = getBits<5, 8>(opDescriptor);
-		} else if constexpr (sourceIndex == 2) { // SRC2
+		} else if constexpr (sourceIndex == 2) {  // SRC2
 			negate = (getBit<13>(opDescriptor)) != 0;
 			compSwizzle = getBits<14, 8>(opDescriptor);
-		} else if constexpr (sourceIndex == 3) { // SRC3
+		} else if constexpr (sourceIndex == 3) {  // SRC3
 			negate = (getBit<22>(opDescriptor)) != 0;
 			compSwizzle = getBits<23, 8>(opDescriptor);
 		}
@ -185,8 +186,8 @@ private:
 		// Iterate through every component of the swizzled vector in reverse order
 		// And get which source component's index to match it with
 		for (int comp = 0; comp < 4; comp++) {
-			int index = compSwizzle & 3; // Get index for this component
-			compSwizzle >>= 2; // Move to next component index
+			int index = compSwizzle & 3;  // Get index for this component
+			compSwizzle >>= 2;            // Move to next component index
 			ret[3 - comp] = source[index];
 		}

@ -212,39 +213,35 @@ private:
 	u8 getIndexedSource(u32 source, u32 index);
 	bool isCondTrue(u32 instruction);

-public:
+  public:
 	static constexpr size_t maxInstructionCount = 4096;
-	std::array<u32, maxInstructionCount> loadedShader; // Currently loaded & active shader
-	std::array<u32, maxInstructionCount> bufferedShader; // Shader to be transferred when the SH_CODETRANSFER_END reg gets written to
+	std::array<u32, maxInstructionCount> loadedShader;    // Currently loaded & active shader
+	std::array<u32, maxInstructionCount> bufferedShader;  // Shader to be transferred when the SH_CODETRANSFER_END reg gets written to

 	PICAShader(ShaderType type) : type(type) {}

 	// These functions are in the header to be inlined more easily, though with LTO I hope I'll be able to move them
-	void finalize() {
-		std::memcpy(&loadedShader[0], &bufferedShader[0], 4096 * sizeof(u32));
-	}
+	void finalize() { std::memcpy(&loadedShader[0], &bufferedShader[0], 4096 * sizeof(u32)); }

-	void setBufferIndex(u32 index) {
-		bufferIndex = index & 0xfff;
-	}
-
-	void setOpDescriptorIndex(u32 index) {
-		opDescriptorIndex = index & 0x7f;
-	}
+	void setBufferIndex(u32 index) { bufferIndex = index & 0xfff; }
+	void setOpDescriptorIndex(u32 index) { opDescriptorIndex = index & 0x7f; }

 	void uploadWord(u32 word) {
-		if (bufferIndex >= 4095) Helpers::panic("o no, shader upload overflew");
+		if (bufferIndex >= 4095) {
+			Helpers::panic("o no, shader upload overflew");
+		}
+
 		bufferedShader[bufferIndex++] = word;
 		bufferIndex &= 0xfff;

-		codeHashDirty = true; // Signal the JIT if necessary that the program hash has potentially changed
+		codeHashDirty = true;  // Signal the JIT if necessary that the program hash has potentially changed
 	}

 	void uploadDescriptor(u32 word) {
 		operandDescriptors[opDescriptorIndex++] = word;
 		opDescriptorIndex &= 0x7f;

-		opdescHashDirty = true; // Signal the JIT if necessary that the program hash has potentially changed
+		opdescHashDirty = true;  // Signal the JIT if necessary that the program hash has potentially changed
 	}

 	void setFloatUniformIndex(u32 word) {
@ -255,23 +252,24 @@ public:

 	void uploadFloatUniform(u32 word) {
 		floatUniformBuffer[floatUniformWordCount++] = word;
-		if (floatUniformIndex >= 96)
+		if (floatUniformIndex >= 96) {
 			Helpers::panic("[PICA] Tried to write float uniform %d", floatUniformIndex);
+		}

 		if ((f32UniformTransfer && floatUniformWordCount >= 4) || (!f32UniformTransfer && floatUniformWordCount >= 3)) {
 			vec4f& uniform = floatUniforms[floatUniformIndex++];
 			floatUniformWordCount = 0;

 			if (f32UniformTransfer) {
-				uniform.x() = f24::fromFloat32(*(float*)&floatUniformBuffer[3]);
-				uniform.y() = f24::fromFloat32(*(float*)&floatUniformBuffer[2]);
-				uniform.z() = f24::fromFloat32(*(float*)&floatUniformBuffer[1]);
-				uniform.w() = f24::fromFloat32(*(float*)&floatUniformBuffer[0]);
+				uniform[0] = f24::fromFloat32(*(float*)&floatUniformBuffer[3]);
+				uniform[1] = f24::fromFloat32(*(float*)&floatUniformBuffer[2]);
+				uniform[2] = f24::fromFloat32(*(float*)&floatUniformBuffer[1]);
+				uniform[3] = f24::fromFloat32(*(float*)&floatUniformBuffer[0]);
 			} else {
-				uniform.x() = f24::fromRaw(floatUniformBuffer[2] & 0xffffff);
-				uniform.y() = f24::fromRaw(((floatUniformBuffer[1] & 0xffff) << 8) | (floatUniformBuffer[2] >> 24));
-				uniform.z() = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16));
-				uniform.w() = f24::fromRaw(floatUniformBuffer[0] >> 8);
+				uniform[0] = f24::fromRaw(floatUniformBuffer[2] & 0xffffff);
+				uniform[1] = f24::fromRaw(((floatUniformBuffer[1] & 0xffff) << 8) | (floatUniformBuffer[2] >> 24));
+				uniform[2] = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16));
+				uniform[3] = f24::fromRaw(floatUniformBuffer[0] >> 8);
 			}
 		}
 	}
@ -280,10 +278,10 @@ public:
 		using namespace Helpers;

 		auto& u = intUniforms[index];
-		u.x() = word & 0xff;
-		u.y() = getBits<8, 8>(word);
-		u.z() = getBits<16, 8>(word);
-		u.w() = getBits<24, 8>(word);
+		u[0] = word & 0xff;
+		u[1] = getBits<8, 8>(word);
+		u[2] = getBits<16, 8>(word);
+		u[3] = getBits<24, 8>(word);
 	}

 	void run();