Rewriting hw vertex fetch

2025-05-24 12:36:25 +12:00 · 2024-09-04 03:18:39 +03:00 · 2024-09-04 03:18:39 +03:00 · 4a39b06262
commit 4a39b06262
parent 15b6a9e2d9
5 changed files with 107 additions and 95 deletions
--- a/include/PICA/draw_acceleration.hpp
+++ b/include/PICA/draw_acceleration.hpp
@@ -6,32 +6,37 @@

 namespace PICA {
 	struct DrawAcceleration {
-		static constexpr u32 maxAttribCount = 12;
+		static constexpr u32 maxAttribCount = 16;
+		static constexpr u32 maxLoaderCount = 12;

 		struct AttributeInfo {
-			u8* data;
 			u32 offset;
-			u32 size;
 			u32 stride;

-			u8 inputReg; // Which input reg should this attribute go to in the vertex shader?
 			u8 type;
 			u8 componentCount;
-			bool fixed;
-			bool isPadding;

 			std::array<float, 4> fixedValue;  // For fixed attributes
 		};

+		struct Loader {
+			// Data to upload for this loader
+			u8* data;
+			usize size;
+		};
+
 		u8* indexBuffer;

 		// Minimum and maximum index in the index buffer for a draw call
 		u16 minimumIndex, maximumIndex;
 		u32 totalAttribCount;
+		u32 totalLoaderCount;
 		u32 enabledAttributeMask;
+		u32 fixedAttributes;
 		u32 vertexDataSize;

 		std::array<AttributeInfo, maxAttribCount> attributeInfo;
+		std::array<Loader, maxLoaderCount> loaders;

 		bool canBeAccelerated;
 		bool indexed;
--- a/src/core/PICA/draw_acceleration.cpp
+++ b/src/core/PICA/draw_acceleration.cpp
@@ -1,5 +1,6 @@
 #include "PICA/draw_acceleration.hpp"

+#include <bit>
 #include <limits>

 #include "PICA/gpu.hpp"
@@ -53,88 +54,94 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 	const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32);
 	const u64 inputAttrCfg = getVertexShaderInputConfig();

-	u32 buffer = 0;
 	u32 attrCount = 0;
+	u32 loaderOffset = 0;
 	accel.vertexDataSize = 0;
+	accel.totalLoaderCount = 0;

-	while (attrCount < totalAttribCount) {
-		bool fixedAttrib = (fixedAttribMask & (1 << attrCount)) != 0;
+	for (int i = 0; i < PICA::DrawAcceleration::maxLoaderCount; i++) {
+		auto& loaderData = attributeInfo[i];  // Get information for this attribute loader

-		// Variable attribute attribute
-		if (!fixedAttrib) {
-			auto& attrData = attributeInfo[buffer];  // Get information for this attribute
-			u64 attrCfg = attrData.getConfigFull();  // Get config1 | (config2 << 32)
+		// This loader is empty, skip it
+		if (loaderData.componentCount == 0 || loaderData.size == 0) {
+			continue;
+		}

-			if (attrData.componentCount != 0) {
-				// Size of the attribute in bytes multiplied by the total number of vertices
-				const u32 bytes = attrData.size * vertexCount;
-				// Add it to the total vertex data size, aligned to 4 bytes.
-				accel.vertexDataSize += (bytes + 3) & ~3;
+		auto& loader = accel.loaders[accel.totalLoaderCount++];
+
+		// The size of the loader in bytes is equal to the bytes supplied for 1 vertex, multiplied by the number of vertices we'll be uploading,
+		// where the number of vertices is equal to maximumIndex - minimumIndex + 1
+		const u32 bytes = loaderData.size * (accel.maximumIndex - accel.minimumIndex + 1);
+		loader.size = bytes;
+
+		// Add it to the total vertex data size, aligned to 4 bytes.
+		accel.vertexDataSize += (bytes + 3) & ~3;
+		
+		// Get a pointer to the data where this loader's data is stored
+		const u32 loaderAddress = vertexBase + loaderData.offset + (accel.minimumIndex * loaderData.size);
+		loader.data = getPointerPhys<u8>(loaderAddress);
+
+		u64 attrCfg = loaderData.getConfigFull();  // Get config1 | (config2 << 32)
+		u32 attributeOffset = 0;
+
+		for (int component = 0; component < loaderData.componentCount; component++) {
+			uint attributeIndex = (attrCfg >> (component * 4)) & 0xf;  // Get index of attribute in vertexCfg
+
+			// Vertex attributes used as padding
+			// 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively
+			if (attributeIndex >= 12) [[unlikely]] {
+				Helpers::panic("Padding attribute");
+				// Align attribute address up to a 4 byte boundary
+				attributeOffset = (attributeOffset + 3) & -4;
+				attributeOffset += (attributeIndex - 11) << 2;
+				continue;
 			}

-			u32 attributeOffset = 0;
-			for (int i = 0; i < attrData.componentCount; i++) {
-				uint index = (attrCfg >> (i * 4)) & 0xf;  // Get index of attribute in vertexCfg
-				auto& attr = accel.attributeInfo[attrCount];
-				attr.fixed = false;
+			const u32 attribInfo = (vertexCfg >> (attributeIndex * 4)) & 0xf;
+			const u32 attribType = attribInfo & 0x3;  //  Type of attribute (sbyte/ubyte/short/float)
+			const u32 size = (attribInfo >> 2) + 1;   // Total number of components

-				// Vertex attributes used as padding
-				// 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively
-				if (index >= 12) [[unlikely]] {
-					Helpers::panic("Padding attribute");
-					// Align attribute address up to a 4 byte boundary
-					attributeOffset = (attributeOffset + 3) & -4;
-					attributeOffset += (index - 11) << 2;
+			// Size of each component based on the attribute type
+			static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
+			const u32 inputReg = (inputAttrCfg >> (attributeIndex * 4)) & 0xf;
+			// Mark the attribute as enabled
+			accel.enabledAttributeMask |= 1 << inputReg;

-					attr.data = nullptr;
-					attr.isPadding = true;
-					continue;
-				}
+			auto& attr = accel.attributeInfo[inputReg];
+			attr.componentCount = size;
+			attr.offset = attributeOffset + loaderOffset;
+			attr.stride = loaderData.size;
+			attr.type = attribType;
+			attributeOffset += size * sizePerComponent[attribType];
+		}

-				const u32 attribInfo = (vertexCfg >> (index * 4)) & 0xf;
-				const u32 attribType = attribInfo & 0x3;  //  Type of attribute (sbyte/ubyte/short/float)
-				const u32 size = (attribInfo >> 2) + 1;   // Total number of components
-			
-				// Size of each component based on the attribute type
-				static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
-				const u32 inputReg = (inputAttrCfg >> (attrCount * 4)) & 0xf;
-				// Mark the attribute as enabled
-				accel.enabledAttributeMask |= 1 << inputReg;
+		loaderOffset += loader.size;
+	}

-				// Get a pointer to the data where this attribute is stored
-				const u32 attrAddress = vertexBase + attributeOffset + attrData.offset + (accel.minimumIndex * attrData.size);
+	u32 fixedAttributes = fixedAttribMask;
+	accel.fixedAttributes = 0;

-				attr.data = getPointerPhys<u8>(attrAddress);
-				attr.inputReg = inputReg;
-				attr.componentCount = size;
-				attr.offset = attributeOffset;
-				attr.size = size * sizePerComponent[attribType];
-				attr.stride = attrData.size;
-				attr.type = attribType;
-				attr.isPadding = false;
-				attributeOffset += attr.size;
+	// Fetch values for all fixed attributes using CTZ (count trailing zeros) on the fixed attribute mask to find the attributes that are actually fixed
+	while (fixedAttributes != 0) {
+		// Get index of next fixed attribute and turn it off
+		const u32 index = std::countr_zero<u32>(fixedAttributes);
+		const u32 mask = 1u << index;
+		fixedAttributes ^= mask;

-				attrCount += 1;
-			}
+		// PICA register this fixed attribute is meant to go to
+		const u32 inputReg = (inputAttrCfg >> (index * 4)) & 0xf;
+		const u32 inputRegMask = 1u << inputReg;

-			buffer += 1;
-		} else {
-			vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount];
-			auto& attr = accel.attributeInfo[attrCount];
+		// If this input reg is already used for a non-fixed attribute then it will not be replaced by a fixed attribute
+		if ((accel.enabledAttributeMask & inputRegMask) == 0) {
+			vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[index];
+			auto& attr = accel.attributeInfo[inputReg];

-			attr.fixed = true;
-			// Set the data pointer to nullptr in order to catch any potential bugs
-			attr.data = nullptr;
-			attr.isPadding = false;
+			accel.fixedAttributes |= inputRegMask;

 			for (int i = 0; i < 4; i++) {
 				attr.fixedValue[i] = fixedAttr[i].toFloat32();
 			}
-
-			const u32 inputReg = (inputAttrCfg >> (attrCount * 4)) & 0xf;
-
-			attr.inputReg = inputReg;
-			attrCount += 1;
 		}
 	}

--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@@ -337,8 +337,6 @@ void GPU::drawArrays() {
 					}

 					// Fill the remaining attribute lanes with default parameters (1.0 for alpha/w, 0.0) for everything else
-					// Corgi does this although I'm not sure if it's actually needed for anything.
-					// TODO: Find out
 					while (component < 4) {
 						attribute[component] = (component == 3) ? f24::fromFloat32(1.0) : f24::fromFloat32(0.0);
 						component++;
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -508,7 +508,7 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 		OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
 	} else {
 		if (performIndexedRender) {
-			// When doing indexed rendering, bind the EBO and use glDrawRangeElementsBaseVertex to issue the indexed draw
+			// When doing indexed rendering, use glDrawRangeElementsBaseVertex to issue the indexed draw
 			hwIndexBuffer->Bind();
 			glDrawRangeElementsBaseVertex(
 				primitiveTopology, minimumIndex, maximumIndex, GLsizei(vertices.size()), usingShortIndices ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE,
@@ -1165,12 +1165,13 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
 	hwVertexBuffer->Bind();
 	auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize);
 	u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer);
+	const u32 vertexBufferOffset = vertexBufferRes.buffer_offset;

 	gl.bindVAO(hwShaderVAO);

 	// Enable or disable vertex attributes as needed
 	const u32 currentAttributeMask = accel->enabledAttributeMask;
-	// Use bitwise xor to calculate which attributes chanced
+	// Use bitwise xor to calculate which attributes changed
 	u32 attributeMaskDiff = currentAttributeMask ^ previousAttributeMask;
 	
 	while (attributeMaskDiff != 0) {
@@ -1190,29 +1191,30 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele

 	previousAttributeMask = currentAttributeMask;

-	for (int i = 0; i < totalAttribCount; i++) {
-		const auto& attrib = accel->attributeInfo[i];
+	// Upload the data for each (enabled) attribute loader into our vertex buffer
+	for (int i = 0; i < accel->totalLoaderCount; i++) {
+		auto& loader = accel->loaders[i];

-		if (attrib.fixed) {
-			if ((currentAttributeMask & (1u << i)) == 0) {
-				glVertexAttrib4f(attrib.inputReg, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]);
-			}
-		} else {
-			if (attrib.isPadding) [[unlikely]] {
-				continue;
-			}
-	
-			const u32 attributeSize = attrib.size * vertexCount;
-			std::memcpy(vertexData, attrib.data, attributeSize);
-
-			vertexData += attributeSize;
-
-			glVertexAttribPointer(
-				attrib.inputReg, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride,
-				reinterpret_cast<GLvoid*>(vertexBufferRes.buffer_offset + attrib.offset)
-			);
-		}
+		std::memcpy(vertexData, loader.data, loader.size);
+		vertexData += loader.size;
 	}

 	hwVertexBuffer->Unmap(accel->vertexDataSize);
+
+	// Iterate over the 16 PICA input registers and configure how they should be fetched.
+	for (int i = 0; i < 16; i++) {
+		const auto& attrib = accel->attributeInfo[i];
+		const u32 attributeMask = 1u << i;
+
+		if (accel->fixedAttributes & attributeMask) {
+			// This is a fixed attribute, so set its fixed value
+			// TODO: Don't update these if the value does not change, it generates way too many calls
+			glVertexAttrib4f(i, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]);
+		} else if (accel->enabledAttributeMask & attributeMask) {
+			glVertexAttribPointer(
+				i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride,
+				reinterpret_cast<GLvoid*>(vertexBufferOffset + attrib.offset)
+			);
+		}
+	}
 }
--- a/third_party/duckstation/gl/stream_buffer.cpp
+++ b/third_party/duckstation/gl/stream_buffer.cpp
@@ -149,7 +149,7 @@ namespace {
 			const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);
 			for (; m_available_block_index < end; m_available_block_index++) {
 				if (!m_sync_objects[m_used_block_index]) [[unlikely]] {
-					Helpers::warn("GL stream buffer: Fence slot we're trying to wait on in not in use");
+					Helpers::warn("GL stream buffer: Fence slot we're trying to wait on is not in use");
 				}

 				WaitForSync(m_sync_objects[m_available_block_index]);