Get first renders working with accelerated draws

2025-07-27 17:10:47 +12:00 · 2024-08-25 17:14:19 +03:00 · 2024-08-25 17:14:19 +03:00 · 5432a5a0d8
commit 5432a5a0d8
parent 33e63f7d7a
6 changed files with 63 additions and 78 deletions
--- a/include/PICA/draw_acceleration.hpp
+++ b/include/PICA/draw_acceleration.hpp
@ -12,6 +12,7 @@ namespace PICA {
 			u8* data;
 			u32 offset;
 			u32 size;
 			u32 stride;
 			u8 type;
 			u8 componentCount;
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@ -62,6 +62,7 @@ class RendererGL final : public Renderer {
 	bool oldDepthmapEnable = false;
 	// Set by prepareDraw, tells us whether the current draw is using hw-accelerated shader
 	bool usingAcceleratedShader = false;
 	bool performIndexedRender = false;
 	// Cached pointer to the current vertex shader when using HW accelerated shaders
 	OpenGL::Shader* generatedVertexShader = nullptr;
--- a/src/core/PICA/draw_acceleration.cpp
+++ b/src/core/PICA/draw_acceleration.cpp
@ -98,6 +98,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 				attr.componentCount = size;
 				attr.offset = attributeOffset;
 				attr.size = size * sizePerComponent[attribType];
 				attr.stride = attrData.size;
 				attr.type = attribType;
 				attr.isPadding = false;
 				attributeOffset += attr.size;
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@ -120,6 +120,8 @@ void GPU::reset() {
 	renderer->reset();
 }
 static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
 // Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
 // And whether we are going to use the shader JIT (second template parameter)
 void GPU::drawArrays(bool indexed) {
@ -134,11 +136,13 @@ void GPU::drawArrays(bool indexed) {
 	const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel);
 	if (hwShaders) {
-		if (indexed) {
+		// Hardware shaders have their own accelerated code path for draws, so they skip everything here
-			drawArrays<true, ShaderExecMode::Hardware>();
+		const PICA::PrimType primType = static_cast<PICA::PrimType>(Helpers::getBits<8, 2>(regs[PICA::InternalRegs::PrimitiveConfig]));
-		} else {
+		// Total # of vertices to render
-			drawArrays<false, ShaderExecMode::Hardware>();
+		const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];
-		}
+
 		// Note: In the hardware shader path the vertices span shouldn't actually be used as the rasterizer will perform its own attribute fetching
 		renderer->drawVertices(primType, std::span(vertices).first(vertexCount));
 	} else {
 		const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
@ -158,33 +162,17 @@ void GPU::drawArrays(bool indexed) {
 	}
 }
 // We need a union here, because unfortunately in CPU shaders we only need to store the vertex shader outputs in the vertex buffer,
 // which consist of 8 vec4 attributes, while with GPU shaders we need to pass all the vertex shader inputs to the GPU, which consist
 // of 16 vec4 attributes
 union PICAVertexBuffer {
 	// Used with CPU shaders
 	std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
 	// Used with GPU shaders. We can have up to 16 attributes per vertex, each attribute with 4 floats
 	std::array<float, Renderer::vertexBufferSize * 16 * 4> vsInputs;
 	PICAVertexBuffer() {}
 };
 static PICAVertexBuffer vertexBuffer;
 template <bool indexed, ShaderExecMode mode>
 void GPU::drawArrays() {
 	if constexpr (mode == ShaderExecMode::JIT) {
 		shaderJIT.prepare(shaderUnit.vs);
 	} else if constexpr (mode == ShaderExecMode::Hardware) {
 		// Hardware shaders have their own accelerated code path for draws, so they're not meant to take this path
 		Helpers::panic("GPU::DrawArrays: Hardware shaders shouldn't take this path!");
 	}
 	// We can have up to 16 attributes, each one consisting of 4 floats
 	constexpr u32 maxAttrSizeInFloats = 16 * 4;
 	auto& vertices = vertexBuffer.vertices;
 	if constexpr (mode != ShaderExecMode::Hardware) {
 		setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]);
 	}
 	// Base address for vertex attributes
 	// The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible
@ -257,15 +245,7 @@ void GPU::drawArrays() {
 			size_t tag = vertexIndex % vertexCacheSize;
 			// Cache hit
 			if (cache.validBits[tag] && cache.ids[tag] == vertexIndex) {
-				if constexpr (mode != ShaderExecMode::Hardware) {
+				vertices[i] = vertices[cache.bufferPositions[tag]];
 					vertices[i] = vertices[cache.bufferPositions[tag]];
 				} else {
 					const u32 cachedBufferPosition = cache.bufferPositions[tag] * maxAttrSizeInFloats;
 					std::memcpy(
 						&vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cachedBufferPosition],
 						sizeof(float) * maxAttrSizeInFloats
 					);
 				}
 				continue;
 			}
@ -370,39 +350,29 @@ void GPU::drawArrays() {
 			}
 		}
-		// Running shader on the CPU instead of the GPU
+		// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
-		if constexpr (mode == ShaderExecMode::Interpreter || mode == ShaderExecMode::JIT) {
+		// Based on the SH_ATTRIBUTES_PERMUTATION registers.
-			// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
+		// I.e., it might map attribute #0 to v2, #1 to v7, etc.
-			// Based on the SH_ATTRIBUTES_PERMUTATION registers.
+		for (int j = 0; j < totalAttribCount; j++) {
-			// Ie it might map attribute #0 to v2, #1 to v7, etc
+			const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
-			for (int j = 0; j < totalAttribCount; j++) {
+			std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
-				const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
+		}
 				std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
 			}
-			if constexpr (mode == ShaderExecMode::JIT) {
+		if constexpr (mode == ShaderExecMode::JIT) {
-				shaderJIT.run(shaderUnit.vs);
+			shaderJIT.run(shaderUnit.vs);
-			} else {
+		} else {
-				shaderUnit.vs.run();
+			shaderUnit.vs.run();
-			}
+		}
-			PICA::Vertex& out = vertices[i];
+		PICA::Vertex& out = vertices[i];
-			// Map shader outputs to fixed function properties
+		// Map shader outputs to fixed function properties
-			const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
+		const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
-			for (int i = 0; i < totalShaderOutputs; i++) {
+		for (int i = 0; i < totalShaderOutputs; i++) {
-				const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i];
+			const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i];
-				for (int j = 0; j < 4; j++) {  // pls unroll
+			for (int j = 0; j < 4; j++) {  // pls unroll
-					const u32 mapping = (config >> (j * 8)) & 0x1F;
+				const u32 mapping = (config >> (j * 8)) & 0x1F;
-					out.raw[mapping] = vsOutputRegisters[i][j];
+				out.raw[mapping] = vsOutputRegisters[i][j];
 				}
 			}
 		} else {  // Using hw shaders and running the shader on the CPU, just write the inputs to the attribute buffer directly
 			float* out = &vertexBuffer.vsInputs[i * maxAttrSizeInFloats];
 			for (int j = 0; j < totalAttribCount; j++) {
 				const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
 				// Multiply mapping * 4 as mapping refers to a vec4 whereas out is an array of floats
 				std::memcpy(&out[mapping * 4], &currentAttributes[j], sizeof(vec4f));
 			}
 		}
 	}
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@ -129,11 +129,6 @@ void RendererGL::initGraphicsContextInternal() {
 	// Initialize the VAO used for hw shaders
 	hwShaderVAO.create();
 	gl.bindVAO(hwShaderVAO);
 	for (int attr = 0; attr < 16; attr++) {
 		hwShaderVAO.setAttributeFloat<float>(attr, 4, sizeof(Vertex) * 2, attr * sizeof(float) * 4);
 		hwShaderVAO.enableAttribute(attr);
 	}
 	dummyVBO.create();
 	dummyVAO.create();
@ -439,8 +434,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 	const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
 	gl.disableScissor();
-	vbo.bind();
+
-	gl.bindVAO(usingAcceleratedShader ? hwShaderVAO : defaultVAO);
+	if (usingAcceleratedShader) {
 		hwVertexBuffer->Bind();
 		gl.bindVAO(hwShaderVAO);
 	} else {
 		vbo.bind();
 		gl.bindVAO(defaultVAO);
 	}
 	gl.enableClipPlane(0);  // Clipping plane 0 is always enabled
 	if (regs[PICA::InternalRegs::ClipEnable] & 1) {
@ -503,15 +504,19 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 	setupStencilTest(stencilEnable);
 	// If we're using hardware shaders, the vertex array works completely differently
 	// And instead of 8 vec4 attributes, each vertex is 16 vec4 attributes. We use a union + aliasing which is not ideal for readability.
 	if (!usingAcceleratedShader) {
 		vbo.bufferVertsSub(vertices);
 		OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
 	} else {
-		glBufferSubData(GL_ARRAY_BUFFER, 0, vertices.size_bytes() * 2, vertices.data());
+		if (performIndexedRender) {
 			// When doing indexed rendering, bind the IBO and use glDrawRangeElementsBaseVertex to issue the indexed draw
 			hwIndexBuffer->Bind();
 			//glDrawRangeElementsBaseVertex();
 		} else {
 			// When doing non-indexed rendering, just use glDrawArrays
 			OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
 		}
 	}
 	OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
 }
 void RendererGL::display() {
@ -1003,6 +1008,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration*
 			// Upload vertex data and index buffer data to our GPU
 			accelerateVertexUpload(shaderUnit, accel);
 			performIndexedRender = accel->indexed;
 		}
 	}
@ -1149,7 +1155,9 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
 	}
 	auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize);
 	u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer);
 	gl.bindVAO(hwShaderVAO);
 	for (int i = 0; i < totalAttribCount; i++) {
 		const auto& attrib = accel->attributeInfo[i];
@ -1161,9 +1169,13 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
 				continue;
 			}
-			const u32 attributeSize = attrib.size * vertexCount;
+			glVertexAttribPointer(i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride, reinterpret_cast<GLvoid*>(vertexBufferRes.buffer_offset + attrib.offset));
 			// TODO: Disable unused attributes as well
 			hwShaderVAO.enableAttribute(i);
 			const u32 attributeSize = attrib.size * vertexCount;
 			std::memcpy(vertexData, attrib.data, attributeSize);
 			vertexData += attributeSize;
 		}
 	}
--- a/third_party/duckstation/gl/stream_buffer.cpp
+++ b/third_party/duckstation/gl/stream_buffer.cpp
@ -132,7 +132,7 @@ namespace {
 			const u32 end = GetSyncIndexForOffset(offset);
 			for (; m_used_block_index < end; m_used_block_index++) {
 				if (m_sync_objects[m_used_block_index]) {
-					Helpers::panic("GL stream buffer: Fence slot we're trying to insert is already in use");
+					Helpers::warn("GL stream buffer: Fence slot we're trying to insert is already in use");
 				}
 				m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
@ -149,7 +149,7 @@ namespace {
 			const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);
 			for (; m_available_block_index < end; m_available_block_index++) {
 				if (!m_sync_objects[m_used_block_index]) [[unlikely]] {
-					Helpers::panic("GL stream buffer: Fence slot we're trying to wait on in not in use");
+					Helpers::warn("GL stream buffer: Fence slot we're trying to wait on in not in use");
 				}
 				WaitForSync(m_sync_objects[m_available_block_index]);