From 5432a5a0d87ed17a81b7ac865f8b06413b893821 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 25 Aug 2024 17:14:19 +0300 Subject: [PATCH] Get first renders working with accelerated draws --- include/PICA/draw_acceleration.hpp | 1 + include/renderer_gl/renderer_gl.hpp | 1 + src/core/PICA/draw_acceleration.cpp | 1 + src/core/PICA/gpu.cpp | 96 +++++++------------- src/core/renderer_gl/renderer_gl.cpp | 38 +++++--- third_party/duckstation/gl/stream_buffer.cpp | 4 +- 6 files changed, 63 insertions(+), 78 deletions(-) diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp index 2ec3f318..1671825e 100644 --- a/include/PICA/draw_acceleration.hpp +++ b/include/PICA/draw_acceleration.hpp @@ -12,6 +12,7 @@ namespace PICA { u8* data; u32 offset; u32 size; + u32 stride; u8 type; u8 componentCount; diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index 16286484..b643534a 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -62,6 +62,7 @@ class RendererGL final : public Renderer { bool oldDepthmapEnable = false; // Set by prepareDraw, tells us whether the current draw is using hw-accelerated shader bool usingAcceleratedShader = false; + bool performIndexedRender = false; // Cached pointer to the current vertex shader when using HW accelerated shaders OpenGL::Shader* generatedVertexShader = nullptr; diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp index 22b1f041..7646577f 100644 --- a/src/core/PICA/draw_acceleration.cpp +++ b/src/core/PICA/draw_acceleration.cpp @@ -98,6 +98,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { attr.componentCount = size; attr.offset = attributeOffset; attr.size = size * sizePerComponent[attribType]; + attr.stride = attrData.size; attr.type = attribType; attr.isPadding = false; attributeOffset += attr.size; diff 
--git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index 64dc5beb..dad24a22 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -120,6 +120,8 @@ void GPU::reset() { renderer->reset(); } +static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices; + // Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter) // And whether we are going to use the shader JIT (second template parameter) void GPU::drawArrays(bool indexed) { @@ -134,11 +136,13 @@ void GPU::drawArrays(bool indexed) { const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel); if (hwShaders) { - if (indexed) { - drawArrays<true, ShaderExecMode::Hardware>(); - } else { - drawArrays<false, ShaderExecMode::Hardware>(); - } + // Hardware shaders have their own accelerated code path for draws, so they skip everything here + const PICA::PrimType primType = static_cast<PICA::PrimType>(Helpers::getBits<8, 2>(regs[PICA::InternalRegs::PrimitiveConfig])); + // Total # of vertices to render + const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg]; + + // Note: In the hardware shader path the vertices span shouldn't actually be used as the rasterizer will perform its own attribute fetching + renderer->drawVertices(primType, std::span(vertices).first(vertexCount)); } else { const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled; @@ -158,33 +162,17 @@ } } -// We need a union here, because unfortunately in CPU shaders we only need to store the vertex shader outputs in the vertex buffer, -// which consist of 8 vec4 attributes, while with GPU shaders we need to pass all the vertex shader inputs to the GPU, which consist -// of 16 vec4 attributes -union PICAVertexBuffer { - // Used with CPU shaders - std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices; - // Used with GPU shaders.
We can have up to 16 attributes per vertex, each attribute with 4 floats - std::array<float, Renderer::vertexBufferSize * 16 * 4> vsInputs; - - PICAVertexBuffer() {} -}; - -static PICAVertexBuffer vertexBuffer; - template <bool indexed, ShaderExecMode mode> void GPU::drawArrays() { if constexpr (mode == ShaderExecMode::JIT) { shaderJIT.prepare(shaderUnit.vs); + } else if constexpr (mode == ShaderExecMode::Hardware) { + // Hardware shaders have their own accelerated code path for draws, so they're not meant to take this path + Helpers::panic("GPU::DrawArrays: Hardware shaders shouldn't take this path!"); } // We can have up to 16 attributes, each one consisting of 4 floats constexpr u32 maxAttrSizeInFloats = 16 * 4; - auto& vertices = vertexBuffer.vertices; - - if constexpr (mode != ShaderExecMode::Hardware) { - setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]); - } // Base address for vertex attributes // The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible @@ -257,15 +245,7 @@ size_t tag = vertexIndex % vertexCacheSize; // Cache hit if (cache.validBits[tag] && cache.ids[tag] == vertexIndex) { - if constexpr (mode != ShaderExecMode::Hardware) { - vertices[i] = vertices[cache.bufferPositions[tag]]; - } else { - const u32 cachedBufferPosition = cache.bufferPositions[tag] * maxAttrSizeInFloats; - std::memcpy( - &vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cachedBufferPosition], - sizeof(float) * maxAttrSizeInFloats - ); - } + vertices[i] = vertices[cache.bufferPositions[tag]]; continue; } @@ -370,39 +350,29 @@ void GPU::drawArrays() { } } - // Running shader on the CPU instead of the GPU - if constexpr (mode == ShaderExecMode::Interpreter || mode == ShaderExecMode::JIT) { - // Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers - // Based on the SH_ATTRIBUTES_PERMUTATION registers.
- // Ie it might map attribute #0 to v2, #1 to v7, etc - for (int j = 0; j < totalAttribCount; j++) { - const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; - std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f)); - } + // Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers + // Based on the SH_ATTRIBUTES_PERMUTATION registers. + // Ie it might map attribute #0 to v2, #1 to v7, etc + for (int j = 0; j < totalAttribCount; j++) { + const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; + std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f)); + } - if constexpr (mode == ShaderExecMode::JIT) { - shaderJIT.run(shaderUnit.vs); - } else { - shaderUnit.vs.run(); - } + if constexpr (mode == ShaderExecMode::JIT) { + shaderJIT.run(shaderUnit.vs); + } else { + shaderUnit.vs.run(); + } - PICA::Vertex& out = vertices[i]; - // Map shader outputs to fixed function properties - const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7; - for (int i = 0; i < totalShaderOutputs; i++) { - const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i]; + PICA::Vertex& out = vertices[i]; + // Map shader outputs to fixed function properties + const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7; + for (int i = 0; i < totalShaderOutputs; i++) { + const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i]; - for (int j = 0; j < 4; j++) { // pls unroll - const u32 mapping = (config >> (j * 8)) & 0x1F; - out.raw[mapping] = vsOutputRegisters[i][j]; - } - } - } else { // Using hw shaders and running the shader on the CPU, just write the inputs to the attribute buffer directly - float* out = &vertexBuffer.vsInputs[i * maxAttrSizeInFloats]; - for (int j = 0; j < totalAttribCount; j++) { - const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; - // Multiply mapping * 4 as mapping refers to a vec4 whereas out is an array of floats -
std::memcpy(&out[mapping * 4], &currentAttributes[j], sizeof(vec4f)); + for (int j = 0; j < 4; j++) { // pls unroll + const u32 mapping = (config >> (j * 8)) & 0x1F; + out.raw[mapping] = vsOutputRegisters[i][j]; } } } diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index fc6e2ce6..82248d53 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -129,11 +129,6 @@ void RendererGL::initGraphicsContextInternal() { // Initialize the VAO used for hw shaders hwShaderVAO.create(); - gl.bindVAO(hwShaderVAO); - for (int attr = 0; attr < 16; attr++) { - hwShaderVAO.setAttributeFloat(attr, 4, sizeof(Vertex) * 2, attr * sizeof(float) * 4); - hwShaderVAO.enableAttribute(attr); - } dummyVBO.create(); dummyVAO.create(); @@ -439,8 +434,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v const auto primitiveTopology = primTypes[static_cast<usize>(primType)]; gl.disableScissor(); - vbo.bind(); - gl.bindVAO(usingAcceleratedShader ? hwShaderVAO : defaultVAO); + + if (usingAcceleratedShader) { + hwVertexBuffer->Bind(); + gl.bindVAO(hwShaderVAO); + } else { + vbo.bind(); + gl.bindVAO(defaultVAO); + } gl.enableClipPlane(0); // Clipping plane 0 is always enabled if (regs[PICA::InternalRegs::ClipEnable] & 1) { @@ -503,15 +504,19 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v setupStencilTest(stencilEnable); - // If we're using hardware shaders, the vertex array works completely different - // And instead of 8 vec4 attributes, each vertex is 16 vec4 attributes. We use a union + aliasing which is not ideal for readability.
if (!usingAcceleratedShader) { vbo.bufferVertsSub(vertices); + OpenGL::draw(primitiveTopology, GLsizei(vertices.size())); } else { - glBufferSubData(GL_ARRAY_BUFFER, 0, vertices.size_bytes() * 2, vertices.data()); + if (performIndexedRender) { + // When doing indexed rendering, bind the IBO and use glDrawRangeElementsBaseVertex to issue the indexed draw + hwIndexBuffer->Bind(); + //glDrawRangeElementsBaseVertex(); + } else { + // When doing non-indexed rendering, just use glDrawArrays + OpenGL::draw(primitiveTopology, GLsizei(vertices.size())); + } } - - OpenGL::draw(primitiveTopology, GLsizei(vertices.size())); } void RendererGL::display() { @@ -1003,6 +1008,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* // Upload vertex data and index buffer data to our GPU accelerateVertexUpload(shaderUnit, accel); + performIndexedRender = accel->indexed; } } @@ -1149,7 +1155,9 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele } auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize); + u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer); + gl.bindVAO(hwShaderVAO); for (int i = 0; i < totalAttribCount; i++) { const auto& attrib = accel->attributeInfo[i]; @@ -1161,9 +1169,13 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele continue; } - const u32 attributeSize = attrib.size * vertexCount; + glVertexAttribPointer(i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride, reinterpret_cast<void*>(vertexBufferRes.buffer_offset + attrib.offset)); + // TODO: Disable unused attributes as well + hwShaderVAO.enableAttribute(i); + const u32 attributeSize = attrib.size * vertexCount; std::memcpy(vertexData, attrib.data, attributeSize); + vertexData += attributeSize; } } diff --git a/third_party/duckstation/gl/stream_buffer.cpp b/third_party/duckstation/gl/stream_buffer.cpp index f4f8b54c..ff6c79f9 100644 --- a/third_party/duckstation/gl/stream_buffer.cpp
+++ b/third_party/duckstation/gl/stream_buffer.cpp @@ -132,7 +132,7 @@ namespace { const u32 end = GetSyncIndexForOffset(offset); for (; m_used_block_index < end; m_used_block_index++) { if (m_sync_objects[m_used_block_index]) { - Helpers::panic("GL stream buffer: Fence slot we're trying to insert is already in use"); + Helpers::warn("GL stream buffer: Fence slot we're trying to insert is already in use"); } m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); @@ -149,7 +149,7 @@ namespace { const u32 end = std::min(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS); for (; m_available_block_index < end; m_available_block_index++) { if (!m_sync_objects[m_used_block_index]) [[unlikely]] { - Helpers::panic("GL stream buffer: Fence slot we're trying to wait on in not in use"); + Helpers::warn("GL stream buffer: Fence slot we're trying to wait on is not in use"); } WaitForSync(m_sync_objects[m_available_block_index]);