From 5432a5a0d87ed17a81b7ac865f8b06413b893821 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 25 Aug 2024 17:14:19 +0300 Subject: [PATCH] Get first renders working with accelerated draws --- include/PICA/draw_acceleration.hpp | 1 + include/renderer_gl/renderer_gl.hpp | 1 + src/core/PICA/draw_acceleration.cpp | 1 + src/core/PICA/gpu.cpp | 96 +++++++------------- src/core/renderer_gl/renderer_gl.cpp | 38 +++++--- third_party/duckstation/gl/stream_buffer.cpp | 4 +- 6 files changed, 63 insertions(+), 78 deletions(-) diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp index 2ec3f318..1671825e 100644 --- a/include/PICA/draw_acceleration.hpp +++ b/include/PICA/draw_acceleration.hpp @@ -12,6 +12,7 @@ namespace PICA { u8* data; u32 offset; u32 size; + u32 stride; u8 type; u8 componentCount; diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index 16286484..b643534a 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -62,6 +62,7 @@ class RendererGL final : public Renderer { bool oldDepthmapEnable = false; // Set by prepareDraw, tells us whether the current draw is using hw-accelerated shader bool usingAcceleratedShader = false; + bool performIndexedRender = false; // Cached pointer to the current vertex shader when using HW accelerated shaders OpenGL::Shader* generatedVertexShader = nullptr; diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp index 22b1f041..7646577f 100644 --- a/src/core/PICA/draw_acceleration.cpp +++ b/src/core/PICA/draw_acceleration.cpp @@ -98,6 +98,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { attr.componentCount = size; attr.offset = attributeOffset; attr.size = size * sizePerComponent[attribType]; + attr.stride = attrData.size; attr.type = attribType; attr.isPadding = false; attributeOffset += attr.size; diff 
--git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index 64dc5beb..dad24a22 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -120,6 +120,8 @@ void GPU::reset() { renderer->reset(); } +static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices; + // Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter) // And whether we are going to use the shader JIT (second template parameter) void GPU::drawArrays(bool indexed) { @@ -134,11 +136,13 @@ void GPU::drawArrays(bool indexed) { const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel); if (hwShaders) { - if (indexed) { - drawArrays<true, ShaderExecMode::Hardware>(); - } else { - drawArrays<false, ShaderExecMode::Hardware>(); - } + // Hardware shaders have their own accelerated code path for draws, so they skip everything here + const PICA::PrimType primType = static_cast<PICA::PrimType>(Helpers::getBits<8, 2>(regs[PICA::InternalRegs::PrimitiveConfig])); + // Total # of vertices to render + const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg]; + + // Note: In the hardware shader path the vertices span shouldn't actually be used as the rasterizer will perform its own attribute fetching + renderer->drawVertices(primType, std::span(vertices).first(vertexCount)); } else { const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled; @@ -158,33 +162,17 @@ } } -// We need a union here, because unfortunately in CPU shaders we only need to store the vertex shader outputs in the vertex buffer, -// which consist of 8 vec4 attributes, while with GPU shaders we need to pass all the vertex shader inputs to the GPU, which consist -// of 16 vec4 attributes -union PICAVertexBuffer { - // Used with CPU shaders - std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices; - // Used with GPU shaders.
We can have up to 16 attributes per vertex, each attribute with 4 floats - std::array<float, Renderer::vertexBufferSize * 16 * 4> vsInputs; - - PICAVertexBuffer() {} -}; - -static PICAVertexBuffer vertexBuffer; - template <bool indexed, ShaderExecMode mode> void GPU::drawArrays() { if constexpr (mode == ShaderExecMode::JIT) { shaderJIT.prepare(shaderUnit.vs); + } else if constexpr (mode == ShaderExecMode::Hardware) { + // Hardware shaders have their own accelerated code path for draws, so they're not meant to take this path + Helpers::panic("GPU::DrawArrays: Hardware shaders shouldn't take this path!"); } // We can have up to 16 attributes, each one consisting of 4 floats constexpr u32 maxAttrSizeInFloats = 16 * 4; - auto& vertices = vertexBuffer.vertices; - - if constexpr (mode != ShaderExecMode::Hardware) { - setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]); - } // Base address for vertex attributes // The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible @@ -257,15 +245,7 @@ size_t tag = vertexIndex % vertexCacheSize; // Cache hit if (cache.validBits[tag] && cache.ids[tag] == vertexIndex) { - if constexpr (mode != ShaderExecMode::Hardware) { - vertices[i] = vertices[cache.bufferPositions[tag]]; - } else { - const u32 cachedBufferPosition = cache.bufferPositions[tag] * maxAttrSizeInFloats; - std::memcpy( - &vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cachedBufferPosition], - sizeof(float) * maxAttrSizeInFloats - ); - } + vertices[i] = vertices[cache.bufferPositions[tag]]; continue; } @@ -370,39 +350,29 @@ void GPU::drawArrays() { } } - // Running shader on the CPU instead of the GPU - if constexpr (mode == ShaderExecMode::Interpreter || mode == ShaderExecMode::JIT) { - // Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers - // Based on the SH_ATTRIBUTES_PERMUTATION registers.
- // Ie it might map attribute #0 to v2, #1 to v7, etc - for (int j = 0; j < totalAttribCount; j++) { - const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; - std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f)); - } + // Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers + // Based on the SH_ATTRIBUTES_PERMUTATION registers. + // Ie it might map attribute #0 to v2, #1 to v7, etc + for (int j = 0; j < totalAttribCount; j++) { + const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; + std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f)); + } - if constexpr (mode == ShaderExecMode::JIT) { - shaderJIT.run(shaderUnit.vs); - } else { - shaderUnit.vs.run(); - } + if constexpr (mode == ShaderExecMode::JIT) { + shaderJIT.run(shaderUnit.vs); + } else { + shaderUnit.vs.run(); + } - PICA::Vertex& out = vertices[i]; - // Map shader outputs to fixed function properties - const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7; - for (int i = 0; i < totalShaderOutputs; i++) { - const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i]; + PICA::Vertex& out = vertices[i]; + // Map shader outputs to fixed function properties + const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7; + for (int i = 0; i < totalShaderOutputs; i++) { + const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i]; - for (int j = 0; j < 4; j++) { // pls unroll - const u32 mapping = (config >> (j * 8)) & 0x1F; - out.raw[mapping] = vsOutputRegisters[i][j]; - } - } - } else { // Using hw shaders and running the shader on the CPU, just write the inputs to the attribute buffer directly - float* out = &vertexBuffer.vsInputs[i * maxAttrSizeInFloats]; - for (int j = 0; j < totalAttribCount; j++) { - const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; - // Multiply mapping * 4 as mapping refers to a vec4 whereas out is an array of floats -
std::memcpy(&out[mapping * 4], &currentAttributes[j], sizeof(vec4f)); + for (int j = 0; j < 4; j++) { // pls unroll + const u32 mapping = (config >> (j * 8)) & 0x1F; + out.raw[mapping] = vsOutputRegisters[i][j]; } } } diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index fc6e2ce6..82248d53 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -129,11 +129,6 @@ void RendererGL::initGraphicsContextInternal() { // Initialize the VAO used for hw shaders hwShaderVAO.create(); - gl.bindVAO(hwShaderVAO); - for (int attr = 0; attr < 16; attr++) { - hwShaderVAO.setAttributeFloat(attr, 4, sizeof(Vertex) * 2, attr * sizeof(float) * 4); - hwShaderVAO.enableAttribute(attr); - } dummyVBO.create(); dummyVAO.create(); @@ -439,8 +434,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v const auto primitiveTopology = primTypes[static_cast<usize>(primType)]; gl.disableScissor(); - vbo.bind(); - gl.bindVAO(usingAcceleratedShader ? hwShaderVAO : defaultVAO); + + if (usingAcceleratedShader) { + hwVertexBuffer->Bind(); + gl.bindVAO(hwShaderVAO); + } else { + vbo.bind(); + gl.bindVAO(defaultVAO); + } gl.enableClipPlane(0); // Clipping plane 0 is always enabled if (regs[PICA::InternalRegs::ClipEnable] & 1) { @@ -503,15 +504,19 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v setupStencilTest(stencilEnable); - // If we're using hardware shaders, the vertex array works completely different - // And instead of 8 vec4 attributes, each vertex is 16 vec4 attributes. We use a union + aliasing which is not ideal for readability.
if (!usingAcceleratedShader) { vbo.bufferVertsSub(vertices); + OpenGL::draw(primitiveTopology, GLsizei(vertices.size())); } else { - glBufferSubData(GL_ARRAY_BUFFER, 0, vertices.size_bytes() * 2, vertices.data()); + if (performIndexedRender) { + // When doing indexed rendering, bind the IBO and use glDrawRangeElementsBaseVertex to issue the indexed draw + hwIndexBuffer->Bind(); + //glDrawRangeElementsBaseVertex(); + } else { + // When doing non-indexed rendering, just use glDrawArrays + OpenGL::draw(primitiveTopology, GLsizei(vertices.size())); + } } - - OpenGL::draw(primitiveTopology, GLsizei(vertices.size())); } void RendererGL::display() { @@ -1003,6 +1008,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* // Upload vertex data and index buffer data to our GPU accelerateVertexUpload(shaderUnit, accel); + performIndexedRender = accel->indexed; } } @@ -1149,7 +1155,9 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele } auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize); + u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer); + gl.bindVAO(hwShaderVAO); for (int i = 0; i < totalAttribCount; i++) { const auto& attrib = accel->attributeInfo[i]; @@ -1161,9 +1169,13 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele continue; } - const u32 attributeSize = attrib.size * vertexCount; + glVertexAttribPointer(i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride, reinterpret_cast<void*>(vertexBufferRes.buffer_offset + attrib.offset)); + // TODO: Disable unused attributes as well + hwShaderVAO.enableAttribute(i); + const u32 attributeSize = attrib.size * vertexCount; std::memcpy(vertexData, attrib.data, attributeSize); + vertexData += attributeSize; } } diff --git a/third_party/duckstation/gl/stream_buffer.cpp b/third_party/duckstation/gl/stream_buffer.cpp index f4f8b54c..ff6c79f9 100644 --- a/third_party/duckstation/gl/stream_buffer.cpp
+++ b/third_party/duckstation/gl/stream_buffer.cpp @@ -132,7 +132,7 @@ namespace { const u32 end = GetSyncIndexForOffset(offset); for (; m_used_block_index < end; m_used_block_index++) { if (m_sync_objects[m_used_block_index]) { - Helpers::panic("GL stream buffer: Fence slot we're trying to insert is already in use"); + Helpers::warn("GL stream buffer: Fence slot we're trying to insert is already in use"); } m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); @@ -149,7 +149,7 @@ namespace { const u32 end = std::min(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS); for (; m_available_block_index < end; m_available_block_index++) { if (!m_sync_objects[m_used_block_index]) [[unlikely]] { - Helpers::panic("GL stream buffer: Fence slot we're trying to wait on in not in use"); + Helpers::warn("GL stream buffer: Fence slot we're trying to wait on is not in use"); } WaitForSync(m_sync_objects[m_available_block_index]);