diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp index 72eb8944..6a66cdc1 100644 --- a/include/PICA/draw_acceleration.hpp +++ b/include/PICA/draw_acceleration.hpp @@ -6,32 +6,37 @@ namespace PICA { struct DrawAcceleration { - static constexpr u32 maxAttribCount = 12; + static constexpr u32 maxAttribCount = 16; + static constexpr u32 maxLoaderCount = 12; struct AttributeInfo { - u8* data; u32 offset; - u32 size; u32 stride; - u8 inputReg; // Which input reg should this attribute go to in the vertex shader? u8 type; u8 componentCount; - bool fixed; - bool isPadding; std::array fixedValue; // For fixed attributes }; + struct Loader { + // Data to upload for this loader + u8* data; + usize size; + }; + u8* indexBuffer; // Minimum and maximum index in the index buffer for a draw call u16 minimumIndex, maximumIndex; u32 totalAttribCount; + u32 totalLoaderCount; u32 enabledAttributeMask; + u32 fixedAttributes; u32 vertexDataSize; std::array attributeInfo; + std::array loaders; bool canBeAccelerated; bool indexed; diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp index 84096fb7..a65fd1b5 100644 --- a/src/core/PICA/draw_acceleration.cpp +++ b/src/core/PICA/draw_acceleration.cpp @@ -1,5 +1,6 @@ #include "PICA/draw_acceleration.hpp" +#include #include #include "PICA/gpu.hpp" @@ -53,88 +54,94 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32); const u64 inputAttrCfg = getVertexShaderInputConfig(); - u32 buffer = 0; u32 attrCount = 0; + u32 loaderOffset = 0; accel.vertexDataSize = 0; + accel.totalLoaderCount = 0; - while (attrCount < totalAttribCount) { - bool fixedAttrib = (fixedAttribMask & (1 << attrCount)) != 0; + for (int i = 0; i < PICA::DrawAcceleration::maxLoaderCount; i++) { + auto& loaderData = attributeInfo[i]; // Get information for this 
attribute loader - // Variable attribute attribute - if (!fixedAttrib) { - auto& attrData = attributeInfo[buffer]; // Get information for this attribute - u64 attrCfg = attrData.getConfigFull(); // Get config1 | (config2 << 32) + // This loader is empty, skip it + if (loaderData.componentCount == 0 || loaderData.size == 0) { + continue; + } - if (attrData.componentCount != 0) { - // Size of the attribute in bytes multiplied by the total number of vertices - const u32 bytes = attrData.size * vertexCount; - // Add it to the total vertex data size, aligned to 4 bytes. - accel.vertexDataSize += (bytes + 3) & ~3; + auto& loader = accel.loaders[accel.totalLoaderCount++]; + + // The size of the loader in bytes is equal to the bytes supplied for 1 vertex, multiplied by the number of vertices we'll be uploading + // Which is equal to maximumIndex - minimumIndex + 1 + const u32 bytes = loaderData.size * (accel.maximumIndex - accel.minimumIndex + 1); + loader.size = bytes; + + // Add it to the total vertex data size, aligned to 4 bytes. 
+ accel.vertexDataSize += (bytes + 3) & ~3; + + // Get a pointer to the data where this loader's data is stored + const u32 loaderAddress = vertexBase + loaderData.offset + (accel.minimumIndex * loaderData.size); + loader.data = getPointerPhys(loaderAddress); + + u64 attrCfg = loaderData.getConfigFull(); // Get config1 | (config2 << 32) + u32 attributeOffset = 0; + + for (int component = 0; component < loaderData.componentCount; component++) { + uint attributeIndex = (attrCfg >> (component * 4)) & 0xf; // Get index of attribute in vertexCfg + + // Vertex attributes used as padding + // 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively + if (attributeIndex >= 12) [[unlikely]] { + Helpers::panic("Padding attribute"); + // Align attribute address up to a 4 byte boundary + attributeOffset = (attributeOffset + 3) & -4; + attributeOffset += (attributeIndex - 11) << 2; + continue; } - u32 attributeOffset = 0; - for (int i = 0; i < attrData.componentCount; i++) { - uint index = (attrCfg >> (i * 4)) & 0xf; // Get index of attribute in vertexCfg - auto& attr = accel.attributeInfo[attrCount]; - attr.fixed = false; + const u32 attribInfo = (vertexCfg >> (attributeIndex * 4)) & 0xf; + const u32 attribType = attribInfo & 0x3; // Type of attribute (sbyte/ubyte/short/float) + const u32 size = (attribInfo >> 2) + 1; // Total number of components - // Vertex attributes used as padding - // 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively - if (index >= 12) [[unlikely]] { - Helpers::panic("Padding attribute"); - // Align attribute address up to a 4 byte boundary - attributeOffset = (attributeOffset + 3) & -4; - attributeOffset += (index - 11) << 2; + // Size of each component based on the attribute type + static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4}; + const u32 inputReg = (inputAttrCfg >> (attributeIndex * 4)) & 0xf; + // Mark the attribute as enabled + accel.enabledAttributeMask |= 1 << inputReg; - 
attr.data = nullptr; - attr.isPadding = true; - continue; - } + auto& attr = accel.attributeInfo[inputReg]; + attr.componentCount = size; + attr.offset = attributeOffset + loaderOffset; + attr.stride = loaderData.size; + attr.type = attribType; + attributeOffset += size * sizePerComponent[attribType]; + } - const u32 attribInfo = (vertexCfg >> (index * 4)) & 0xf; - const u32 attribType = attribInfo & 0x3; // Type of attribute (sbyte/ubyte/short/float) - const u32 size = (attribInfo >> 2) + 1; // Total number of components - - // Size of each component based on the attribute type - static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4}; - const u32 inputReg = (inputAttrCfg >> (attrCount * 4)) & 0xf; - // Mark the attribute as enabled - accel.enabledAttributeMask |= 1 << inputReg; + loaderOffset += loader.size; + } - // Get a pointer to the data where this attribute is stored - const u32 attrAddress = vertexBase + attributeOffset + attrData.offset + (accel.minimumIndex * attrData.size); + u32 fixedAttributes = fixedAttribMask; + accel.fixedAttributes = 0; - attr.data = getPointerPhys(attrAddress); - attr.inputReg = inputReg; - attr.componentCount = size; - attr.offset = attributeOffset; - attr.size = size * sizePerComponent[attribType]; - attr.stride = attrData.size; - attr.type = attribType; - attr.isPadding = false; - attributeOffset += attr.size; + // Fetch values for all fixed attributes using CTZ (count trailing zeros) on the fixed attribute mask to find the attributes that are actually fixed + while (fixedAttributes != 0) { + // Get index of next fixed attribute and turn it off + const u32 index = std::countr_zero(fixedAttributes); + const u32 mask = 1u << index; + fixedAttributes ^= mask; - attrCount += 1; - } + // PICA register this fixed attribute is meant to go to + const u32 inputReg = (inputAttrCfg >> (index * 4)) & 0xf; + const u32 inputRegMask = 1u << inputReg; - buffer += 1; - } else { - vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount]; - auto& attr = 
accel.attributeInfo[attrCount]; + // If this input reg is already used for a non-fixed attribute then it will not be replaced by a fixed attribute + if ((accel.enabledAttributeMask & inputRegMask) == 0) { + vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[index]; + auto& attr = accel.attributeInfo[inputReg]; - attr.fixed = true; - // Set the data pointer to nullptr in order to catch any potential bugs - attr.data = nullptr; - attr.isPadding = false; + accel.fixedAttributes |= inputRegMask; for (int i = 0; i < 4; i++) { attr.fixedValue[i] = fixedAttr[i].toFloat32(); } - - const u32 inputReg = (inputAttrCfg >> (attrCount * 4)) & 0xf; - - attr.inputReg = inputReg; - attrCount += 1; } } diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index 2797e09f..2624903f 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -337,8 +337,6 @@ void GPU::drawArrays() { } // Fill the remaining attribute lanes with default parameters (1.0 for alpha/w, 0.0) for everything else - // Corgi does this although I'm not sure if it's actually needed for anything. - // TODO: Find out while (component < 4) { attribute[component] = (component == 3) ? f24::fromFloat32(1.0) : f24::fromFloat32(0.0); component++; diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 6447f763..954c30bc 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -508,7 +508,7 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v OpenGL::draw(primitiveTopology, GLsizei(vertices.size())); } else { if (performIndexedRender) { - // When doing indexed rendering, bind the EBO and use glDrawRangeElementsBaseVertex to issue the indexed draw + // When doing indexed rendering, use glDrawRangeElementsBaseVertex to issue the indexed draw hwIndexBuffer->Bind(); glDrawRangeElementsBaseVertex( primitiveTopology, minimumIndex, maximumIndex, GLsizei(vertices.size()), usingShortIndices ? 
GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE, @@ -1165,12 +1165,13 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele hwVertexBuffer->Bind(); auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize); u8* vertexData = static_cast(vertexBufferRes.pointer); + const u32 vertexBufferOffset = vertexBufferRes.buffer_offset; gl.bindVAO(hwShaderVAO); // Enable or disable vertex attributes as needed const u32 currentAttributeMask = accel->enabledAttributeMask; - // Use bitwise xor to calculate which attributes chanced + // Use bitwise xor to calculate which attributes changed u32 attributeMaskDiff = currentAttributeMask ^ previousAttributeMask; while (attributeMaskDiff != 0) { @@ -1190,29 +1191,30 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele previousAttributeMask = currentAttributeMask; - for (int i = 0; i < totalAttribCount; i++) { - const auto& attrib = accel->attributeInfo[i]; + // Upload the data for each (enabled) attribute loader into our vertex buffer + for (int i = 0; i < accel->totalLoaderCount; i++) { + auto& loader = accel->loaders[i]; - if (attrib.fixed) { - if ((currentAttributeMask & (1u << i)) == 0) { - glVertexAttrib4f(attrib.inputReg, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]); - } - } else { - if (attrib.isPadding) [[unlikely]] { - continue; - } - - const u32 attributeSize = attrib.size * vertexCount; - std::memcpy(vertexData, attrib.data, attributeSize); - - vertexData += attributeSize; - - glVertexAttribPointer( - attrib.inputReg, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride, - reinterpret_cast(vertexBufferRes.buffer_offset + attrib.offset) - ); - } + std::memcpy(vertexData, loader.data, loader.size); + vertexData += loader.size; } hwVertexBuffer->Unmap(accel->vertexDataSize); + + // Iterate over the 16 PICA input registers and configure how they should be fetched. 
+ for (int i = 0; i < 16; i++) { + const auto& attrib = accel->attributeInfo[i]; + const u32 attributeMask = 1u << i; + + if (accel->fixedAttributes & attributeMask) { + // This is a fixed attribute, so set its fixed value + // TODO: Don't update these if the value does not change, it generates way too many calls + glVertexAttrib4f(i, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]); + } else if (accel->enabledAttributeMask & attributeMask) { + glVertexAttribPointer( + i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride, + reinterpret_cast(vertexBufferOffset + attrib.offset) + ); + } + } } \ No newline at end of file diff --git a/third_party/duckstation/gl/stream_buffer.cpp b/third_party/duckstation/gl/stream_buffer.cpp index ff6c79f9..6fff8b95 100644 --- a/third_party/duckstation/gl/stream_buffer.cpp +++ b/third_party/duckstation/gl/stream_buffer.cpp @@ -149,7 +149,7 @@ namespace { const u32 end = std::min(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS); for (; m_available_block_index < end; m_available_block_index++) { if (!m_sync_objects[m_used_block_index]) [[unlikely]] { - Helpers::warn("GL stream buffer: Fence slot we're trying to wait on in not in use"); + Helpers::warn("GL stream buffer: Fence slot we're trying to wait on is not in use"); } WaitForSync(m_sync_objects[m_available_block_index]);