Rewriting hw vertex fetch

This commit is contained in:
wheremyfoodat 2024-09-04 03:18:39 +03:00
parent 15b6a9e2d9
commit 4a39b06262
5 changed files with 107 additions and 95 deletions

View file

@ -6,32 +6,37 @@
namespace PICA { namespace PICA {
struct DrawAcceleration { struct DrawAcceleration {
static constexpr u32 maxAttribCount = 12; static constexpr u32 maxAttribCount = 16;
static constexpr u32 maxLoaderCount = 12;
struct AttributeInfo { struct AttributeInfo {
u8* data;
u32 offset; u32 offset;
u32 size;
u32 stride; u32 stride;
u8 inputReg; // Which input reg should this attribute go to in the vertex shader?
u8 type; u8 type;
u8 componentCount; u8 componentCount;
bool fixed;
bool isPadding;
std::array<float, 4> fixedValue; // For fixed attributes std::array<float, 4> fixedValue; // For fixed attributes
}; };
struct Loader {
// Data to upload for this loader
u8* data;
usize size;
};
u8* indexBuffer; u8* indexBuffer;
// Minimum and maximum index in the index buffer for a draw call // Minimum and maximum index in the index buffer for a draw call
u16 minimumIndex, maximumIndex; u16 minimumIndex, maximumIndex;
u32 totalAttribCount; u32 totalAttribCount;
u32 totalLoaderCount;
u32 enabledAttributeMask; u32 enabledAttributeMask;
u32 fixedAttributes;
u32 vertexDataSize; u32 vertexDataSize;
std::array<AttributeInfo, maxAttribCount> attributeInfo; std::array<AttributeInfo, maxAttribCount> attributeInfo;
std::array<Loader, maxLoaderCount> loaders;
bool canBeAccelerated; bool canBeAccelerated;
bool indexed; bool indexed;

View file

@ -1,5 +1,6 @@
#include "PICA/draw_acceleration.hpp" #include "PICA/draw_acceleration.hpp"
#include <bit>
#include <limits> #include <limits>
#include "PICA/gpu.hpp" #include "PICA/gpu.hpp"
@ -53,88 +54,94 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32); const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32);
const u64 inputAttrCfg = getVertexShaderInputConfig(); const u64 inputAttrCfg = getVertexShaderInputConfig();
u32 buffer = 0;
u32 attrCount = 0; u32 attrCount = 0;
u32 loaderOffset = 0;
accel.vertexDataSize = 0; accel.vertexDataSize = 0;
accel.totalLoaderCount = 0;
while (attrCount < totalAttribCount) { for (int i = 0; i < PICA::DrawAcceleration::maxLoaderCount; i++) {
bool fixedAttrib = (fixedAttribMask & (1 << attrCount)) != 0; auto& loaderData = attributeInfo[i]; // Get information for this attribute loader
// Variable attribute // This loader is empty, skip it
if (!fixedAttrib) { if (loaderData.componentCount == 0 || loaderData.size == 0) {
auto& attrData = attributeInfo[buffer]; // Get information for this attribute continue;
u64 attrCfg = attrData.getConfigFull(); // Get config1 | (config2 << 32) }
if (attrData.componentCount != 0) { auto& loader = accel.loaders[accel.totalLoaderCount++];
// Size of the attribute in bytes multiplied by the total number of vertices
const u32 bytes = attrData.size * vertexCount; // The size of the loader in bytes is equal to the bytes supplied for 1 vertex, multiplied by the number of vertices we'll be uploading
// Add it to the total vertex data size, aligned to 4 bytes. // Which is equal to maximumIndex - minimumIndex + 1
accel.vertexDataSize += (bytes + 3) & ~3; const u32 bytes = loaderData.size * (accel.maximumIndex - accel.minimumIndex + 1);
loader.size = bytes;
// Add it to the total vertex data size, aligned to 4 bytes.
accel.vertexDataSize += (bytes + 3) & ~3;
// Get a pointer to the data where this loader's data is stored
const u32 loaderAddress = vertexBase + loaderData.offset + (accel.minimumIndex * loaderData.size);
loader.data = getPointerPhys<u8>(loaderAddress);
u64 attrCfg = loaderData.getConfigFull(); // Get config1 | (config2 << 32)
u32 attributeOffset = 0;
for (int component = 0; component < loaderData.componentCount; component++) {
uint attributeIndex = (attrCfg >> (component * 4)) & 0xf; // Get index of attribute in vertexCfg
// Vertex attributes used as padding
// 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively
if (attributeIndex >= 12) [[unlikely]] {
Helpers::panic("Padding attribute");
// Align attribute address up to a 4 byte boundary
attributeOffset = (attributeOffset + 3) & -4;
attributeOffset += (attributeIndex - 11) << 2;
continue;
} }
u32 attributeOffset = 0; const u32 attribInfo = (vertexCfg >> (attributeIndex * 4)) & 0xf;
for (int i = 0; i < attrData.componentCount; i++) { const u32 attribType = attribInfo & 0x3; // Type of attribute (sbyte/ubyte/short/float)
uint index = (attrCfg >> (i * 4)) & 0xf; // Get index of attribute in vertexCfg const u32 size = (attribInfo >> 2) + 1; // Total number of components
auto& attr = accel.attributeInfo[attrCount];
attr.fixed = false;
// Vertex attributes used as padding // Size of each component based on the attribute type
// 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
if (index >= 12) [[unlikely]] { const u32 inputReg = (inputAttrCfg >> (attributeIndex * 4)) & 0xf;
Helpers::panic("Padding attribute"); // Mark the attribute as enabled
// Align attribute address up to a 4 byte boundary accel.enabledAttributeMask |= 1 << inputReg;
attributeOffset = (attributeOffset + 3) & -4;
attributeOffset += (index - 11) << 2;
attr.data = nullptr; auto& attr = accel.attributeInfo[inputReg];
attr.isPadding = true; attr.componentCount = size;
continue; attr.offset = attributeOffset + loaderOffset;
} attr.stride = loaderData.size;
attr.type = attribType;
attributeOffset += size * sizePerComponent[attribType];
}
const u32 attribInfo = (vertexCfg >> (index * 4)) & 0xf; loaderOffset += loader.size;
const u32 attribType = attribInfo & 0x3; // Type of attribute (sbyte/ubyte/short/float) }
const u32 size = (attribInfo >> 2) + 1; // Total number of components
// Size of each component based on the attribute type
static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
const u32 inputReg = (inputAttrCfg >> (attrCount * 4)) & 0xf;
// Mark the attribute as enabled
accel.enabledAttributeMask |= 1 << inputReg;
// Get a pointer to the data where this attribute is stored u32 fixedAttributes = fixedAttribMask;
const u32 attrAddress = vertexBase + attributeOffset + attrData.offset + (accel.minimumIndex * attrData.size); accel.fixedAttributes = 0;
attr.data = getPointerPhys<u8>(attrAddress); // Fetch values for all fixed attributes using CTZ (count trailing zeros) on the fixed attribute mask to find the attributes that are actually fixed
attr.inputReg = inputReg; while (fixedAttributes != 0) {
attr.componentCount = size; // Get index of next fixed attribute and turn it off
attr.offset = attributeOffset; const u32 index = std::countr_zero<u32>(fixedAttributes);
attr.size = size * sizePerComponent[attribType]; const u32 mask = 1u << index;
attr.stride = attrData.size; fixedAttributes ^= mask;
attr.type = attribType;
attr.isPadding = false;
attributeOffset += attr.size;
attrCount += 1; // PICA register this fixed attribute is meant to go to
} const u32 inputReg = (inputAttrCfg >> (index * 4)) & 0xf;
const u32 inputRegMask = 1u << inputReg;
buffer += 1; // If this input reg is already used for a non-fixed attribute then it will not be replaced by a fixed attribute
} else { if ((accel.enabledAttributeMask & inputRegMask) == 0) {
vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount]; vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[index];
auto& attr = accel.attributeInfo[attrCount]; auto& attr = accel.attributeInfo[inputReg];
attr.fixed = true; accel.fixedAttributes |= inputRegMask;
// Set the data pointer to nullptr in order to catch any potential bugs
attr.data = nullptr;
attr.isPadding = false;
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
attr.fixedValue[i] = fixedAttr[i].toFloat32(); attr.fixedValue[i] = fixedAttr[i].toFloat32();
} }
const u32 inputReg = (inputAttrCfg >> (attrCount * 4)) & 0xf;
attr.inputReg = inputReg;
attrCount += 1;
} }
} }

View file

@ -337,8 +337,6 @@ void GPU::drawArrays() {
} }
// Fill the remaining attribute lanes with default parameters (1.0 for alpha/w, 0.0) for everything else // Fill the remaining attribute lanes with default parameters (1.0 for alpha/w, 0.0) for everything else
// Corgi does this although I'm not sure if it's actually needed for anything.
// TODO: Find out
while (component < 4) { while (component < 4) {
attribute[component] = (component == 3) ? f24::fromFloat32(1.0) : f24::fromFloat32(0.0); attribute[component] = (component == 3) ? f24::fromFloat32(1.0) : f24::fromFloat32(0.0);
component++; component++;

View file

@ -508,7 +508,7 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
OpenGL::draw(primitiveTopology, GLsizei(vertices.size())); OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
} else { } else {
if (performIndexedRender) { if (performIndexedRender) {
// When doing indexed rendering, bind the EBO and use glDrawRangeElementsBaseVertex to issue the indexed draw // When doing indexed rendering, use glDrawRangeElementsBaseVertex to issue the indexed draw
hwIndexBuffer->Bind(); hwIndexBuffer->Bind();
glDrawRangeElementsBaseVertex( glDrawRangeElementsBaseVertex(
primitiveTopology, minimumIndex, maximumIndex, GLsizei(vertices.size()), usingShortIndices ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE, primitiveTopology, minimumIndex, maximumIndex, GLsizei(vertices.size()), usingShortIndices ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE,
@ -1165,12 +1165,13 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
hwVertexBuffer->Bind(); hwVertexBuffer->Bind();
auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize); auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize);
u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer); u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer);
const u32 vertexBufferOffset = vertexBufferRes.buffer_offset;
gl.bindVAO(hwShaderVAO); gl.bindVAO(hwShaderVAO);
// Enable or disable vertex attributes as needed // Enable or disable vertex attributes as needed
const u32 currentAttributeMask = accel->enabledAttributeMask; const u32 currentAttributeMask = accel->enabledAttributeMask;
// Use bitwise xor to calculate which attributes chanced // Use bitwise xor to calculate which attributes changed
u32 attributeMaskDiff = currentAttributeMask ^ previousAttributeMask; u32 attributeMaskDiff = currentAttributeMask ^ previousAttributeMask;
while (attributeMaskDiff != 0) { while (attributeMaskDiff != 0) {
@ -1190,29 +1191,30 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
previousAttributeMask = currentAttributeMask; previousAttributeMask = currentAttributeMask;
for (int i = 0; i < totalAttribCount; i++) { // Upload the data for each (enabled) attribute loader into our vertex buffer
const auto& attrib = accel->attributeInfo[i]; for (int i = 0; i < accel->totalLoaderCount; i++) {
auto& loader = accel->loaders[i];
if (attrib.fixed) { std::memcpy(vertexData, loader.data, loader.size);
if ((currentAttributeMask & (1u << i)) == 0) { vertexData += loader.size;
glVertexAttrib4f(attrib.inputReg, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]);
}
} else {
if (attrib.isPadding) [[unlikely]] {
continue;
}
const u32 attributeSize = attrib.size * vertexCount;
std::memcpy(vertexData, attrib.data, attributeSize);
vertexData += attributeSize;
glVertexAttribPointer(
attrib.inputReg, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride,
reinterpret_cast<GLvoid*>(vertexBufferRes.buffer_offset + attrib.offset)
);
}
} }
hwVertexBuffer->Unmap(accel->vertexDataSize); hwVertexBuffer->Unmap(accel->vertexDataSize);
// Iterate over the 16 PICA input registers and configure how they should be fetched.
for (int i = 0; i < 16; i++) {
const auto& attrib = accel->attributeInfo[i];
const u32 attributeMask = 1u << i;
if (accel->fixedAttributes & attributeMask) {
// This is a fixed attribute, so set its fixed value
// TODO: Don't update these if the value does not change, it generates way too many calls
glVertexAttrib4f(i, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]);
} else if (accel->enabledAttributeMask & attributeMask) {
glVertexAttribPointer(
i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride,
reinterpret_cast<GLvoid*>(vertexBufferOffset + attrib.offset)
);
}
}
} }

View file

@ -149,7 +149,7 @@ namespace {
const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS); const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);
for (; m_available_block_index < end; m_available_block_index++) { for (; m_available_block_index < end; m_available_block_index++) {
if (!m_sync_objects[m_used_block_index]) [[unlikely]] { if (!m_sync_objects[m_used_block_index]) [[unlikely]] {
Helpers::warn("GL stream buffer: Fence slot we're trying to wait on in not in use"); Helpers::warn("GL stream buffer: Fence slot we're trying to wait on is not in use");
} }
WaitForSync(m_sync_objects[m_available_block_index]); WaitForSync(m_sync_objects[m_available_block_index]);