Get first renders working with accelerated draws

wheremyfoodat 2024-08-25 17:14:19 +03:00
parent 33e63f7d7a
commit 5432a5a0d8
6 changed files with 63 additions and 78 deletions

View file

@@ -12,6 +12,7 @@ namespace PICA {
 		u8* data;
 		u32 offset;
 		u32 size;
+		u32 stride;
 		u8 type;
 		u8 componentCount;
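For context, this descriptor carries exactly the fields a GL attribute pointer needs, and the renderer hunk further down consumes it that way. A minimal sketch of the mapping (the struct mirrors the diff; the glad include, format table, and helper are illustrative assumptions, not code from this commit):

#include <glad/gl.h>   // assumed GL loader
#include <cstdint>

struct AttributeInfo {
	uint8_t* data;           // CPU-side source of the attribute stream
	uint32_t offset;         // byte offset of the attribute within a vertex record
	uint32_t size;           // bytes of one element: componentCount * component size
	uint32_t stride;         // byte distance between consecutive vertices
	uint8_t type;            // PICA component type index
	uint8_t componentCount;  // 1 to 4 components
};

// Hypothetical table translating PICA component types (byte, ubyte, short, float) to GL enums
static const GLenum kAttributeFormats[] = {GL_BYTE, GL_UNSIGNED_BYTE, GL_SHORT, GL_FLOAT};

// Hypothetical helper: describe one attribute to GL straight from the descriptor
void bindAttribute(GLuint index, const AttributeInfo& attr, intptr_t bufferBase) {
	glVertexAttribPointer(index, attr.componentCount, kAttributeFormats[attr.type], GL_FALSE,
	                      attr.stride, reinterpret_cast<const void*>(bufferBase + attr.offset));
	glEnableVertexAttribArray(index);
}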

View file

@@ -62,6 +62,7 @@ class RendererGL final : public Renderer {
 	bool oldDepthmapEnable = false;
 	// Set by prepareDraw, tells us whether the current draw is using hw-accelerated shader
 	bool usingAcceleratedShader = false;
+	bool performIndexedRender = false;
 	// Cached pointer to the current vertex shader when using HW accelerated shaders
 	OpenGL::Shader* generatedVertexShader = nullptr;
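The new flag exists because the indexed/non-indexed decision is made in prepareForDraw, where the DrawAcceleration struct is at hand, but only acted on later in drawVertices. A stripped-down sketch of that handoff (type shapes and bodies are illustrative, not the emulator's code):

struct DrawAcceleration { bool indexed; /* ... */ };

class Renderer {
	bool performIndexedRender = false;

  public:
	void prepareForDraw(const DrawAcceleration* accel) {
		// Runs first: remember whether the upcoming draw uses an index buffer
		performIndexedRender = accel->indexed;
	}

	void drawVertices() {
		// Runs later, without access to the DrawAcceleration struct
		if (performIndexedRender) {
			// bind the index buffer and issue an indexed draw
		} else {
			// plain non-indexed draw
		}
	}
};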

View file

@@ -98,6 +98,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 			attr.componentCount = size;
 			attr.offset = attributeOffset;
 			attr.size = size * sizePerComponent[attribType];
+			attr.stride = attrData.size;
 			attr.type = attribType;
 			attr.isPadding = false;
 			attributeOffset += attr.size;
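Note the distinction the new line introduces: attr.size is the byte size of one element of this attribute, while attr.stride is the size of the whole loader record (attrData.size), i.e. the distance between two consecutive vertices. A worked example with assumed sizes, for a loader interleaving a float vec3 position and a float vec2 UV:

#include <cstdint>
#include <cstdio>

struct Attr { uint32_t offset, size, stride; };

int main() {
	const uint32_t sizes[] = {12, 8};  // vec3 float position, vec2 float uv
	Attr attrs[2];
	uint32_t attributeOffset = 0;

	// Attributes are laid out back to back inside one interleaved record
	for (int i = 0; i < 2; i++) {
		attrs[i].offset = attributeOffset;
		attrs[i].size = sizes[i];
		attributeOffset += sizes[i];
	}

	// Every attribute in the record shares the record's total size as its stride
	for (Attr& a : attrs) a.stride = attributeOffset;

	// Prints: offset=0 size=12 stride=20, then offset=12 size=8 stride=20
	for (const Attr& a : attrs) std::printf("offset=%u size=%u stride=%u\n", a.offset, a.size, a.stride);
}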

View file

@@ -120,6 +120,8 @@ void GPU::reset() {
 	renderer->reset();
 }
 
+static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
+
 // Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
 // And whether we are going to use the shader JIT (second template parameter)
 void GPU::drawArrays(bool indexed) {
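The comment above describes a common dispatch pattern: runtime booleans are lifted into template parameters so each indexed/mode combination compiles to its own specialized loop, branch-free on the hot path. A self-contained sketch of the idea (not the emulator's actual code):

#include <cstdio>

enum class ShaderExecMode { Interpreter, JIT, Hardware };

// Each <indexed, mode> pair becomes a separate, fully specialized function
template <bool indexed, ShaderExecMode mode>
void drawArraysImpl() {
	std::printf("indexed=%d mode=%d\n", int(indexed), int(mode));
}

// Runtime dispatcher: branch once here instead of once per vertex
void drawArrays(bool indexed, bool jitAvailable) {
	if (jitAvailable) {
		indexed ? drawArraysImpl<true, ShaderExecMode::JIT>() : drawArraysImpl<false, ShaderExecMode::JIT>();
	} else {
		indexed ? drawArraysImpl<true, ShaderExecMode::Interpreter>() : drawArraysImpl<false, ShaderExecMode::Interpreter>();
	}
}

int main() { drawArrays(true, false); }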
@@ -134,11 +136,13 @@ void GPU::drawArrays(bool indexed) {
 	const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel);
 
 	if (hwShaders) {
-		if (indexed) {
-			drawArrays<true, ShaderExecMode::Hardware>();
-		} else {
-			drawArrays<false, ShaderExecMode::Hardware>();
-		}
+		// Hardware shaders have their own accelerated code path for draws, so they skip everything here
+		const PICA::PrimType primType = static_cast<PICA::PrimType>(Helpers::getBits<8, 2>(regs[PICA::InternalRegs::PrimitiveConfig]));
+		// Total # of vertices to render
+		const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];
+
+		// Note: In the hardware shader path the vertices span shouldn't actually be used as the rasterizer will perform its own attribute fetching
+		renderer->drawVertices(primType, std::span(vertices).first(vertexCount));
 	} else {
 		const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
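Helpers::getBits<8, 2>(...) extracts the 2-bit primitive type from bits 8-9 of PrimitiveConfig. An equivalent standalone decode, with the value mapping as documented for the PICA on 3DBrew:

#include <cstdint>

// 0 = triangle list, 1 = triangle strip, 2 = triangle fan, 3 = geometry primitive
constexpr uint32_t primitiveType(uint32_t primitiveConfig) {
	return (primitiveConfig >> 8) & 0b11;
}
static_assert(primitiveType(0x100) == 1, "bits 8-9 select the primitive type");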
@@ -158,33 +162,17 @@ void GPU::drawArrays(bool indexed) {
 	}
 }
 
-// We need a union here, because unfortunately in CPU shaders we only need to store the vertex shader outputs in the vertex buffer,
-// which consist of 8 vec4 attributes, while with GPU shaders we need to pass all the vertex shader inputs to the GPU, which consist
-// of 16 vec4 attributes
-union PICAVertexBuffer {
-	// Used with CPU shaders
-	std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
-	// Used with GPU shaders. We can have up to 16 attributes per vertex, each attribute with 4 floats
-	std::array<float, Renderer::vertexBufferSize * 16 * 4> vsInputs;
-
-	PICAVertexBuffer() {}
-};
-static PICAVertexBuffer vertexBuffer;
-
 template <bool indexed, ShaderExecMode mode>
 void GPU::drawArrays() {
 	if constexpr (mode == ShaderExecMode::JIT) {
 		shaderJIT.prepare(shaderUnit.vs);
-	} else if constexpr (mode == ShaderExecMode::Hardware) {
-		// Hardware shaders have their own accelerated code path for draws, so they're not meant to take this path
-		Helpers::panic("GPU::DrawArrays: Hardware shaders shouldn't take this path!");
 	}
 
-	// We can have up to 16 attributes, each one consisting of 4 floats
-	constexpr u32 maxAttrSizeInFloats = 16 * 4;
-	auto& vertices = vertexBuffer.vertices;
-
 	if constexpr (mode != ShaderExecMode::Hardware) {
 		setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]);
 	}
 
 	// Base address for vertex attributes
 	// The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible
@@ -257,15 +245,7 @@ void GPU::drawArrays() {
 		size_t tag = vertexIndex % vertexCacheSize;
 		// Cache hit
 		if (cache.validBits[tag] && cache.ids[tag] == vertexIndex) {
-			if constexpr (mode != ShaderExecMode::Hardware) {
-				vertices[i] = vertices[cache.bufferPositions[tag]];
-			} else {
-				const u32 cachedBufferPosition = cache.bufferPositions[tag] * maxAttrSizeInFloats;
-				std::memcpy(
-					&vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cachedBufferPosition],
-					sizeof(float) * maxAttrSizeInFloats
-				);
-			}
+			vertices[i] = vertices[cache.bufferPositions[tag]];
 			continue;
 		}
@@ -370,39 +350,29 @@ void GPU::drawArrays() {
 			}
 		}
 
-		// Running shader on the CPU instead of the GPU
-		if constexpr (mode == ShaderExecMode::Interpreter || mode == ShaderExecMode::JIT) {
-			// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
-			// Based on the SH_ATTRIBUTES_PERMUTATION registers.
-			// Ie it might map attribute #0 to v2, #1 to v7, etc
-			for (int j = 0; j < totalAttribCount; j++) {
-				const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
-				std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
-			}
+		// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
+		// Based on the SH_ATTRIBUTES_PERMUTATION registers.
+		// Ie it might map attribute #0 to v2, #1 to v7, etc
+		for (int j = 0; j < totalAttribCount; j++) {
+			const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
+			std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
+		}
 
-			if constexpr (mode == ShaderExecMode::JIT) {
-				shaderJIT.run(shaderUnit.vs);
-			} else {
-				shaderUnit.vs.run();
-			}
+		if constexpr (mode == ShaderExecMode::JIT) {
+			shaderJIT.run(shaderUnit.vs);
+		} else {
+			shaderUnit.vs.run();
+		}
 
-			PICA::Vertex& out = vertices[i];
-			// Map shader outputs to fixed function properties
-			const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
-			for (int i = 0; i < totalShaderOutputs; i++) {
-				const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i];
+		PICA::Vertex& out = vertices[i];
+		// Map shader outputs to fixed function properties
+		const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
+		for (int i = 0; i < totalShaderOutputs; i++) {
+			const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i];
 
-				for (int j = 0; j < 4; j++) { // pls unroll
-					const u32 mapping = (config >> (j * 8)) & 0x1F;
-					out.raw[mapping] = vsOutputRegisters[i][j];
-				}
-			}
-		} else { // Using hw shaders and running the shader on the CPU, just write the inputs to the attribute buffer directly
-			float* out = &vertexBuffer.vsInputs[i * maxAttrSizeInFloats];
-			for (int j = 0; j < totalAttribCount; j++) {
-				const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
-				// Multiply mapping * 4 as mapping refers to a vec4 whereas out is an array of floats
-				std::memcpy(&out[mapping * 4], &currentAttributes[j], sizeof(vec4f));
-			}
-		}
+			for (int j = 0; j < 4; j++) { // pls unroll
+				const u32 mapping = (config >> (j * 8)) & 0x1F;
+				out.raw[mapping] = vsOutputRegisters[i][j];
+			}
+		}
 	}
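Both mapping loops above decode packed register fields: SH_ATTRIBUTES_PERMUTATION holds one nibble per fetched attribute naming the destination input register, and each ShaderOutmap register holds four bytes naming which fixed-function property each output component feeds. A standalone model of the two decodes (the type aliases and the 64-bit config width are assumptions; the shifts and masks follow the diff):

#include <cstdint>
#include <cstring>

using vec4f = float[4];

// One nibble per attribute: e.g. attribute #0 -> v2, #1 -> v7, ...
void mapInputs(uint64_t inputAttrCfg, const vec4f* fetched, vec4f* inputRegs, int totalAttribCount) {
	for (int j = 0; j < totalAttribCount; j++) {
		const uint32_t mapping = (inputAttrCfg >> (j * 4)) & 0xf;
		std::memcpy(&inputRegs[mapping], &fetched[j], sizeof(vec4f));
	}
}

// One byte per output component: the low 5 bits select a fixed-function slot (0-0x1F)
void mapOutputs(const uint32_t* outmapRegs, uint32_t totalShaderOutputs, const vec4f* vsOut, float* rawVertex) {
	for (uint32_t i = 0; i < totalShaderOutputs; i++) {
		const uint32_t config = outmapRegs[i];
		for (int j = 0; j < 4; j++) {
			const uint32_t mapping = (config >> (j * 8)) & 0x1F;
			rawVertex[mapping] = vsOut[i][j];
		}
	}
}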

View file

@@ -129,11 +129,6 @@ void RendererGL::initGraphicsContextInternal() {
 	// Initialize the VAO used for hw shaders
 	hwShaderVAO.create();
 	gl.bindVAO(hwShaderVAO);
-	for (int attr = 0; attr < 16; attr++) {
-		hwShaderVAO.setAttributeFloat<float>(attr, 4, sizeof(Vertex) * 2, attr * sizeof(float) * 4);
-		hwShaderVAO.enableAttribute(attr);
-	}
 
 	dummyVBO.create();
 	dummyVAO.create();
@@ -439,8 +434,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 	const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
 	gl.disableScissor();
-	vbo.bind();
-	gl.bindVAO(usingAcceleratedShader ? hwShaderVAO : defaultVAO);
+
+	if (usingAcceleratedShader) {
+		hwVertexBuffer->Bind();
+		gl.bindVAO(hwShaderVAO);
+	} else {
+		vbo.bind();
+		gl.bindVAO(defaultVAO);
+	}
 
 	gl.enableClipPlane(0); // Clipping plane 0 is always enabled
 	if (regs[PICA::InternalRegs::ClipEnable] & 1) {
@@ -503,15 +504,19 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 	setupStencilTest(stencilEnable);
 
-	// If we're using hardware shaders, the vertex array works completely different
-	// And instead of 8 vec4 attributes, each vertex is 16 vec4 attributes. We use a union + aliasing which is not ideal for readability.
 	if (!usingAcceleratedShader) {
 		vbo.bufferVertsSub(vertices);
+		OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
 	} else {
-		glBufferSubData(GL_ARRAY_BUFFER, 0, vertices.size_bytes() * 2, vertices.data());
+		if (performIndexedRender) {
+			// When doing indexed rendering, bind the IBO and use glDrawRangeElementsBaseVertex to issue the indexed draw
+			hwIndexBuffer->Bind();
+			//glDrawRangeElementsBaseVertex();
+		} else {
+			// When doing non-indexed rendering, just use glDrawArrays
+			OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
+		}
 	}
-
-	OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
 }
 
 void RendererGL::display() {
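The indexed branch above binds the IBO but leaves the actual draw call commented out. For orientation, a sketch of how the call might eventually be issued; every parameter here is an assumption about future plumbing, not something this commit implements:

#include <glad/gl.h>
#include <cstdint>

void drawIndexed(GLenum topology, GLuint minIndex, GLuint maxIndex, GLsizei indexCount,
                 GLenum indexType, uintptr_t iboOffset, GLint baseVertex) {
	// Assumes the index buffer is already bound to GL_ELEMENT_ARRAY_BUFFER,
	// which is what hwIndexBuffer->Bind() does in the hunk above.
	// minIndex/maxIndex tell the driver the range of vertices the indices reference.
	glDrawRangeElementsBaseVertex(topology, minIndex, maxIndex, indexCount, indexType,
	                              reinterpret_cast<void*>(iboOffset), baseVertex);
}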
@@ -1003,6 +1008,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration*
 		// Upload vertex data and index buffer data to our GPU
 		accelerateVertexUpload(shaderUnit, accel);
+		performIndexedRender = accel->indexed;
 	}
 }
@@ -1149,7 +1155,9 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
 	}
 
 	auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize);
 	u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer);
+
+	gl.bindVAO(hwShaderVAO);
 
 	for (int i = 0; i < totalAttribCount; i++) {
 		const auto& attrib = accel->attributeInfo[i];
@@ -1161,9 +1169,13 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
 			continue;
 		}
 
-		const u32 attributeSize = attrib.size * vertexCount;
+		glVertexAttribPointer(i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride, reinterpret_cast<GLvoid*>(vertexBufferRes.buffer_offset + attrib.offset));
+		// TODO: Disable unused attributes as well
+		hwShaderVAO.enableAttribute(i);
+
+		const u32 attributeSize = attrib.size * vertexCount;
 		std::memcpy(vertexData, attrib.data, attributeSize);
 		vertexData += attributeSize;
 	}
 }
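The loop above streams each attribute into a mapped slice of the vertex buffer and points GL at it via buffer_offset plus the attribute's offset. A condensed model of that pattern (the MapResult shape mirrors the code's Map() result; the helper and names are otherwise illustrative):

#include <cstdint>
#include <cstring>

struct MapResult {
	uint8_t* pointer;         // CPU-visible write pointer into the mapped slice
	uintptr_t buffer_offset;  // offset of the slice from the start of the GL buffer
};

// Copy one attribute stream into the mapped slice and return the GL-side offset
// a glVertexAttribPointer call would use for this stream.
uintptr_t uploadStream(const MapResult& slice, uint32_t& writeCursor, const uint8_t* src, uint32_t bytes) {
	const uintptr_t glOffset = slice.buffer_offset + writeCursor;
	std::memcpy(slice.pointer + writeCursor, src, bytes);
	writeCursor += bytes;
	return glOffset;
}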

View file

@@ -132,7 +132,7 @@ namespace {
 		const u32 end = GetSyncIndexForOffset(offset);
 		for (; m_used_block_index < end; m_used_block_index++) {
 			if (m_sync_objects[m_used_block_index]) {
-				Helpers::panic("GL stream buffer: Fence slot we're trying to insert is already in use");
+				Helpers::warn("GL stream buffer: Fence slot we're trying to insert is already in use");
 			}
 			m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
@@ -149,7 +149,7 @@ namespace {
 		const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);
 		for (; m_available_block_index < end; m_available_block_index++) {
 			if (!m_sync_objects[m_available_block_index]) [[unlikely]] {
-				Helpers::panic("GL stream buffer: Fence slot we're trying to wait on is not in use");
+				Helpers::warn("GL stream buffer: Fence slot we're trying to wait on is not in use");
 			}
 			WaitForSync(m_sync_objects[m_available_block_index]);
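Both warnings guard the stream buffer's fence ring: the buffer is divided into sync blocks, a fence is inserted after the write cursor moves past a block, and that fence is waited on before the block is reused. A condensed sketch of the mechanism (the constant and names are illustrative; the GL calls are the real ones):

#include <glad/gl.h>

constexpr int kNumSyncPoints = 16;
static GLsync s_fences[kNumSyncPoints] = {};

// Called after writing past a block: mark it as in flight on the GPU
void fenceBlock(int block) {
	s_fences[block] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
}

// Called before reusing a block: stall the CPU until the GPU has consumed it
void waitForBlock(int block) {
	if (GLsync fence = s_fences[block]) {
		glClientWaitSync(fence, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
		glDeleteSync(fence);
		s_fences[block] = nullptr;
	}
}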