mirror of
https://github.com/wheremyfoodat/Panda3DS.git
synced 2025-04-07 14:45:41 +12:00
Get first renders working with accelerated draws
This commit is contained in:
parent
33e63f7d7a
commit
5432a5a0d8
6 changed files with 63 additions and 78 deletions
|
@ -12,6 +12,7 @@ namespace PICA {
|
|||
u8* data;
|
||||
u32 offset;
|
||||
u32 size;
|
||||
u32 stride;
|
||||
|
||||
u8 type;
|
||||
u8 componentCount;
|
||||
|
|
|
@ -62,6 +62,7 @@ class RendererGL final : public Renderer {
|
|||
bool oldDepthmapEnable = false;
|
||||
// Set by prepareDraw; tells us whether the current draw is using a hw-accelerated shader
|
||||
bool usingAcceleratedShader = false;
|
||||
bool performIndexedRender = false;
|
||||
|
||||
// Cached pointer to the current vertex shader when using HW accelerated shaders
|
||||
OpenGL::Shader* generatedVertexShader = nullptr;
|
||||
|
|
|
@ -98,6 +98,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
|
|||
attr.componentCount = size;
|
||||
attr.offset = attributeOffset;
|
||||
attr.size = size * sizePerComponent[attribType];
|
||||
attr.stride = attrData.size;
|
||||
attr.type = attribType;
|
||||
attr.isPadding = false;
|
||||
attributeOffset += attr.size;
|
||||
|
|
|
@ -120,6 +120,8 @@ void GPU::reset() {
|
|||
renderer->reset();
|
||||
}
|
||||
|
||||
static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
|
||||
|
||||
// Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
|
||||
// And whether we are going to use the shader JIT (second template parameter)
|
||||
void GPU::drawArrays(bool indexed) {
|
||||
|
@ -134,11 +136,13 @@ void GPU::drawArrays(bool indexed) {
|
|||
const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel);
|
||||
|
||||
if (hwShaders) {
|
||||
if (indexed) {
|
||||
drawArrays<true, ShaderExecMode::Hardware>();
|
||||
} else {
|
||||
drawArrays<false, ShaderExecMode::Hardware>();
|
||||
}
|
||||
// Hardware shaders have their own accelerated code path for draws, so they skip everything here
|
||||
const PICA::PrimType primType = static_cast<PICA::PrimType>(Helpers::getBits<8, 2>(regs[PICA::InternalRegs::PrimitiveConfig]));
|
||||
// Total # of vertices to render
|
||||
const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];
|
||||
|
||||
// Note: In the hardware shader path the vertices span shouldn't actually be used as the rasterizer will perform its own attribute fetching
|
||||
renderer->drawVertices(primType, std::span(vertices).first(vertexCount));
|
||||
} else {
|
||||
const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
|
||||
|
||||
|
@ -158,33 +162,17 @@ void GPU::drawArrays(bool indexed) {
|
|||
}
|
||||
}
|
||||
|
||||
// We need a union here, because unfortunately in CPU shaders we only need to store the vertex shader outputs in the vertex buffer,
|
||||
// which consist of 8 vec4 attributes, while with GPU shaders we need to pass all the vertex shader inputs to the GPU, which consist
|
||||
// of 16 vec4 attributes
|
||||
union PICAVertexBuffer {
|
||||
// Used with CPU shaders
|
||||
std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
|
||||
// Used with GPU shaders. We can have up to 16 attributes per vertex, each attribute with 4 floats
|
||||
std::array<float, Renderer::vertexBufferSize * 16 * 4> vsInputs;
|
||||
|
||||
PICAVertexBuffer() {}
|
||||
};
|
||||
|
||||
static PICAVertexBuffer vertexBuffer;
|
||||
|
||||
template <bool indexed, ShaderExecMode mode>
|
||||
void GPU::drawArrays() {
|
||||
if constexpr (mode == ShaderExecMode::JIT) {
|
||||
shaderJIT.prepare(shaderUnit.vs);
|
||||
} else if constexpr (mode == ShaderExecMode::Hardware) {
|
||||
// Hardware shaders have their own accelerated code path for draws, so they're not meant to take this path
|
||||
Helpers::panic("GPU::DrawArrays: Hardware shaders shouldn't take this path!");
|
||||
}
|
||||
|
||||
// We can have up to 16 attributes, each one consisting of 4 floats
|
||||
constexpr u32 maxAttrSizeInFloats = 16 * 4;
|
||||
auto& vertices = vertexBuffer.vertices;
|
||||
|
||||
if constexpr (mode != ShaderExecMode::Hardware) {
|
||||
setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]);
|
||||
}
|
||||
|
||||
// Base address for vertex attributes
|
||||
// The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible
|
||||
|
@ -257,15 +245,7 @@ void GPU::drawArrays() {
|
|||
size_t tag = vertexIndex % vertexCacheSize;
|
||||
// Cache hit
|
||||
if (cache.validBits[tag] && cache.ids[tag] == vertexIndex) {
|
||||
if constexpr (mode != ShaderExecMode::Hardware) {
|
||||
vertices[i] = vertices[cache.bufferPositions[tag]];
|
||||
} else {
|
||||
const u32 cachedBufferPosition = cache.bufferPositions[tag] * maxAttrSizeInFloats;
|
||||
std::memcpy(
|
||||
&vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cachedBufferPosition],
|
||||
sizeof(float) * maxAttrSizeInFloats
|
||||
);
|
||||
}
|
||||
vertices[i] = vertices[cache.bufferPositions[tag]];
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -370,39 +350,29 @@ void GPU::drawArrays() {
|
|||
}
|
||||
}
|
||||
|
||||
// Running shader on the CPU instead of the GPU
|
||||
if constexpr (mode == ShaderExecMode::Interpreter || mode == ShaderExecMode::JIT) {
|
||||
// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
|
||||
// Based on the SH_ATTRIBUTES_PERMUTATION registers.
|
||||
// Ie it might map attribute #0 to v2, #1 to v7, etc
|
||||
for (int j = 0; j < totalAttribCount; j++) {
|
||||
const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
|
||||
std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
|
||||
}
|
||||
// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
|
||||
// Based on the SH_ATTRIBUTES_PERMUTATION registers.
|
||||
// Ie it might map attribute #0 to v2, #1 to v7, etc
|
||||
for (int j = 0; j < totalAttribCount; j++) {
|
||||
const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
|
||||
std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
|
||||
}
|
||||
|
||||
if constexpr (mode == ShaderExecMode::JIT) {
|
||||
shaderJIT.run(shaderUnit.vs);
|
||||
} else {
|
||||
shaderUnit.vs.run();
|
||||
}
|
||||
if constexpr (mode == ShaderExecMode::JIT) {
|
||||
shaderJIT.run(shaderUnit.vs);
|
||||
} else {
|
||||
shaderUnit.vs.run();
|
||||
}
|
||||
|
||||
PICA::Vertex& out = vertices[i];
|
||||
// Map shader outputs to fixed function properties
|
||||
const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
|
||||
for (int i = 0; i < totalShaderOutputs; i++) {
|
||||
const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i];
|
||||
PICA::Vertex& out = vertices[i];
|
||||
// Map shader outputs to fixed function properties
|
||||
const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
|
||||
for (int i = 0; i < totalShaderOutputs; i++) {
|
||||
const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i];
|
||||
|
||||
for (int j = 0; j < 4; j++) { // pls unroll
|
||||
const u32 mapping = (config >> (j * 8)) & 0x1F;
|
||||
out.raw[mapping] = vsOutputRegisters[i][j];
|
||||
}
|
||||
}
|
||||
} else { // Using hw shaders and running the shader on the CPU, just write the inputs to the attribute buffer directly
|
||||
float* out = &vertexBuffer.vsInputs[i * maxAttrSizeInFloats];
|
||||
for (int j = 0; j < totalAttribCount; j++) {
|
||||
const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
|
||||
// Multiply mapping * 4 as mapping refers to a vec4 whereas out is an array of floats
|
||||
std::memcpy(&out[mapping * 4], &currentAttributes[j], sizeof(vec4f));
|
||||
for (int j = 0; j < 4; j++) { // pls unroll
|
||||
const u32 mapping = (config >> (j * 8)) & 0x1F;
|
||||
out.raw[mapping] = vsOutputRegisters[i][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -129,11 +129,6 @@ void RendererGL::initGraphicsContextInternal() {
|
|||
|
||||
// Initialize the VAO used for hw shaders
|
||||
hwShaderVAO.create();
|
||||
gl.bindVAO(hwShaderVAO);
|
||||
for (int attr = 0; attr < 16; attr++) {
|
||||
hwShaderVAO.setAttributeFloat<float>(attr, 4, sizeof(Vertex) * 2, attr * sizeof(float) * 4);
|
||||
hwShaderVAO.enableAttribute(attr);
|
||||
}
|
||||
|
||||
dummyVBO.create();
|
||||
dummyVAO.create();
|
||||
|
@ -439,8 +434,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
|
|||
|
||||
const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
|
||||
gl.disableScissor();
|
||||
vbo.bind();
|
||||
gl.bindVAO(usingAcceleratedShader ? hwShaderVAO : defaultVAO);
|
||||
|
||||
if (usingAcceleratedShader) {
|
||||
hwVertexBuffer->Bind();
|
||||
gl.bindVAO(hwShaderVAO);
|
||||
} else {
|
||||
vbo.bind();
|
||||
gl.bindVAO(defaultVAO);
|
||||
}
|
||||
|
||||
gl.enableClipPlane(0); // Clipping plane 0 is always enabled
|
||||
if (regs[PICA::InternalRegs::ClipEnable] & 1) {
|
||||
|
@ -503,15 +504,19 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
|
|||
|
||||
setupStencilTest(stencilEnable);
|
||||
|
||||
// If we're using hardware shaders, the vertex array works completely differently
|
||||
// And instead of 8 vec4 attributes, each vertex is 16 vec4 attributes. We use a union + aliasing which is not ideal for readability.
|
||||
if (!usingAcceleratedShader) {
|
||||
vbo.bufferVertsSub(vertices);
|
||||
OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
|
||||
} else {
|
||||
glBufferSubData(GL_ARRAY_BUFFER, 0, vertices.size_bytes() * 2, vertices.data());
|
||||
if (performIndexedRender) {
|
||||
// When doing indexed rendering, bind the IBO and use glDrawRangeElementsBaseVertex to issue the indexed draw
|
||||
hwIndexBuffer->Bind();
|
||||
//glDrawRangeElementsBaseVertex();
|
||||
} else {
|
||||
// When doing non-indexed rendering, just use glDrawArrays
|
||||
OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
|
||||
}
|
||||
}
|
||||
|
||||
OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
|
||||
}
|
||||
|
||||
void RendererGL::display() {
|
||||
|
@ -1003,6 +1008,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration*
|
|||
|
||||
// Upload vertex data and index buffer data to our GPU
|
||||
accelerateVertexUpload(shaderUnit, accel);
|
||||
performIndexedRender = accel->indexed;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1149,7 +1155,9 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
|
|||
}
|
||||
|
||||
auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize);
|
||||
|
||||
u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer);
|
||||
gl.bindVAO(hwShaderVAO);
|
||||
|
||||
for (int i = 0; i < totalAttribCount; i++) {
|
||||
const auto& attrib = accel->attributeInfo[i];
|
||||
|
@ -1161,9 +1169,13 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
|
|||
continue;
|
||||
}
|
||||
|
||||
const u32 attributeSize = attrib.size * vertexCount;
|
||||
glVertexAttribPointer(i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride, reinterpret_cast<GLvoid*>(vertexBufferRes.buffer_offset + attrib.offset));
|
||||
// TODO: Disable unused attributes as well
|
||||
hwShaderVAO.enableAttribute(i);
|
||||
|
||||
const u32 attributeSize = attrib.size * vertexCount;
|
||||
std::memcpy(vertexData, attrib.data, attributeSize);
|
||||
|
||||
vertexData += attributeSize;
|
||||
}
|
||||
}
|
||||
|
|
4
third_party/duckstation/gl/stream_buffer.cpp
vendored
4
third_party/duckstation/gl/stream_buffer.cpp
vendored
|
@ -132,7 +132,7 @@ namespace {
|
|||
const u32 end = GetSyncIndexForOffset(offset);
|
||||
for (; m_used_block_index < end; m_used_block_index++) {
|
||||
if (m_sync_objects[m_used_block_index]) {
|
||||
Helpers::panic("GL stream buffer: Fence slot we're trying to insert is already in use");
|
||||
Helpers::warn("GL stream buffer: Fence slot we're trying to insert is already in use");
|
||||
}
|
||||
|
||||
m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
|
||||
|
@ -149,7 +149,7 @@ namespace {
|
|||
const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);
|
||||
for (; m_available_block_index < end; m_available_block_index++) {
|
||||
if (!m_sync_objects[m_used_block_index]) [[unlikely]] {
|
||||
Helpers::panic("GL stream buffer: Fence slot we're trying to wait on in not in use");
|
||||
Helpers::warn("GL stream buffer: Fence slot we're trying to wait on in not in use");
|
||||
}
|
||||
|
||||
WaitForSync(m_sync_objects[m_available_block_index]);
|
||||
|
|
Loading…
Add table
Reference in a new issue