mirror of
https://github.com/wheremyfoodat/Panda3DS.git
synced 2025-04-12 09:09:47 +12:00
Get first renders working with accelerated draws
This commit is contained in:
parent
33e63f7d7a
commit
5432a5a0d8
6 changed files with 63 additions and 78 deletions
|
@ -12,6 +12,7 @@ namespace PICA {
|
||||||
u8* data;
|
u8* data;
|
||||||
u32 offset;
|
u32 offset;
|
||||||
u32 size;
|
u32 size;
|
||||||
|
u32 stride;
|
||||||
|
|
||||||
u8 type;
|
u8 type;
|
||||||
u8 componentCount;
|
u8 componentCount;
|
||||||
|
|
|
@ -62,6 +62,7 @@ class RendererGL final : public Renderer {
|
||||||
bool oldDepthmapEnable = false;
|
bool oldDepthmapEnable = false;
|
||||||
// Set by prepareDraw, tells us whether the current draw is using hw-accelerated shader
|
// Set by prepareDraw, tells us whether the current draw is using hw-accelerated shader
|
||||||
bool usingAcceleratedShader = false;
|
bool usingAcceleratedShader = false;
|
||||||
|
bool performIndexedRender = false;
|
||||||
|
|
||||||
// Cached pointer to the current vertex shader when using HW accelerated shaders
|
// Cached pointer to the current vertex shader when using HW accelerated shaders
|
||||||
OpenGL::Shader* generatedVertexShader = nullptr;
|
OpenGL::Shader* generatedVertexShader = nullptr;
|
||||||
|
|
|
@ -98,6 +98,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
|
||||||
attr.componentCount = size;
|
attr.componentCount = size;
|
||||||
attr.offset = attributeOffset;
|
attr.offset = attributeOffset;
|
||||||
attr.size = size * sizePerComponent[attribType];
|
attr.size = size * sizePerComponent[attribType];
|
||||||
|
attr.stride = attrData.size;
|
||||||
attr.type = attribType;
|
attr.type = attribType;
|
||||||
attr.isPadding = false;
|
attr.isPadding = false;
|
||||||
attributeOffset += attr.size;
|
attributeOffset += attr.size;
|
||||||
|
|
|
@ -120,6 +120,8 @@ void GPU::reset() {
|
||||||
renderer->reset();
|
renderer->reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
|
||||||
|
|
||||||
// Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
|
// Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
|
||||||
// And whether we are going to use the shader JIT (second template parameter)
|
// And whether we are going to use the shader JIT (second template parameter)
|
||||||
void GPU::drawArrays(bool indexed) {
|
void GPU::drawArrays(bool indexed) {
|
||||||
|
@ -134,11 +136,13 @@ void GPU::drawArrays(bool indexed) {
|
||||||
const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel);
|
const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel);
|
||||||
|
|
||||||
if (hwShaders) {
|
if (hwShaders) {
|
||||||
if (indexed) {
|
// Hardware shaders have their own accelerated code path for draws, so they skip everything here
|
||||||
drawArrays<true, ShaderExecMode::Hardware>();
|
const PICA::PrimType primType = static_cast<PICA::PrimType>(Helpers::getBits<8, 2>(regs[PICA::InternalRegs::PrimitiveConfig]));
|
||||||
} else {
|
// Total # of vertices to render
|
||||||
drawArrays<false, ShaderExecMode::Hardware>();
|
const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];
|
||||||
}
|
|
||||||
|
// Note: In the hardware shader path the vertices span shouldn't actually be used as the rasterizer will perform its own attribute fetching
|
||||||
|
renderer->drawVertices(primType, std::span(vertices).first(vertexCount));
|
||||||
} else {
|
} else {
|
||||||
const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
|
const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
|
||||||
|
|
||||||
|
@ -158,33 +162,17 @@ void GPU::drawArrays(bool indexed) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// We need a union here, because unfortunately in CPU shaders we only need to store the vertex shader outputs in the vertex buffer,
|
|
||||||
// which consist of 8 vec4 attributes, while with GPU shaders we need to pass all the vertex shader inputs to the GPU, which consist
|
|
||||||
// of 16 vec4 attributes
|
|
||||||
union PICAVertexBuffer {
|
|
||||||
// Used with CPU shaders
|
|
||||||
std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
|
|
||||||
// Used with GPU shaders. We can have up to 16 attributes per vertex, each attribute with 4 floats
|
|
||||||
std::array<float, Renderer::vertexBufferSize * 16 * 4> vsInputs;
|
|
||||||
|
|
||||||
PICAVertexBuffer() {}
|
|
||||||
};
|
|
||||||
|
|
||||||
static PICAVertexBuffer vertexBuffer;
|
|
||||||
|
|
||||||
template <bool indexed, ShaderExecMode mode>
|
template <bool indexed, ShaderExecMode mode>
|
||||||
void GPU::drawArrays() {
|
void GPU::drawArrays() {
|
||||||
if constexpr (mode == ShaderExecMode::JIT) {
|
if constexpr (mode == ShaderExecMode::JIT) {
|
||||||
shaderJIT.prepare(shaderUnit.vs);
|
shaderJIT.prepare(shaderUnit.vs);
|
||||||
|
} else if constexpr (mode == ShaderExecMode::Hardware) {
|
||||||
|
// Hardware shaders have their own accelerated code path for draws, so they're not meant to take this path
|
||||||
|
Helpers::panic("GPU::DrawArrays: Hardware shaders shouldn't take this path!");
|
||||||
}
|
}
|
||||||
|
|
||||||
// We can have up to 16 attributes, each one consisting of 4 floats
|
// We can have up to 16 attributes, each one consisting of 4 floats
|
||||||
constexpr u32 maxAttrSizeInFloats = 16 * 4;
|
constexpr u32 maxAttrSizeInFloats = 16 * 4;
|
||||||
auto& vertices = vertexBuffer.vertices;
|
|
||||||
|
|
||||||
if constexpr (mode != ShaderExecMode::Hardware) {
|
|
||||||
setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Base address for vertex attributes
|
// Base address for vertex attributes
|
||||||
// The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible
|
// The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible
|
||||||
|
@ -257,15 +245,7 @@ void GPU::drawArrays() {
|
||||||
size_t tag = vertexIndex % vertexCacheSize;
|
size_t tag = vertexIndex % vertexCacheSize;
|
||||||
// Cache hit
|
// Cache hit
|
||||||
if (cache.validBits[tag] && cache.ids[tag] == vertexIndex) {
|
if (cache.validBits[tag] && cache.ids[tag] == vertexIndex) {
|
||||||
if constexpr (mode != ShaderExecMode::Hardware) {
|
vertices[i] = vertices[cache.bufferPositions[tag]];
|
||||||
vertices[i] = vertices[cache.bufferPositions[tag]];
|
|
||||||
} else {
|
|
||||||
const u32 cachedBufferPosition = cache.bufferPositions[tag] * maxAttrSizeInFloats;
|
|
||||||
std::memcpy(
|
|
||||||
&vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cachedBufferPosition],
|
|
||||||
sizeof(float) * maxAttrSizeInFloats
|
|
||||||
);
|
|
||||||
}
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -370,39 +350,29 @@ void GPU::drawArrays() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Running shader on the CPU instead of the GPU
|
// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
|
||||||
if constexpr (mode == ShaderExecMode::Interpreter || mode == ShaderExecMode::JIT) {
|
// Based on the SH_ATTRIBUTES_PERMUTATION registers.
|
||||||
// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
|
// Ie it might map attribute #0 to v2, #1 to v7, etc
|
||||||
// Based on the SH_ATTRIBUTES_PERMUTATION registers.
|
for (int j = 0; j < totalAttribCount; j++) {
|
||||||
// Ie it might map attribute #0 to v2, #1 to v7, etc
|
const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
|
||||||
for (int j = 0; j < totalAttribCount; j++) {
|
std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
|
||||||
const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
|
}
|
||||||
std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
|
|
||||||
}
|
|
||||||
|
|
||||||
if constexpr (mode == ShaderExecMode::JIT) {
|
if constexpr (mode == ShaderExecMode::JIT) {
|
||||||
shaderJIT.run(shaderUnit.vs);
|
shaderJIT.run(shaderUnit.vs);
|
||||||
} else {
|
} else {
|
||||||
shaderUnit.vs.run();
|
shaderUnit.vs.run();
|
||||||
}
|
}
|
||||||
|
|
||||||
PICA::Vertex& out = vertices[i];
|
PICA::Vertex& out = vertices[i];
|
||||||
// Map shader outputs to fixed function properties
|
// Map shader outputs to fixed function properties
|
||||||
const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
|
const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
|
||||||
for (int i = 0; i < totalShaderOutputs; i++) {
|
for (int i = 0; i < totalShaderOutputs; i++) {
|
||||||
const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i];
|
const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i];
|
||||||
|
|
||||||
for (int j = 0; j < 4; j++) { // pls unroll
|
for (int j = 0; j < 4; j++) { // pls unroll
|
||||||
const u32 mapping = (config >> (j * 8)) & 0x1F;
|
const u32 mapping = (config >> (j * 8)) & 0x1F;
|
||||||
out.raw[mapping] = vsOutputRegisters[i][j];
|
out.raw[mapping] = vsOutputRegisters[i][j];
|
||||||
}
|
|
||||||
}
|
|
||||||
} else { // Using hw shaders and running the shader on the CPU, just write the inputs to the attribute buffer directly
|
|
||||||
float* out = &vertexBuffer.vsInputs[i * maxAttrSizeInFloats];
|
|
||||||
for (int j = 0; j < totalAttribCount; j++) {
|
|
||||||
const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
|
|
||||||
// Multiply mapping * 4 as mapping refers to a vec4 whereas out is an array of floats
|
|
||||||
std::memcpy(&out[mapping * 4], &currentAttributes[j], sizeof(vec4f));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -129,11 +129,6 @@ void RendererGL::initGraphicsContextInternal() {
|
||||||
|
|
||||||
// Initialize the VAO used for hw shaders
|
// Initialize the VAO used for hw shaders
|
||||||
hwShaderVAO.create();
|
hwShaderVAO.create();
|
||||||
gl.bindVAO(hwShaderVAO);
|
|
||||||
for (int attr = 0; attr < 16; attr++) {
|
|
||||||
hwShaderVAO.setAttributeFloat<float>(attr, 4, sizeof(Vertex) * 2, attr * sizeof(float) * 4);
|
|
||||||
hwShaderVAO.enableAttribute(attr);
|
|
||||||
}
|
|
||||||
|
|
||||||
dummyVBO.create();
|
dummyVBO.create();
|
||||||
dummyVAO.create();
|
dummyVAO.create();
|
||||||
|
@ -439,8 +434,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
|
||||||
|
|
||||||
const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
|
const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
|
||||||
gl.disableScissor();
|
gl.disableScissor();
|
||||||
vbo.bind();
|
|
||||||
gl.bindVAO(usingAcceleratedShader ? hwShaderVAO : defaultVAO);
|
if (usingAcceleratedShader) {
|
||||||
|
hwVertexBuffer->Bind();
|
||||||
|
gl.bindVAO(hwShaderVAO);
|
||||||
|
} else {
|
||||||
|
vbo.bind();
|
||||||
|
gl.bindVAO(defaultVAO);
|
||||||
|
}
|
||||||
|
|
||||||
gl.enableClipPlane(0); // Clipping plane 0 is always enabled
|
gl.enableClipPlane(0); // Clipping plane 0 is always enabled
|
||||||
if (regs[PICA::InternalRegs::ClipEnable] & 1) {
|
if (regs[PICA::InternalRegs::ClipEnable] & 1) {
|
||||||
|
@ -503,15 +504,19 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
|
||||||
|
|
||||||
setupStencilTest(stencilEnable);
|
setupStencilTest(stencilEnable);
|
||||||
|
|
||||||
// If we're using hardware shaders, the vertex array works completely different
|
|
||||||
// And instead of 8 vec4 attributes, each vertex is 16 vec4 attributes. We use a union + aliasing which is not ideal for readability.
|
|
||||||
if (!usingAcceleratedShader) {
|
if (!usingAcceleratedShader) {
|
||||||
vbo.bufferVertsSub(vertices);
|
vbo.bufferVertsSub(vertices);
|
||||||
|
OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
|
||||||
} else {
|
} else {
|
||||||
glBufferSubData(GL_ARRAY_BUFFER, 0, vertices.size_bytes() * 2, vertices.data());
|
if (performIndexedRender) {
|
||||||
|
// When doing indexed rendering, bind the IBO and use glDrawRangeElementsBaseVertex to issue the indexed draw
|
||||||
|
hwIndexBuffer->Bind();
|
||||||
|
//glDrawRangeElementsBaseVertex();
|
||||||
|
} else {
|
||||||
|
// When doing non-indexed rendering, just use glDrawArrays
|
||||||
|
OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void RendererGL::display() {
|
void RendererGL::display() {
|
||||||
|
@ -1003,6 +1008,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration*
|
||||||
|
|
||||||
// Upload vertex data and index buffer data to our GPU
|
// Upload vertex data and index buffer data to our GPU
|
||||||
accelerateVertexUpload(shaderUnit, accel);
|
accelerateVertexUpload(shaderUnit, accel);
|
||||||
|
performIndexedRender = accel->indexed;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1149,7 +1155,9 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
|
||||||
}
|
}
|
||||||
|
|
||||||
auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize);
|
auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize);
|
||||||
|
|
||||||
u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer);
|
u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer);
|
||||||
|
gl.bindVAO(hwShaderVAO);
|
||||||
|
|
||||||
for (int i = 0; i < totalAttribCount; i++) {
|
for (int i = 0; i < totalAttribCount; i++) {
|
||||||
const auto& attrib = accel->attributeInfo[i];
|
const auto& attrib = accel->attributeInfo[i];
|
||||||
|
@ -1161,9 +1169,13 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const u32 attributeSize = attrib.size * vertexCount;
|
glVertexAttribPointer(i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride, reinterpret_cast<GLvoid*>(vertexBufferRes.buffer_offset + attrib.offset));
|
||||||
|
// TODO: Disable unused attributes as well
|
||||||
|
hwShaderVAO.enableAttribute(i);
|
||||||
|
|
||||||
|
const u32 attributeSize = attrib.size * vertexCount;
|
||||||
std::memcpy(vertexData, attrib.data, attributeSize);
|
std::memcpy(vertexData, attrib.data, attributeSize);
|
||||||
|
|
||||||
vertexData += attributeSize;
|
vertexData += attributeSize;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
4
third_party/duckstation/gl/stream_buffer.cpp
vendored
4
third_party/duckstation/gl/stream_buffer.cpp
vendored
|
@ -132,7 +132,7 @@ namespace {
|
||||||
const u32 end = GetSyncIndexForOffset(offset);
|
const u32 end = GetSyncIndexForOffset(offset);
|
||||||
for (; m_used_block_index < end; m_used_block_index++) {
|
for (; m_used_block_index < end; m_used_block_index++) {
|
||||||
if (m_sync_objects[m_used_block_index]) {
|
if (m_sync_objects[m_used_block_index]) {
|
||||||
Helpers::panic("GL stream buffer: Fence slot we're trying to insert is already in use");
|
Helpers::warn("GL stream buffer: Fence slot we're trying to insert is already in use");
|
||||||
}
|
}
|
||||||
|
|
||||||
m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
|
m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
|
||||||
|
@ -149,7 +149,7 @@ namespace {
|
||||||
const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);
|
const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);
|
||||||
for (; m_available_block_index < end; m_available_block_index++) {
|
for (; m_available_block_index < end; m_available_block_index++) {
|
||||||
if (!m_sync_objects[m_used_block_index]) [[unlikely]] {
|
if (!m_sync_objects[m_used_block_index]) [[unlikely]] {
|
||||||
Helpers::panic("GL stream buffer: Fence slot we're trying to wait on in not in use");
|
Helpers::warn("GL stream buffer: Fence slot we're trying to wait on in not in use");
|
||||||
}
|
}
|
||||||
|
|
||||||
WaitForSync(m_sync_objects[m_available_block_index]);
|
WaitForSync(m_sync_objects[m_available_block_index]);
|
||||||
|
|
Loading…
Add table
Reference in a new issue