Add GL Stream Buffer from Duckstation

This commit is contained in:
wheremyfoodat 2024-08-25 01:47:02 +03:00
parent a8b30ee2dc
commit e34bdb6841
7 changed files with 461 additions and 2 deletions

View file

@ -138,6 +138,7 @@ include_directories(${SDL2_INCLUDE_DIR})
include_directories(third_party/toml11)
include_directories(third_party/glm)
include_directories(third_party/renderdoc)
include_directories(third_party/duckstation)
add_subdirectory(third_party/cmrc)
@ -302,6 +303,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp
include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp include/renderdoc.hpp
include/align.hpp
)
cmrc_add_resource_library(
@ -334,7 +336,6 @@ if(ENABLE_LUAJIT AND NOT ANDROID)
endif()
if(ENABLE_QT_GUI)
include_directories(third_party/duckstation)
set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/window_info.cpp third_party/duckstation/gl/context.cpp)
if(APPLE)
@ -377,6 +378,8 @@ if(ENABLE_OPENGL)
src/host_shaders/opengl_fragment_shader.frag
)
set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/gl/stream_buffer.cpp)
set(HEADER_FILES ${HEADER_FILES} ${RENDERER_GL_INCLUDE_FILES})
source_group("Source Files\\Core\\OpenGL Renderer" FILES ${RENDERER_GL_SOURCE_FILES})

99
include/align.hpp Normal file
View file

@ -0,0 +1,99 @@
// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#pragma once
#include <cstdlib>
#include "helpers.hpp"
#ifdef _MSC_VER
#include <malloc.h>
#endif
namespace Common {
template <typename T>
constexpr bool isAligned(T value, unsigned int alignment) {
return (value % static_cast<T>(alignment)) == 0;
}
template <typename T>
constexpr T alignUp(T value, unsigned int alignment) {
return (value + static_cast<T>(alignment - 1)) / static_cast<T>(alignment) * static_cast<T>(alignment);
}
template <typename T>
constexpr T alignDown(T value, unsigned int alignment) {
return value / static_cast<T>(alignment) * static_cast<T>(alignment);
}
template <typename T>
constexpr bool isAlignedPow2(T value, unsigned int alignment) {
return (value & static_cast<T>(alignment - 1)) == 0;
}
template <typename T>
constexpr T alignUpPow2(T value, unsigned int alignment) {
return (value + static_cast<T>(alignment - 1)) & static_cast<T>(~static_cast<T>(alignment - 1));
}
template <typename T>
constexpr T alignDownPow2(T value, unsigned int alignment) {
return value & static_cast<T>(~static_cast<T>(alignment - 1));
}
template <typename T>
constexpr bool isPow2(T value) {
return (value & (value - 1)) == 0;
}
template <typename T>
constexpr T previousPow2(T value) {
if (value == static_cast<T>(0)) return 0;
value |= (value >> 1);
value |= (value >> 2);
value |= (value >> 4);
if constexpr (sizeof(T) >= 16) value |= (value >> 8);
if constexpr (sizeof(T) >= 32) value |= (value >> 16);
if constexpr (sizeof(T) >= 64) value |= (value >> 32);
return value - (value >> 1);
}
template <typename T>
constexpr T nextPow2(T value) {
// https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
if (value == static_cast<T>(0)) return 0;
value--;
value |= (value >> 1);
value |= (value >> 2);
value |= (value >> 4);
if constexpr (sizeof(T) >= 16) value |= (value >> 8);
if constexpr (sizeof(T) >= 32) value |= (value >> 16);
if constexpr (sizeof(T) >= 64) value |= (value >> 32);
value++;
return value;
}
ALWAYS_INLINE static void* alignedMalloc(size_t size, size_t alignment) {
#ifdef _MSC_VER
return _aligned_malloc(size, alignment);
#else
// Unaligned sizes are slow on macOS.
#ifdef __APPLE__
if (isPow2(alignment)) size = (size + alignment - 1) & ~(alignment - 1);
#endif
void* ret = nullptr;
return (posix_memalign(&ret, alignment, size) == 0) ? ret : nullptr;
#endif
}
ALWAYS_INLINE static void alignedFree(void* ptr) {
#ifdef _MSC_VER
_aligned_free(ptr);
#else
free(ptr);
#endif
}
} // namespace Common

View file

@ -3,6 +3,7 @@
#include <array>
#include <cstring>
#include <functional>
#include <memory>
#include <optional>
#include <span>
#include <unordered_map>
@ -10,11 +11,12 @@
#include "PICA/float_types.hpp"
#include "PICA/pica_frag_config.hpp"
#include "PICA/pica_vert_config.hpp"
#include "PICA/pica_hash.hpp"
#include "PICA/pica_vert_config.hpp"
#include "PICA/pica_vertex.hpp"
#include "PICA/regs.hpp"
#include "PICA/shader_gen.hpp"
#include "gl/stream_buffer.h"
#include "gl_state.hpp"
#include "helpers.hpp"
#include "logger.hpp"
@ -83,6 +85,10 @@ class RendererGL final : public Renderer {
// UBO for uploading the PICA uniforms when using hw shaders
GLuint hwShaderUniformUBO;
using StreamBuffer = OpenGLStreamBuffer;
std::unique_ptr<StreamBuffer> hwVertexBuffer;
std::unique_ptr<StreamBuffer> hwIndexBuffer;
// Cached recompiled fragment shader
struct CachedProgram {
OpenGL::Program program;

View file

@ -82,6 +82,8 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
// Align attribute address up to a 4 byte boundary
attributeOffset = (attributeOffset + 3) & -4;
attributeOffset += (index - 11) << 2;
attr.data = nullptr;
continue;
}

View file

@ -78,6 +78,14 @@ void RendererGL::initGraphicsContextInternal() {
gl.useProgram(displayProgram);
glUniform1i(OpenGL::uniformLocation(displayProgram, "u_texture"), 0); // Init sampler object
// Create stream buffers for vertex, index and uniform buffers
// TODO: Remove buffers from GL state tracking as the StreamBuffer implementation bypasses the state tracker.
static constexpr usize hwIndexBufferSize = 2_MB;
static constexpr usize hwVertexBufferSize = 16_MB;
hwIndexBuffer = StreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, hwIndexBufferSize);
hwVertexBuffer = StreamBuffer::Create(GL_ARRAY_BUFFER, hwVertexBufferSize);
// Allocate memory for the shadergen fragment uniform UBO
glGenBuffers(1, &shadergenFragmentUBO);
gl.bindUBO(shadergenFragmentUBO);

View file

@ -0,0 +1,288 @@
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#include "gl/stream_buffer.h"
#include <array>
#include <cstdio>
#include "align.hpp"
OpenGLStreamBuffer::OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : m_target(target), m_buffer_id(buffer_id), m_size(size) {}
OpenGLStreamBuffer::~OpenGLStreamBuffer() { glDeleteBuffers(1, &m_buffer_id); }
void OpenGLStreamBuffer::Bind() { glBindBuffer(m_target, m_buffer_id); }
void OpenGLStreamBuffer::Unbind() { glBindBuffer(m_target, 0); }
void OpenGLStreamBuffer::SetDebugName(std::string_view name) {
#ifdef GPU_DEBUG_INFO
if (glObjectLabel) {
glObjectLabel(GL_BUFFER, GetGLBufferId(), static_cast<GLsizei>(name.length()), static_cast<const GLchar*>(name.data()));
}
#endif
}
namespace {
// Uses glBufferSubData() to update. Preferred for drivers which don't support {ARB,EXT}_buffer_storage.
class BufferSubDataStreamBuffer final : public OpenGLStreamBuffer {
public:
~BufferSubDataStreamBuffer() override { Common::alignedFree(m_cpu_buffer); }
MappingResult Map(u32 alignment, u32 min_size) override { return MappingResult{static_cast<void*>(m_cpu_buffer), 0, 0, m_size / alignment}; }
u32 Unmap(u32 used_size) override {
if (used_size == 0) return 0;
glBindBuffer(m_target, m_buffer_id);
glBufferSubData(m_target, 0, used_size, m_cpu_buffer);
return 0;
}
u32 GetChunkSize() const override { return m_size; }
static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size) {
glGetError();
GLuint buffer_id;
glGenBuffers(1, &buffer_id);
glBindBuffer(target, buffer_id);
glBufferData(target, size, nullptr, GL_STREAM_DRAW);
GLenum err = glGetError();
if (err != GL_NO_ERROR) {
glBindBuffer(target, 0);
glDeleteBuffers(1, &buffer_id);
return {};
}
return std::unique_ptr<OpenGLStreamBuffer>(new BufferSubDataStreamBuffer(target, buffer_id, size));
}
private:
BufferSubDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : OpenGLStreamBuffer(target, buffer_id, size) {
m_cpu_buffer = static_cast<u8*>(Common::alignedMalloc(size, 32));
if (!m_cpu_buffer) Panic("Failed to allocate CPU storage for GL buffer");
}
u8* m_cpu_buffer;
};
// Uses BufferData() to orphan the buffer after every update. Used on Mali where BufferSubData forces a sync.
class BufferDataStreamBuffer final : public OpenGLStreamBuffer {
public:
~BufferDataStreamBuffer() override { Common::alignedFree(m_cpu_buffer); }
MappingResult Map(u32 alignment, u32 min_size) override { return MappingResult{static_cast<void*>(m_cpu_buffer), 0, 0, m_size / alignment}; }
u32 Unmap(u32 used_size) override {
if (used_size == 0) return 0;
glBindBuffer(m_target, m_buffer_id);
glBufferData(m_target, used_size, m_cpu_buffer, GL_STREAM_DRAW);
return 0;
}
u32 GetChunkSize() const override { return m_size; }
static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size) {
glGetError();
GLuint buffer_id;
glGenBuffers(1, &buffer_id);
glBindBuffer(target, buffer_id);
glBufferData(target, size, nullptr, GL_STREAM_DRAW);
GLenum err = glGetError();
if (err != GL_NO_ERROR) {
glBindBuffer(target, 0);
glDeleteBuffers(1, &buffer_id);
return {};
}
return std::unique_ptr<OpenGLStreamBuffer>(new BufferDataStreamBuffer(target, buffer_id, size));
}
private:
BufferDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : OpenGLStreamBuffer(target, buffer_id, size) {
m_cpu_buffer = static_cast<u8*>(Common::alignedMalloc(size, 32));
if (!m_cpu_buffer) Panic("Failed to allocate CPU storage for GL buffer");
}
u8* m_cpu_buffer;
};
// Base class for implementations which require syncing.
class SyncingStreamBuffer : public OpenGLStreamBuffer {
public:
enum : u32 { NUM_SYNC_POINTS = 16 };
virtual ~SyncingStreamBuffer() override {
for (u32 i = m_available_block_index; i <= m_used_block_index; i++) {
glDeleteSync(m_sync_objects[i]);
}
}
protected:
SyncingStreamBuffer(GLenum target, GLuint buffer_id, u32 size)
: OpenGLStreamBuffer(target, buffer_id, size), m_bytes_per_block((size + (NUM_SYNC_POINTS)-1) / NUM_SYNC_POINTS) {}
ALWAYS_INLINE u32 GetSyncIndexForOffset(u32 offset) { return offset / m_bytes_per_block; }
ALWAYS_INLINE void AddSyncsForOffset(u32 offset) {
const u32 end = GetSyncIndexForOffset(offset);
for (; m_used_block_index < end; m_used_block_index++) {
if (m_sync_objects[m_used_block_index]) {
Helpers::panic("GL stream buffer: Fence slot we're trying to insert is already in use");
}
m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
}
}
ALWAYS_INLINE void WaitForSync(GLsync& sync) {
glClientWaitSync(sync, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
glDeleteSync(sync);
sync = nullptr;
}
ALWAYS_INLINE void EnsureSyncsWaitedForOffset(u32 offset) {
const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);
for (; m_available_block_index < end; m_available_block_index++) {
if (!m_sync_objects[m_used_block_index]) [[unlikely]] {
Helpers::panic("GL stream buffer: Fence slot we're trying to wait on in not in use");
}
WaitForSync(m_sync_objects[m_available_block_index]);
}
}
void AllocateSpace(u32 size) {
// add sync objects for writes since the last allocation
AddSyncsForOffset(m_position);
// wait for sync objects for the space we want to use
EnsureSyncsWaitedForOffset(m_position + size);
// wrap-around?
if ((m_position + size) > m_size) {
// current position ... buffer end
AddSyncsForOffset(m_size);
// rewind, and try again
m_position = 0;
// wait for the sync at the start of the buffer
WaitForSync(m_sync_objects[0]);
m_available_block_index = 1;
// and however much more we need to satisfy the allocation
EnsureSyncsWaitedForOffset(size);
m_used_block_index = 0;
}
}
u32 GetChunkSize() const override { return m_size / NUM_SYNC_POINTS; }
u32 m_position = 0;
u32 m_used_block_index = 0;
u32 m_available_block_index = NUM_SYNC_POINTS;
u32 m_bytes_per_block;
std::array<GLsync, NUM_SYNC_POINTS> m_sync_objects{};
};
class BufferStorageStreamBuffer : public SyncingStreamBuffer {
public:
~BufferStorageStreamBuffer() override {
glBindBuffer(m_target, m_buffer_id);
glUnmapBuffer(m_target);
glBindBuffer(m_target, 0);
}
MappingResult Map(u32 alignment, u32 min_size) override {
if (m_position > 0) m_position = Common::alignUp(m_position, alignment);
AllocateSpace(min_size);
if ((m_position + min_size) > (m_available_block_index * m_bytes_per_block)) [[unlikely]] {
Helpers::panic("GL stream buffer: Invalid size passed to Unmap");
}
const u32 free_space_in_block = ((m_available_block_index * m_bytes_per_block) - m_position);
return MappingResult{static_cast<void*>(m_mapped_ptr + m_position), m_position, m_position / alignment, free_space_in_block / alignment};
}
u32 Unmap(u32 used_size) override {
if ((m_position + used_size) > m_size) [[unlikely]] {
Helpers::panic("GL stream buffer: Invalid size passed to Unmap");
}
if (!m_coherent) {
if (GLAD_GL_VERSION_4_5 || GLAD_GL_ARB_direct_state_access) {
glFlushMappedNamedBufferRange(m_buffer_id, m_position, used_size);
} else {
Bind();
glFlushMappedBufferRange(m_target, m_position, used_size);
}
}
const u32 prev_position = m_position;
m_position += used_size;
return prev_position;
}
static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size, bool coherent = true) {
glGetError();
GLuint buffer_id;
glGenBuffers(1, &buffer_id);
glBindBuffer(target, buffer_id);
const u32 flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
const u32 map_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT);
if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage)
glBufferStorage(target, size, nullptr, flags);
else if (GLAD_GL_EXT_buffer_storage)
glBufferStorageEXT(target, size, nullptr, flags);
GLenum err = glGetError();
if (err != GL_NO_ERROR) {
glBindBuffer(target, 0);
glDeleteBuffers(1, &buffer_id);
return {};
}
u8* mapped_ptr = static_cast<u8*>(glMapBufferRange(target, 0, size, map_flags));
AssertMsg(mapped_ptr, "Persistent buffer was mapped");
return std::unique_ptr<OpenGLStreamBuffer>(new BufferStorageStreamBuffer(target, buffer_id, size, mapped_ptr, coherent));
}
private:
BufferStorageStreamBuffer(GLenum target, GLuint buffer_id, u32 size, u8* mapped_ptr, bool coherent)
: SyncingStreamBuffer(target, buffer_id, size), m_mapped_ptr(mapped_ptr), m_coherent(coherent) {}
u8* m_mapped_ptr;
bool m_coherent;
};
} // namespace
std::unique_ptr<OpenGLStreamBuffer> OpenGLStreamBuffer::Create(GLenum target, u32 size) {
std::unique_ptr<OpenGLStreamBuffer> buf;
if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage) {
buf = BufferStorageStreamBuffer::Create(target, size);
if (buf) return buf;
}
// BufferSubData is slower on all drivers except NVIDIA...
#if 0
const char* vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
if (std::strcmp(vendor, "ARM") == 0 || std::strcmp(vendor, "Qualcomm") == 0) {
// Mali and Adreno drivers can't do sub-buffer tracking...
return BufferDataStreamBuffer::Create(target, size);
}
return BufferSubDataStreamBuffer::Create(target, size);
#else
return BufferDataStreamBuffer::Create(target, size);
#endif
}

View file

@ -0,0 +1,53 @@
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#pragma once
#include <glad/gl.h>
// Comment to avoid clang-format reordering the glad header
#include <memory>
#include <string_view>
#include <tuple>
#include <vector>
#include "duckstation_compat.h"
#include "helpers.hpp"
class OpenGLStreamBuffer {
public:
virtual ~OpenGLStreamBuffer();
ALWAYS_INLINE GLuint GetGLBufferId() const { return m_buffer_id; }
ALWAYS_INLINE GLenum GetGLTarget() const { return m_target; }
ALWAYS_INLINE u32 GetSize() const { return m_size; }
void Bind();
void Unbind();
void SetDebugName(std::string_view name);
struct MappingResult {
void* pointer;
u32 buffer_offset;
u32 index_aligned; // offset / alignment, suitable for base vertex
u32 space_aligned; // remaining space / alignment
};
virtual MappingResult Map(u32 alignment, u32 min_size) = 0;
/// Returns the position in the buffer *before* the start of used_size.
virtual u32 Unmap(u32 used_size) = 0;
/// Returns the minimum granularity of blocks which sync objects will be created around.
virtual u32 GetChunkSize() const = 0;
static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size);
protected:
OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size);
GLenum m_target;
GLuint m_buffer_id;
u32 m_size;
};