Add GL Stream Buffer from Duckstation

2025-07-08 16:18:41 +12:00 · 2024-08-25 01:47:02 +03:00 · 2024-08-25 01:47:02 +03:00 · e34bdb6841
commit e34bdb6841
parent a8b30ee2dc
7 changed files with 461 additions and 2 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -138,6 +138,7 @@ include_directories(${SDL2_INCLUDE_DIR})
 include_directories(third_party/toml11)
 include_directories(third_party/glm)
 include_directories(third_party/renderdoc)
+include_directories(third_party/duckstation)

 add_subdirectory(third_party/cmrc)

@ -302,6 +303,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
                 include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp
                 include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
                 include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp include/renderdoc.hpp
+                 include/align.hpp
 )

 cmrc_add_resource_library(
@ -334,7 +336,6 @@ if(ENABLE_LUAJIT AND NOT ANDROID)
 endif()

 if(ENABLE_QT_GUI)
-    include_directories(third_party/duckstation)
    set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/window_info.cpp third_party/duckstation/gl/context.cpp)

    if(APPLE)
@ -377,6 +378,8 @@ if(ENABLE_OPENGL)
        src/host_shaders/opengl_fragment_shader.frag
    )

+    set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/gl/stream_buffer.cpp)
+
    set(HEADER_FILES ${HEADER_FILES} ${RENDERER_GL_INCLUDE_FILES})
    source_group("Source Files\\Core\\OpenGL Renderer" FILES ${RENDERER_GL_SOURCE_FILES})

--- a/include/align.hpp
+++ b/include/align.hpp
@ -0,0 +1,99 @@
+// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#pragma once
+
+#include <cstdlib>
+
+#include "helpers.hpp"
+
+#ifdef _MSC_VER
+#include <malloc.h>
+#endif
+
+namespace Common {
+	// Alignment and power-of-two helpers, plus thin wrappers around the platform's aligned allocator.
+	// The *Pow2 variants work with bit masks, so they only give meaningful results when `alignment` is a power of two.
+
+	// Returns true when `value` is an exact multiple of `alignment`. `alignment` is used as a divisor and must be non-zero.
+	template <typename T>
+	constexpr bool isAligned(T value, unsigned int alignment) {
+		return (value % static_cast<T>(alignment)) == 0;
+	}
+
+	// Rounds `value` up to the next multiple of `alignment` (values that are already multiples are returned unchanged).
+	template <typename T>
+	constexpr T alignUp(T value, unsigned int alignment) {
+		return (value + static_cast<T>(alignment - 1)) / static_cast<T>(alignment) * static_cast<T>(alignment);
+	}
+
+	// Rounds `value` down to the previous multiple of `alignment`.
+	template <typename T>
+	constexpr T alignDown(T value, unsigned int alignment) {
+		return value / static_cast<T>(alignment) * static_cast<T>(alignment);
+	}
+
+	// Mask-based version of isAligned: tests that the bits below `alignment` are all clear.
+	template <typename T>
+	constexpr bool isAlignedPow2(T value, unsigned int alignment) {
+		return (value & static_cast<T>(alignment - 1)) == 0;
+	}
+
+	// Mask-based version of alignUp.
+	template <typename T>
+	constexpr T alignUpPow2(T value, unsigned int alignment) {
+		return (value + static_cast<T>(alignment - 1)) & static_cast<T>(~static_cast<T>(alignment - 1));
+	}
+
+	// Mask-based version of alignDown.
+	template <typename T>
+	constexpr T alignDownPow2(T value, unsigned int alignment) {
+		return value & static_cast<T>(~static_cast<T>(alignment - 1));
+	}
+
+	// Returns true when at most one bit of `value` is set. Note that this also returns true for 0.
+	template <typename T>
+	constexpr bool isPow2(T value) {
+		return (value & (value - 1)) == 0;
+	}
+
+	// Returns the largest power of two that is <= `value`, or 0 when `value` is 0.
+	// NOTE(review): presumably intended for unsigned integer types only.
+	template <typename T>
+	constexpr T previousPow2(T value) {
+		if (value == static_cast<T>(0)) return 0;
+
+		// Smear the highest set bit into every lower bit position.
+		value |= (value >> 1);
+		value |= (value >> 2);
+		value |= (value >> 4);
+		// sizeof() is measured in bytes: the wider shifts are needed for types of 2, 4 and 8 bytes respectively.
+		if constexpr (sizeof(T) >= 2) value |= (value >> 8);
+		if constexpr (sizeof(T) >= 4) value |= (value >> 16);
+		if constexpr (sizeof(T) >= 8) value |= (value >> 32);
+
+		// Drop everything below the highest bit.
+		return value - (value >> 1);
+	}
+
+	// Returns the smallest power of two that is >= `value`, or 0 when `value` is 0.
+	// NOTE(review): presumably intended for unsigned integer types only.
+	template <typename T>
+	constexpr T nextPow2(T value) {
+		// https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+		if (value == static_cast<T>(0)) return 0;
+
+		// Decrement first so that exact powers of two map to themselves.
+		value--;
+		value |= (value >> 1);
+		value |= (value >> 2);
+		value |= (value >> 4);
+		// sizeof() is measured in bytes: the wider shifts are needed for types of 2, 4 and 8 bytes respectively.
+		if constexpr (sizeof(T) >= 2) value |= (value >> 8);
+		if constexpr (sizeof(T) >= 4) value |= (value >> 16);
+		if constexpr (sizeof(T) >= 8) value |= (value >> 32);
+		value++;
+		return value;
+	}
+
+	// Allocates `size` bytes aligned to `alignment` using _aligned_malloc (MSVC) or posix_memalign (elsewhere).
+	// Returns nullptr when posix_memalign reports an error. Release the memory with alignedFree.
+	ALWAYS_INLINE static void* alignedMalloc(size_t size, size_t alignment) {
+#ifdef _MSC_VER
+		return _aligned_malloc(size, alignment);
+#else
+		// Unaligned sizes are slow on macOS.
+#ifdef __APPLE__
+		if (isPow2(alignment)) size = (size + alignment - 1) & ~(alignment - 1);
+#endif
+		void* ret = nullptr;
+		return (posix_memalign(&ret, alignment, size) == 0) ? ret : nullptr;
+#endif
+	}
+
+	// Releases memory obtained from alignedMalloc, using the deallocator that matches the platform's allocator.
+	ALWAYS_INLINE static void alignedFree(void* ptr) {
+#ifdef _MSC_VER
+		_aligned_free(ptr);
+#else
+		free(ptr);
+#endif
+	}
+}  // namespace Common
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@ -3,6 +3,7 @@
 #include <array>
 #include <cstring>
 #include <functional>
+#include <memory>
 #include <optional>
 #include <span>
 #include <unordered_map>
@ -10,11 +11,12 @@

 #include "PICA/float_types.hpp"
 #include "PICA/pica_frag_config.hpp"
-#include "PICA/pica_vert_config.hpp"
 #include "PICA/pica_hash.hpp"
+#include "PICA/pica_vert_config.hpp"
 #include "PICA/pica_vertex.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen.hpp"
+#include "gl/stream_buffer.h"
 #include "gl_state.hpp"
 #include "helpers.hpp"
 #include "logger.hpp"
@ -83,6 +85,10 @@ class RendererGL final : public Renderer {
 	// UBO for uploading the PICA uniforms when using hw shaders
 	GLuint hwShaderUniformUBO;

+	using StreamBuffer = OpenGLStreamBuffer;
+	std::unique_ptr<StreamBuffer> hwVertexBuffer;
+	std::unique_ptr<StreamBuffer> hwIndexBuffer;
+
 	// Cached recompiled fragment shader
 	struct CachedProgram {
 		OpenGL::Program program;
--- a/src/core/PICA/draw_acceleration.cpp
+++ b/src/core/PICA/draw_acceleration.cpp
@ -82,6 +82,8 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 					// Align attribute address up to a 4 byte boundary
 					attributeOffset = (attributeOffset + 3) & -4;
 					attributeOffset += (index - 11) << 2;
+
+					attr.data = nullptr;
 					continue;
 				}

--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@ -78,6 +78,14 @@ void RendererGL::initGraphicsContextInternal() {
 	gl.useProgram(displayProgram);
 	glUniform1i(OpenGL::uniformLocation(displayProgram, "u_texture"), 0);  // Init sampler object

+	// Create stream buffers for hardware vertex and index data
+	// TODO: Remove buffers from GL state tracking as the StreamBuffer implementation bypasses the state tracker.
+	static constexpr usize hwIndexBufferSize = 2_MB;
+	static constexpr usize hwVertexBufferSize = 16_MB;
+
+	hwIndexBuffer = StreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, hwIndexBufferSize);
+	hwVertexBuffer = StreamBuffer::Create(GL_ARRAY_BUFFER, hwVertexBufferSize);
+
 	// Allocate memory for the shadergen fragment uniform UBO
 	glGenBuffers(1, &shadergenFragmentUBO);
 	gl.bindUBO(shadergenFragmentUBO);
--- a/third_party/duckstation/gl/stream_buffer.cpp
+++ b/third_party/duckstation/gl/stream_buffer.cpp
@ -0,0 +1,288 @@
+// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#include "gl/stream_buffer.h"
+
+#include <array>
+#include <cstdio>
+
+#include "align.hpp"
+
+// Records the GL target, buffer name and size of an already-created buffer object.
+OpenGLStreamBuffer::OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size)
+	: m_target(target), m_buffer_id(buffer_id), m_size(size) {}
+
+// Deletes the GL buffer object held by this stream buffer.
+OpenGLStreamBuffer::~OpenGLStreamBuffer() {
+	glDeleteBuffers(1, &m_buffer_id);
+}
+
+// Binds the buffer to the target it was created for.
+void OpenGLStreamBuffer::Bind() {
+	glBindBuffer(m_target, m_buffer_id);
+}
+
+// Binds buffer name 0 to the target this buffer was created for.
+void OpenGLStreamBuffer::Unbind() {
+	glBindBuffer(m_target, 0);
+}
+
+// Attaches a debug label to the GL buffer. Does nothing unless GPU_DEBUG_INFO is defined and glObjectLabel is loaded.
+void OpenGLStreamBuffer::SetDebugName(std::string_view name) {
+#ifdef GPU_DEBUG_INFO
+	if (!glObjectLabel) return;
+
+	// The label is passed with an explicit length, so `name` does not need to be null-terminated.
+	const GLsizei label_length = static_cast<GLsizei>(name.length());
+	const GLchar* label = static_cast<const GLchar*>(name.data());
+	glObjectLabel(GL_BUFFER, GetGLBufferId(), label_length, label);
+#endif
+}
+
+namespace {
+	// Uses glBufferSubData() to update. Preferred for drivers which don't support {ARB,EXT}_buffer_storage.
+	// Data is staged in a CPU-side buffer of m_size bytes and uploaded to offset 0 of the GL buffer on Unmap().
+	class BufferSubDataStreamBuffer final : public OpenGLStreamBuffer {
+	  public:
+		~BufferSubDataStreamBuffer() override { Common::alignedFree(m_cpu_buffer); }
+
+		// Hands out the whole staging buffer; the reported buffer offset and base index are always 0.
+		// Panics when the caller asks for more space than the staging buffer holds, since writing that much would overrun it.
+		MappingResult Map(u32 alignment, u32 min_size) override {
+			if (min_size > m_size) [[unlikely]] {
+				Panic("GL stream buffer: Requested mapping is larger than the buffer");
+			}
+
+			return MappingResult{static_cast<void*>(m_cpu_buffer), 0, 0, m_size / alignment};
+		}
+
+		// Uploads the first `used_size` bytes of the staging buffer to the start of the GL buffer.
+		// Always returns 0, as data is always written at the start of the buffer.
+		u32 Unmap(u32 used_size) override {
+			if (used_size == 0) return 0;
+
+			// The staging buffer is only m_size bytes long; uploading more would read past its end.
+			if (used_size > m_size) [[unlikely]] {
+				Panic("GL stream buffer: Invalid size passed to Unmap");
+			}
+
+			glBindBuffer(m_target, m_buffer_id);
+			glBufferSubData(m_target, 0, used_size, m_cpu_buffer);
+			return 0;
+		}
+
+		u32 GetChunkSize() const override { return m_size; }
+
+		// Creates the GL buffer with `size` bytes of GL_STREAM_DRAW storage.
+		// Returns an empty pointer if GL reports an error while doing so.
+		static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size) {
+			// Clear any stale error so the check below only reflects the calls made here.
+			glGetError();
+
+			GLuint buffer_id;
+			glGenBuffers(1, &buffer_id);
+			glBindBuffer(target, buffer_id);
+			glBufferData(target, size, nullptr, GL_STREAM_DRAW);
+
+			GLenum err = glGetError();
+			if (err != GL_NO_ERROR) {
+				glBindBuffer(target, 0);
+				glDeleteBuffers(1, &buffer_id);
+				return {};
+			}
+
+			return std::unique_ptr<OpenGLStreamBuffer>(new BufferSubDataStreamBuffer(target, buffer_id, size));
+		}
+
+	  private:
+		// Allocates the CPU-side staging buffer (32-byte aligned); panics if the allocation fails.
+		BufferSubDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : OpenGLStreamBuffer(target, buffer_id, size) {
+			m_cpu_buffer = static_cast<u8*>(Common::alignedMalloc(size, 32));
+			if (!m_cpu_buffer) Panic("Failed to allocate CPU storage for GL buffer");
+		}
+
+		u8* m_cpu_buffer;
+	};
+
+	// Uses BufferData() to orphan the buffer after every update. Used on Mali where BufferSubData forces a sync.
+	// Data is staged in a CPU-side buffer of m_size bytes and handed to glBufferData() on Unmap().
+	class BufferDataStreamBuffer final : public OpenGLStreamBuffer {
+	  public:
+		~BufferDataStreamBuffer() override { Common::alignedFree(m_cpu_buffer); }
+
+		// Hands out the whole staging buffer; the reported buffer offset and base index are always 0.
+		// Panics when the caller asks for more space than the staging buffer holds, since writing that much would overrun it.
+		MappingResult Map(u32 alignment, u32 min_size) override {
+			if (min_size > m_size) [[unlikely]] {
+				Panic("GL stream buffer: Requested mapping is larger than the buffer");
+			}
+
+			return MappingResult{static_cast<void*>(m_cpu_buffer), 0, 0, m_size / alignment};
+		}
+
+		// Re-specifies the GL buffer with the first `used_size` bytes of the staging buffer.
+		// Always returns 0, as data is always written at the start of the buffer.
+		u32 Unmap(u32 used_size) override {
+			if (used_size == 0) return 0;
+
+			// The staging buffer is only m_size bytes long; uploading more would read past its end.
+			if (used_size > m_size) [[unlikely]] {
+				Panic("GL stream buffer: Invalid size passed to Unmap");
+			}
+
+			glBindBuffer(m_target, m_buffer_id);
+			glBufferData(m_target, used_size, m_cpu_buffer, GL_STREAM_DRAW);
+			return 0;
+		}
+
+		u32 GetChunkSize() const override { return m_size; }
+
+		// Creates the GL buffer with `size` bytes of GL_STREAM_DRAW storage.
+		// Returns an empty pointer if GL reports an error while doing so.
+		static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size) {
+			// Clear any stale error so the check below only reflects the calls made here.
+			glGetError();
+
+			GLuint buffer_id;
+			glGenBuffers(1, &buffer_id);
+			glBindBuffer(target, buffer_id);
+			glBufferData(target, size, nullptr, GL_STREAM_DRAW);
+
+			GLenum err = glGetError();
+			if (err != GL_NO_ERROR) {
+				glBindBuffer(target, 0);
+				glDeleteBuffers(1, &buffer_id);
+				return {};
+			}
+
+			return std::unique_ptr<OpenGLStreamBuffer>(new BufferDataStreamBuffer(target, buffer_id, size));
+		}
+
+	  private:
+		// Allocates the CPU-side staging buffer (32-byte aligned); panics if the allocation fails.
+		BufferDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : OpenGLStreamBuffer(target, buffer_id, size) {
+			m_cpu_buffer = static_cast<u8*>(Common::alignedMalloc(size, 32));
+			if (!m_cpu_buffer) Panic("Failed to allocate CPU storage for GL buffer");
+		}
+
+		u8* m_cpu_buffer;
+	};
+
+	// Base class for implementations which require syncing.
+	// The buffer is divided into NUM_SYNC_POINTS blocks of m_bytes_per_block bytes. A fence is inserted for each block once
+	// the write position has moved past it, and the fence is waited on (and deleted) before the block is handed out again.
+	// A slot in m_sync_objects is non-null exactly while its fence is outstanding.
+	class SyncingStreamBuffer : public OpenGLStreamBuffer {
+	  public:
+		enum : u32 { NUM_SYNC_POINTS = 16 };
+
+		virtual ~SyncingStreamBuffer() override {
+			// Delete every fence that is still outstanding. Outstanding fences are not a contiguous [available, used] range
+			// (and m_used_block_index may equal NUM_SYNC_POINTS), so walk the whole array and skip the empty slots.
+			for (GLsync& sync : m_sync_objects) {
+				if (sync) {
+					glDeleteSync(sync);
+					sync = nullptr;
+				}
+			}
+		}
+
+	  protected:
+		// The block size is rounded up so that NUM_SYNC_POINTS blocks always cover the whole buffer.
+		SyncingStreamBuffer(GLenum target, GLuint buffer_id, u32 size)
+			: OpenGLStreamBuffer(target, buffer_id, size), m_bytes_per_block((size + (NUM_SYNC_POINTS)-1) / NUM_SYNC_POINTS) {}
+
+		// Index of the block which contains the given byte offset.
+		ALWAYS_INLINE u32 GetSyncIndexForOffset(u32 offset) { return offset / m_bytes_per_block; }
+
+		// Inserts a fence for every block that lies entirely below `offset` and does not have one yet.
+		ALWAYS_INLINE void AddSyncsForOffset(u32 offset) {
+			const u32 end = GetSyncIndexForOffset(offset);
+			for (; m_used_block_index < end; m_used_block_index++) {
+				if (m_sync_objects[m_used_block_index]) {
+					Helpers::panic("GL stream buffer: Fence slot we're trying to insert is already in use");
+				}
+
+				m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+			}
+		}
+
+		// Blocks until the fence is signalled, then deletes it and clears the slot.
+		ALWAYS_INLINE void WaitForSync(GLsync& sync) {
+			glClientWaitSync(sync, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
+			glDeleteSync(sync);
+			sync = nullptr;
+		}
+
+		// Waits on the fences of all not-yet-available blocks up to and including the one containing `offset`.
+		ALWAYS_INLINE void EnsureSyncsWaitedForOffset(u32 offset) {
+			const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);
+			for (; m_available_block_index < end; m_available_block_index++) {
+				// Validate the slot we are about to wait on. m_used_block_index must not be used as the index here:
+				// it refers to a different slot and can be equal to NUM_SYNC_POINTS, which is out of bounds.
+				if (!m_sync_objects[m_available_block_index]) [[unlikely]] {
+					Helpers::panic("GL stream buffer: Fence slot we're trying to wait on in not in use");
+				}
+
+				WaitForSync(m_sync_objects[m_available_block_index]);
+			}
+		}
+
+		// Makes `size` bytes writable starting at m_position, rewinding to the start of the buffer if they do not fit.
+		void AllocateSpace(u32 size) {
+			// add sync objects for writes since the last allocation
+			AddSyncsForOffset(m_position);
+
+			// wait for sync objects for the space we want to use
+			EnsureSyncsWaitedForOffset(m_position + size);
+
+			// wrap-around?
+			if ((m_position + size) > m_size) {
+				// current position ... buffer end
+				AddSyncsForOffset(m_size);
+
+				// rewind, and try again
+				m_position = 0;
+
+				// wait for the sync at the start of the buffer
+				WaitForSync(m_sync_objects[0]);
+				m_available_block_index = 1;
+
+				// and however much more we need to satisfy the allocation
+				EnsureSyncsWaitedForOffset(size);
+				m_used_block_index = 0;
+			}
+		}
+
+		u32 GetChunkSize() const override { return m_size / NUM_SYNC_POINTS; }
+
+		// Current write position in bytes.
+		u32 m_position = 0;
+		// Index of the next block that will receive a fence.
+		u32 m_used_block_index = 0;
+		// Blocks below this index have no outstanding fence and may be written. All blocks start out available.
+		u32 m_available_block_index = NUM_SYNC_POINTS;
+		u32 m_bytes_per_block;
+		std::array<GLsync, NUM_SYNC_POINTS> m_sync_objects{};
+	};
+
+	// Stream buffer backed by immutable buffer storage that stays persistently mapped for writing.
+	// Writes go straight into the mapping; SyncingStreamBuffer's fences keep the CPU from overwriting data still in use.
+	class BufferStorageStreamBuffer : public SyncingStreamBuffer {
+	  public:
+		// Unmaps the persistent mapping. The GL buffer itself is deleted by the OpenGLStreamBuffer destructor.
+		~BufferStorageStreamBuffer() override {
+			glBindBuffer(m_target, m_buffer_id);
+			glUnmapBuffer(m_target);
+			glBindBuffer(m_target, 0);
+		}
+
+		// Aligns the write position, makes at least `min_size` bytes writable and returns a pointer to them.
+		// space_aligned reports the space left before the next block that has not been waited on yet.
+		MappingResult Map(u32 alignment, u32 min_size) override {
+			if (m_position > 0) m_position = Common::alignUp(m_position, alignment);
+
+			AllocateSpace(min_size);
+			if ((m_position + min_size) > (m_available_block_index * m_bytes_per_block)) [[unlikely]] {
+				Helpers::panic("GL stream buffer: Invalid size passed to Map");
+			}
+
+			const u32 free_space_in_block = ((m_available_block_index * m_bytes_per_block) - m_position);
+			return MappingResult{static_cast<void*>(m_mapped_ptr + m_position), m_position, m_position / alignment, free_space_in_block / alignment};
+		}
+
+		// Commits `used_size` bytes written since Map() and advances the write position.
+		// Returns the position the written data starts at.
+		u32 Unmap(u32 used_size) override {
+			if ((m_position + used_size) > m_size) [[unlikely]] {
+				Helpers::panic("GL stream buffer: Invalid size passed to Unmap");
+			}
+
+			// Non-coherent mappings are created with GL_MAP_FLUSH_EXPLICIT_BIT, so the written range has to be flushed by hand.
+			if (!m_coherent) {
+				if (GLAD_GL_VERSION_4_5 || GLAD_GL_ARB_direct_state_access) {
+					glFlushMappedNamedBufferRange(m_buffer_id, m_position, used_size);
+				} else {
+					Bind();
+					glFlushMappedBufferRange(m_target, m_position, used_size);
+				}
+			}
+
+			const u32 prev_position = m_position;
+			m_position += used_size;
+			return prev_position;
+		}
+
+		// Creates the buffer storage and maps it persistently.
+		// Returns an empty pointer if either step fails, so the caller can fall back to another implementation.
+		static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size, bool coherent = true) {
+			// Clear any stale error so the check below only reflects the calls made here.
+			glGetError();
+
+			GLuint buffer_id;
+			glGenBuffers(1, &buffer_id);
+			glBindBuffer(target, buffer_id);
+
+			const u32 flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
+			const u32 map_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT);
+			if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage)
+				glBufferStorage(target, size, nullptr, flags);
+			else if (GLAD_GL_EXT_buffer_storage)
+				glBufferStorageEXT(target, size, nullptr, flags);
+
+			GLenum err = glGetError();
+			if (err != GL_NO_ERROR) {
+				glBindBuffer(target, 0);
+				glDeleteBuffers(1, &buffer_id);
+				return {};
+			}
+
+			u8* mapped_ptr = static_cast<u8*>(glMapBufferRange(target, 0, size, map_flags));
+			if (!mapped_ptr) {
+				// Without a mapping the buffer is unusable: release it and report failure like the storage error path above.
+				glBindBuffer(target, 0);
+				glDeleteBuffers(1, &buffer_id);
+				return {};
+			}
+
+			return std::unique_ptr<OpenGLStreamBuffer>(new BufferStorageStreamBuffer(target, buffer_id, size, mapped_ptr, coherent));
+		}
+
+	  private:
+		BufferStorageStreamBuffer(GLenum target, GLuint buffer_id, u32 size, u8* mapped_ptr, bool coherent)
+			: SyncingStreamBuffer(target, buffer_id, size), m_mapped_ptr(mapped_ptr), m_coherent(coherent) {}
+
+		// Start of the persistent mapping of the whole buffer.
+		u8* m_mapped_ptr;
+		// Whether the mapping was created with GL_MAP_COHERENT_BIT (no explicit flush needed on Unmap).
+		bool m_coherent;
+	};
+
+}  // namespace
+
+// Picks a stream buffer implementation: persistent buffer storage when the context supports it and creation succeeds,
+// otherwise the glBufferData()-based implementation.
+std::unique_ptr<OpenGLStreamBuffer> OpenGLStreamBuffer::Create(GLenum target, u32 size) {
+	const bool has_buffer_storage = GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage;
+	if (has_buffer_storage) {
+		// A failed creation is not fatal, we simply fall through to the fallback below.
+		if (auto persistent = BufferStorageStreamBuffer::Create(target, size)) {
+			return persistent;
+		}
+	}
+
+	// BufferSubData is slower on all drivers except NVIDIA...
+#if 0
+	const char* vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
+	if (std::strcmp(vendor, "ARM") == 0 || std::strcmp(vendor, "Qualcomm") == 0) {
+		// Mali and Adreno drivers can't do sub-buffer tracking...
+		return BufferDataStreamBuffer::Create(target, size);
+	}
+
+	return BufferSubDataStreamBuffer::Create(target, size);
+#else
+	return BufferDataStreamBuffer::Create(target, size);
+#endif
+}
--- a/third_party/duckstation/gl/stream_buffer.h
+++ b/third_party/duckstation/gl/stream_buffer.h
@ -0,0 +1,53 @@
+// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#pragma once
+
+#include <glad/gl.h>
+// Comment to avoid clang-format reordering the glad header
+
+#include <memory>
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+#include "duckstation_compat.h"
+#include "helpers.hpp"
+
+// Abstract interface for a GL buffer object that is filled through Map()/Unmap() pairs.
+// Instances are obtained from Create(); the destructor deletes the underlying GL buffer.
+class OpenGLStreamBuffer {
+  public:
+	virtual ~OpenGLStreamBuffer();
+
+	// The object owns m_buffer_id (deleted in the destructor), so a copy would delete the same GL buffer twice.
+	OpenGLStreamBuffer(const OpenGLStreamBuffer&) = delete;
+	OpenGLStreamBuffer& operator=(const OpenGLStreamBuffer&) = delete;
+
+	ALWAYS_INLINE GLuint GetGLBufferId() const { return m_buffer_id; }
+	ALWAYS_INLINE GLenum GetGLTarget() const { return m_target; }
+	ALWAYS_INLINE u32 GetSize() const { return m_size; }
+
+	/// Binds the buffer to its target.
+	void Bind();
+	/// Binds buffer 0 to the target.
+	void Unbind();
+
+	/// Labels the GL buffer object for debugging tools (only when built with GPU_DEBUG_INFO).
+	void SetDebugName(std::string_view name);
+
+	struct MappingResult {
+		void* pointer;      // where the caller should write its data
+		u32 buffer_offset;  // byte offset of `pointer` from the start of the buffer
+		u32 index_aligned;  // offset / alignment, suitable for base vertex
+		u32 space_aligned;  // remaining space / alignment
+	};
+
+	/// Returns a writable region of at least min_size bytes, positioned at a multiple of alignment.
+	virtual MappingResult Map(u32 alignment, u32 min_size) = 0;
+
+	/// Returns the position in the buffer *before* the start of used_size.
+	virtual u32 Unmap(u32 used_size) = 0;
+
+	/// Returns the minimum granularity of blocks which sync objects will be created around.
+	virtual u32 GetChunkSize() const = 0;
+
+	/// Creates a stream buffer of the given size for the given target. May return null on failure.
+	static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size);
+
+  protected:
+	OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size);
+
+	GLenum m_target;
+	GLuint m_buffer_id;
+	u32 m_size;
+};