Moar shader decompiler (#559)

* Renderer: Add prepareForDraw callback

* Add fmt submodule and port shader decompiler instructions to it

* Add shader acceleration setting

* Hook up vertex shaders to shader cache

* Shader decompiler: Fix redundant compilations

* Shader Decompiler: Fix vertex attribute upload

* Shader compiler: Simplify generated code for reading and faster compilation

* Further simplify shader decompiler output

* Shader decompiler: More smallen-ing

* Shader decompiler: Get PICA uniforms uploaded to the GPU

* Shader decompiler: Readd clipping

* Shader decompiler: Actually `break` on control flow instructions

* Shader decompiler: More control flow handling

* Shader decompiler: Fix desitnation mask

* Shader Decomp: Remove pair member capture in lambda (unsupported on NDK)

* Disgusting changes to handle the fact that hw shader shaders are 2x as big

* Shader decompiler: Implement proper output semantic mapping

* Moar instructions

* Shader decompiler: Add FLR/SLT/SLTI/SGE/SGEI

* Shader decompiler: Add register indexing

* Shader decompiler: Optimize mova with both x and y masked

* Shader decompiler: Add DPH/DPHI

* Fix shader caching being broken

* PICA decompiler: Cache VS uniforms

* Simply vertex cache code

* Simplify vertex cache code

* Shader decompiler: Add loops

* Shader decompiler: Implement safe multiplication

* Shader decompiler: Implement LG2/EX2

* Shader decompiler: More control flow

* Shader decompiler: Fix JMPU condition

* Shader decompiler: Convert main function to void

* PICA: Start implementing GPU vertex fetch

* More hw VAO work

* More hw VAO work

* More GPU vertex fetch code

* Add GL Stream Buffer from Duckstation

* GL: Actually upload data to stream buffers

* GPU: Cleanup immediate mode handling

* Get first renders working with accelerated draws

* Shader decompiler: Fix control flow analysis bugs

* HW shaders: Accelerate indexed draws

* Shader decompiler: Add support for compilation errors

* GLSL decompiler: Fall back for LITP

* Add Renderdoc scope classes

* Fix control flow analysis bug

* HW shaders: Fix attribute fetch

* Rewriting hw vertex fetch

* Stream buffer: Fix copy-paste mistake

* HW shaders: Fix indexed rendering

* HW shaders: Add padding attributes

* HW shaders: Avoid redundant glVertexAttrib4f calls

* HW shaders: Fix loops

* HW shaders: Make generated shaders slightly smaller

* Fix libretro build

* HW shaders: Fix android

* Remove redundant ubershader checks

* Set accelerate shader default to true

* Shader decompiler: Don't declare VS input attributes as an array

* Change ubuntu-latest to Ubuntu 24.04 because Microsoft screwed up their CI again

* fix merge conflict bug
This commit is contained in:
wheremyfoodat 2024-10-19 16:53:51 +03:00 committed by GitHub
parent afaf18f124
commit 49a94a13c5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
34 changed files with 1877 additions and 253 deletions

View file

@ -3,15 +3,20 @@
#include <array>
#include <cstring>
#include <functional>
#include <memory>
#include <optional>
#include <span>
#include <unordered_map>
#include <utility>
#include "PICA/float_types.hpp"
#include "PICA/pica_frag_config.hpp"
#include "PICA/pica_hash.hpp"
#include "PICA/pica_vert_config.hpp"
#include "PICA/pica_vertex.hpp"
#include "PICA/regs.hpp"
#include "PICA/shader_gen.hpp"
#include "gl/stream_buffer.h"
#include "gl_driver.hpp"
#include "gl_state.hpp"
#include "helpers.hpp"
@ -29,9 +34,11 @@ class RendererGL final : public Renderer {
OpenGL::Program triangleProgram;
OpenGL::Program displayProgram;
OpenGL::VertexArray vao;
// VAO for when not using accelerated vertex shaders. Contains attribute declarations matching to the PICA fixed function fragment attributes
OpenGL::VertexArray defaultVAO;
// VAO for when using accelerated vertex shaders. The PICA vertex shader inputs are passed as attributes without CPU processing.
OpenGL::VertexArray hwShaderVAO;
OpenGL::VertexBuffer vbo;
bool enableUbershader = true;
// Data
struct {
@ -54,6 +61,21 @@ class RendererGL final : public Renderer {
float oldDepthScale = -1.0;
float oldDepthOffset = 0.0;
bool oldDepthmapEnable = false;
// Set by prepareForDraw, tells us whether the current draw is using hw-accelerated shader
bool usingAcceleratedShader = false;
bool performIndexedRender = false;
bool usingShortIndices = false;
// Set by prepareForDraw, metadata for indexed renders
GLuint minimumIndex = 0;
GLuint maximumIndex = 0;
void* hwIndexBufferOffset = nullptr;
// When doing hw shaders, we cache which attributes are enabled in our VAO to avoid having to enable/disable all attributes on each draw
u32 previousAttributeMask = 0;
// Cached pointer to the current vertex shader when using HW accelerated shaders
OpenGL::Shader* generatedVertexShader = nullptr;
SurfaceCache<DepthBuffer, 16, true> depthBufferCache;
SurfaceCache<ColourBuffer, 16, true> colourBufferCache;
@ -71,12 +93,51 @@ class RendererGL final : public Renderer {
// We can compile this once and then link it with all other generated fragment shaders
OpenGL::Shader defaultShadergenVs;
GLuint shadergenFragmentUBO;
// UBO for uploading the PICA uniforms when using hw shaders
GLuint hwShaderUniformUBO;
using StreamBuffer = OpenGLStreamBuffer;
std::unique_ptr<StreamBuffer> hwVertexBuffer;
std::unique_ptr<StreamBuffer> hwIndexBuffer;
// Cache of fixed attribute values so that we don't do any duplicate updates
std::array<std::array<float, 4>, 16> fixedAttrValues;
// Cached recompiled fragment shader
struct CachedProgram {
OpenGL::Program program;
};
std::unordered_map<PICA::FragmentConfig, CachedProgram> shaderCache;
struct ShaderCache {
std::unordered_map<PICA::VertConfig, std::optional<OpenGL::Shader>> vertexShaderCache;
std::unordered_map<PICA::FragmentConfig, OpenGL::Shader> fragmentShaderCache;
// Program cache indexed by GLuints for the vertex and fragment shader to use
// Top 32 bits are the vertex shader GLuint, bottom 32 bits are the fs GLuint
std::unordered_map<u64, CachedProgram> programCache;
void clear() {
for (auto& it : programCache) {
CachedProgram& cachedProgram = it.second;
cachedProgram.program.free();
}
for (auto& it : vertexShaderCache) {
if (it.second.has_value()) {
it.second->free();
}
}
for (auto& it : fragmentShaderCache) {
it.second.free();
}
programCache.clear();
vertexShaderCache.clear();
fragmentShaderCache.clear();
}
};
ShaderCache shaderCache;
OpenGL::Framebuffer getColourFBO();
OpenGL::Texture getTexture(Texture& tex);
@ -95,6 +156,8 @@ class RendererGL final : public Renderer {
void updateFogLUT();
void initGraphicsContextInternal();
void accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel);
public:
RendererGL(GPU& gpu, const std::array<u32, regNum>& internalRegs, const std::array<u32, extRegNum>& externalRegs)
: Renderer(gpu, internalRegs, externalRegs), fragShaderGen(PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL) {}
@ -112,15 +175,13 @@ class RendererGL final : public Renderer {
virtual bool supportsShaderReload() override { return true; }
virtual std::string getUbershader() override;
virtual void setUbershader(const std::string& shader) override;
virtual void setUbershaderSetting(bool value) override { enableUbershader = value; }
virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) override;
std::optional<ColourBuffer> getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true);
// Note: The caller is responsible for deleting the currently bound FBO before calling this
void setFBO(uint handle) { screenFramebuffer.m_handle = handle; }
void resetStateManager() { gl.reset(); }
void clearShaderCache();
void initUbershader(OpenGL::Program& program);
#ifdef PANDA3DS_FRONTEND_QT