Hook up vertex shaders to shader cache

This commit is contained in:
wheremyfoodat 2024-07-25 04:04:41 +03:00
parent 251ff5ee49
commit 2f4c169cad
10 changed files with 256 additions and 77 deletions

View file

@ -256,6 +256,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
include/audio/miniaudio_device.hpp include/ring_buffer.hpp include/bitfield.hpp include/audio/dsp_shared_mem.hpp
include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp
include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
include/PICA/pica_vert_config.hpp
)
cmrc_add_resource_library(

View file

@ -13,6 +13,12 @@
#include "memory.hpp"
#include "renderer.hpp"
enum class ShaderExecMode {
Interpreter, // Interpret shaders on the CPU
JIT, // Recompile shaders to CPU machine code
Hardware, // Recompiler shaders to host shaders and run them on the GPU
};
class GPU {
static constexpr u32 regNum = 0x300;
static constexpr u32 extRegNum = 0x1000;
@ -45,7 +51,7 @@ class GPU {
uint immediateModeVertIndex;
uint immediateModeAttrIndex; // Index of the immediate mode attribute we're uploading
template <bool indexed, bool useShaderJIT>
template <bool indexed, ShaderExecMode mode>
void drawArrays();
// Silly method of avoiding linking problems. TODO: Change to something less silly

View file

@ -0,0 +1,31 @@
#pragma once
#include <array>
#include <cstring>
#include <type_traits>
#include <unordered_map>
#include "PICA/pica_hash.hpp"
#include "PICA/regs.hpp"
#include "bitfield.hpp"
#include "helpers.hpp"
namespace PICA {
// Configuration struct used
struct VertConfig {
PICAHash::HashType shaderHash;
PICAHash::HashType opdescHash;
u32 entrypoint;
bool usingUbershader;
bool operator==(const VertConfig& config) const {
// Hash function and equality operator required by std::unordered_map
return std::memcmp(this, &config, sizeof(VertConfig)) == 0;
}
};
} // namespace PICA
// Override std::hash for our vertex config class
template <>
struct std::hash<PICA::VertConfig> {
std::size_t operator()(const PICA::VertConfig& config) const noexcept { return PICAHash::computeHash((const char*)&config, sizeof(config)); }
};

View file

@ -107,6 +107,11 @@ class PICAShader {
alignas(16) std::array<vec4f, 16> inputs; // Attributes passed to the shader
alignas(16) std::array<vec4f, 16> outputs;
alignas(16) vec4f dummy = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()}); // Dummy register used by the JIT
// We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
// We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
using Hash = PICAHash::HashType;
protected:
std::array<u32, 128> operandDescriptors;
@ -125,11 +130,6 @@ class PICAShader {
std::array<CallInfo, 4> callInfo;
ShaderType type;
// We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
// We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
using Hash = PICAHash::HashType;
Hash lastCodeHash = 0; // Last hash computed for the shader code (Used for the JIT caching mechanism)
Hash lastOpdescHash = 0; // Last hash computed for the operand descriptors (Also used for the JIT)

View file

@ -30,6 +30,8 @@ namespace PICA::ShaderGen {
FragmentGenerator(API api, Language language) : api(api), language(language) {}
std::string generate(const PICA::FragmentConfig& config);
std::string getDefaultVertexShader();
// For when PICA shader is acceleration is enabled. Turn the PICA shader source into a proper vertex shader
std::string getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader);
void setTarget(API api, Language language) {
this->api = api;

View file

@ -82,7 +82,8 @@ class Renderer {
// This function is called on every draw call before parsing vertex data.
// It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between
// ubershaders and shadergen, and so on.
virtual void prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {}
// Returns whether this draw is eligible for using hardware-accelerated shaders or if shaders should run on the CPU
virtual bool prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { return false; }
// Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window
#ifdef PANDA3DS_FRONTEND_QT

View file

@ -3,11 +3,14 @@
#include <array>
#include <cstring>
#include <functional>
#include <optional>
#include <span>
#include <unordered_map>
#include <utility>
#include "PICA/float_types.hpp"
#include "PICA/pica_frag_config.hpp"
#include "PICA/pica_vert_config.hpp"
#include "PICA/pica_hash.hpp"
#include "PICA/pica_vertex.hpp"
#include "PICA/regs.hpp"
@ -52,6 +55,11 @@ class RendererGL final : public Renderer {
float oldDepthScale = -1.0;
float oldDepthOffset = 0.0;
bool oldDepthmapEnable = false;
// Set by prepareDraw, tells us whether the current draw is using hw-accelerated shader
bool usingAcceleratedShader = false;
// Cached pointer to the current vertex shader when using HW accelerated shaders
OpenGL::Shader* generatedVertexShader = nullptr;
SurfaceCache<DepthBuffer, 16, true> depthBufferCache;
SurfaceCache<ColourBuffer, 16, true> colourBufferCache;
@ -74,7 +82,38 @@ class RendererGL final : public Renderer {
OpenGL::Program program;
uint uboBinding;
};
std::unordered_map<PICA::FragmentConfig, CachedProgram> shaderCache;
struct ShaderCache {
std::unordered_map<PICA::VertConfig, std::optional<OpenGL::Shader>> vertexShaderCache;
std::unordered_map<PICA::FragmentConfig, OpenGL::Shader> fragmentShaderCache;
// Program cache indexed by GLuints for the vertex and fragment shader to use
// Top 32 bits are the vertex shader GLuint, bottom 32 bits are the fs GLuint
std::unordered_map<u64, CachedProgram> programCache;
void clear() {
for (auto& it : programCache) {
CachedProgram& cachedProgram = it.second;
cachedProgram.program.free();
glDeleteBuffers(1, &cachedProgram.uboBinding);
}
for (auto& it : vertexShaderCache) {
if (it.second.has_value()) {
it.second->free();
}
}
for (auto& it : fragmentShaderCache) {
it.second.free();
}
programCache.clear();
vertexShaderCache.clear();
fragmentShaderCache.clear();
}
};
ShaderCache shaderCache;
OpenGL::Framebuffer getColourFBO();
OpenGL::Texture getTexture(Texture& tex);
@ -109,14 +148,13 @@ class RendererGL final : public Renderer {
virtual bool supportsShaderReload() override { return true; }
virtual std::string getUbershader() override;
virtual void setUbershader(const std::string& shader) override;
virtual void prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) override;
virtual bool prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) override;
std::optional<ColourBuffer> getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true);
// Note: The caller is responsible for deleting the currently bound FBO before calling this
void setFBO(uint handle) { screenFramebuffer.m_handle = handle; }
void resetStateManager() { gl.reset(); }
void clearShaderCache();
void initUbershader(OpenGL::Program& program);
#ifdef PANDA3DS_FRONTEND_QT

View file

@ -123,27 +123,38 @@ void GPU::reset() {
// Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
// And whether we are going to use the shader JIT (second template parameter)
void GPU::drawArrays(bool indexed) {
renderer->prepareForDraw(shaderUnit, false);
const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
const bool hwShaders = renderer->prepareForDraw(shaderUnit, false);
if (indexed) {
if (shaderJITEnabled)
drawArrays<true, true>();
else
drawArrays<true, false>();
if (hwShaders) {
if (indexed) {
drawArrays<true, ShaderExecMode::Hardware>();
} else {
drawArrays<false, ShaderExecMode::Hardware>();
}
} else {
if (shaderJITEnabled)
drawArrays<false, true>();
else
drawArrays<false, false>();
const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
if (indexed) {
if (shaderJITEnabled) {
drawArrays<true, ShaderExecMode::JIT>();
} else {
drawArrays<true, ShaderExecMode::Interpreter>();
}
} else {
if (shaderJITEnabled) {
drawArrays<false, ShaderExecMode::JIT>();
} else {
drawArrays<false, ShaderExecMode::Interpreter>();
}
}
}
}
static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
template <bool indexed, bool useShaderJIT>
template <bool indexed, ShaderExecMode mode>
void GPU::drawArrays() {
if constexpr (useShaderJIT) {
if constexpr (mode == ShaderExecMode::JIT) {
shaderJIT.prepare(shaderUnit.vs);
}
@ -322,29 +333,38 @@ void GPU::drawArrays() {
}
}
// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
// Based on the SH_ATTRIBUTES_PERMUTATION registers.
// Ie it might attribute #0 to v2, #1 to v7, etc
for (int j = 0; j < totalAttribCount; j++) {
const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
}
// Running shader on the CPU instead of the GPU
if constexpr (mode == ShaderExecMode::Interpreter || mode == ShaderExecMode::JIT) {
// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
// Based on the SH_ATTRIBUTES_PERMUTATION registers.
// Ie it might map attribute #0 to v2, #1 to v7, etc
for (int j = 0; j < totalAttribCount; j++) {
const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
}
if constexpr (useShaderJIT) {
shaderJIT.run(shaderUnit.vs);
} else {
shaderUnit.vs.run();
}
if constexpr (mode == ShaderExecMode::JIT) {
shaderJIT.run(shaderUnit.vs);
} else {
shaderUnit.vs.run();
}
PICA::Vertex& out = vertices[i];
// Map shader outputs to fixed function properties
const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
for (int i = 0; i < totalShaderOutputs; i++) {
const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i];
PICA::Vertex& out = vertices[i];
// Map shader outputs to fixed function properties
const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
for (int i = 0; i < totalShaderOutputs; i++) {
const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i];
for (int j = 0; j < 4; j++) { // pls unroll
const u32 mapping = (config >> (j * 8)) & 0x1F;
out.raw[mapping] = vsOutputRegisters[i][j];
for (int j = 0; j < 4; j++) { // pls unroll
const u32 mapping = (config >> (j * 8)) & 0x1F;
out.raw[mapping] = vsOutputRegisters[i][j];
}
}
} else { // Using hw shaders and running the shader on the CPU, just write the inputs to the attribute buffer directly
PICA::Vertex& out = vertices[i];
for (int j = 0; j < totalAttribCount; j++) {
const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
std::memcpy(&out.raw[mapping], &currentAttributes[j], sizeof(vec4f));
}
}
}

View file

@ -72,11 +72,6 @@ std::string FragmentGenerator::getDefaultVertexShader() {
out float gl_ClipDistance[2];
#endif
vec4 abgr8888ToVec4(uint abgr) {
const float scale = 1.0 / 255.0;
return scale * vec4(float(abgr & 0xffu), float((abgr >> 8) & 0xffu), float((abgr >> 16) & 0xffu), float(abgr >> 24));
}
void main() {
gl_Position = a_coords;
vec4 colourAbs = abs(a_vertexColour);
@ -677,4 +672,58 @@ void FragmentGenerator::compileFog(std::string& shader, const PICA::FragmentConf
shader += "vec2 value = texelFetch(u_tex_luts, ivec2(int(clamped_index), 24), 0).rg;"; // fog LUT is past the light LUTs
shader += "float fog_factor = clamp(value.r + value.g * delta, 0.0, 1.0);";
shader += "combinerOutput.rgb = mix(fog_color, combinerOutput.rgb, fog_factor);";
}
std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader) {
if (usingUbershader) {
Helpers::panic("Unimplemented: GetVertexShaderAccelerated for ubershader");
return picaSource;
} else {
// TODO: Uniforms and don't hardcode fixed-function semantic indices...
std::string ret = picaSource;
if (api == API::GLES) {
ret += "\n#define USING_GLES\n";
}
ret += R"(
out vec4 v_quaternion;
out vec4 v_colour;
out vec3 v_texcoord0;
out vec2 v_texcoord1;
out vec3 v_view;
out vec2 v_texcoord2;
#ifndef USING_GLES
out float gl_ClipDistance[2];
#endif
void main() {
pica_shader_main();
vec4 a_coords = output_registers[0];
vec4 a_vertexColour = output_registers[1];
vec2 a_texcoord0 = output_registers[2].xy;
float a_texcoord0_w = output_registers[2].w;
vec2 a_texcoord1 = output_registers[3].xy;
vec2 a_texcoord2 = output_registers[4].xy;
vec3 a_view = output_registers[5].xyz;
vec4 a_quaternion = output_registers[6];
gl_Position = a_coords;
vec4 colourAbs = abs(a_vertexColour);
v_colour = min(colourAbs, vec4(1.f));
v_texcoord0 = vec3(a_texcoord0.x, 1.0 - a_texcoord0.y, a_texcoord0_w);
v_texcoord1 = vec2(a_texcoord1.x, 1.0 - a_texcoord1.y);
v_texcoord2 = vec2(a_texcoord2.x, 1.0 - a_texcoord2.y);
v_view = a_view;
v_quaternion = a_quaternion;
#ifndef USING_GLES
//gl_ClipDistance[0] = -a_coords.z;
//gl_ClipDistance[1] = dot(clipCoords, a_coords);
#endif
})";
return ret;
}
}

View file

@ -25,7 +25,7 @@ void RendererGL::reset() {
colourBufferCache.reset();
textureCache.reset();
clearShaderCache();
shaderCache.clear();
// Init the colour/depth buffer settings to some random defaults on reset
colourBufferLoc = 0;
@ -788,18 +788,24 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
PICA::FragmentConfig fsConfig(regs);
CachedProgram& programEntry = shaderCache[fsConfig];
OpenGL::Shader& fragShader = shaderCache.fragmentShaderCache[fsConfig];
if (!fragShader.exists()) {
std::string fs = fragShaderGen.generate(fsConfig);
fragShader.create({fs.c_str(), fs.size()}, OpenGL::Fragment);
}
// Get the handle of the current vertex shader
OpenGL::Shader& vertexShader = usingAcceleratedShader ? *generatedVertexShader : defaultShadergenVs;
// And form the key for looking up a shader program
const u64 programKey = (u64(vertexShader.handle()) << 32) | u64(fragShader.handle());
CachedProgram& programEntry = shaderCache.programCache[programKey];
OpenGL::Program& program = programEntry.program;
if (!program.exists()) {
std::string fs = fragShaderGen.generate(fsConfig);
OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment);
program.create({defaultShadergenVs, fragShader});
program.create({vertexShader, fragShader});
gl.useProgram(program);
fragShader.free();
// Init sampler objects. Texture 0 goes in texture unit 0, texture 1 in TU 1, texture 2 in TU 2, and the light maps go in TU 3
glUniform1i(OpenGL::uniformLocation(program, "u_tex0"), 0);
glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1);
@ -904,15 +910,8 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
return program;
}
void RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
std::string vertShaderSource = PICA::ShaderGen::decompileShader(
shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL
);
OpenGL::Shader vert({vertShaderSource.c_str(), vertShaderSource.size()}, OpenGL::Vertex);
//triangleProgram.create({vert, frag});
std::cout << vertShaderSource << "\n";
bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
// First we figure out if we will be using an ubershader
bool usingUbershader = emulatorConfig->useUbershaders;
if (usingUbershader) {
const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0;
@ -925,6 +924,46 @@ void RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
}
}
// Then we figure out if we will use hw accelerated shaders, and try to fetch our shader
// TODO: Ubershader support for accelerated shaders
usingAcceleratedShader = emulatorConfig->accelerateShaders && !isImmediateMode && !usingUbershader;
if (usingAcceleratedShader) {
auto shaderCodeHash = shaderUnit.vs.getCodeHash();
auto opdescHash = shaderUnit.vs.getOpdescHash();
auto vertexConfig = PICA::VertConfig{
.shaderHash = shaderCodeHash,
.opdescHash = opdescHash,
.entrypoint = shaderUnit.vs.entrypoint,
.usingUbershader = usingUbershader,
};
std::optional<OpenGL::Shader>& shader = shaderCache.vertexShaderCache[vertexConfig];
// If the optional is false, we have never tried to recompile the shader before. Try to recompile it and see if it works.
if (!shader.has_value()) {
// Initialize shader to a "null" shader (handle == 0)
*shader = OpenGL::Shader();
std::string picaShaderSource = PICA::ShaderGen::decompileShader(
shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL
);
// Empty source means compilation error, if the source is not empty then we convert the rcompiled PICA code into a valid shader and upload
// it to the GPU
if (!picaShaderSource.empty()) {
std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, usingUbershader);
shader->create({vertexShaderSource}, OpenGL::Vertex);
}
}
// Shader generation did not work out, so set usingAcceleratedShader to false
if (!shader->exists()) {
usingAcceleratedShader = false;
} else {
generatedVertexShader = &(*shader);
}
}
if (usingUbershader) {
gl.useProgram(triangleProgram);
} else {
@ -958,6 +997,8 @@ void RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]);
setupUbershaderTexEnv();
}
return usingAcceleratedShader;
}
void RendererGL::screenshot(const std::string& name) {
@ -985,22 +1026,12 @@ void RendererGL::screenshot(const std::string& name) {
stbi_write_png(name.c_str(), width, height, 4, flippedPixels.data(), 0);
}
void RendererGL::clearShaderCache() {
for (auto& shader : shaderCache) {
CachedProgram& cachedProgram = shader.second;
cachedProgram.program.free();
glDeleteBuffers(1, &cachedProgram.uboBinding);
}
shaderCache.clear();
}
void RendererGL::deinitGraphicsContext() {
// Invalidate all surface caches since they'll no longer be valid
textureCache.reset();
depthBufferCache.reset();
colourBufferCache.reset();
clearShaderCache();
shaderCache.clear();
// All other GL objects should be invalidated automatically and be recreated by the next call to initGraphicsContext
// TODO: Make it so that depth and colour buffers get written back to 3DS memory
@ -1048,4 +1079,4 @@ void RendererGL::initUbershader(OpenGL::Program& program) {
glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1);
glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2);
glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3);
}
}