Disgusting changes to handle the fact that hw shader vertices are 2x as big

This commit is contained in:
wheremyfoodat 2024-07-28 03:38:23 +03:00
parent 44705508ff
commit 37d7bad5aa
6 changed files with 89 additions and 24 deletions

View file

@ -6,21 +6,39 @@
#include "PICA/pica_hash.hpp"
#include "PICA/regs.hpp"
#include "PICA/shader.hpp"
#include "bitfield.hpp"
#include "helpers.hpp"
namespace PICA {
// Configuration struct used
// Configuration struct used
// Key describing one vertex shader configuration: the shader code/opdesc hashes, its entrypoint,
// the output-mapping registers and whether the ubershader is in use.
// Instances are compared bytewise (see operator==), so the constructor must give every field a
// deterministic value.
struct VertConfig {
	PICAHash::HashType shaderHash;
	PICAHash::HashType opdescHash;
	u32 entrypoint;

	// PICA registers for configuring shader output->fragment semantic mapping.
	// Value-initialized to zero so that the entries past outputCount compare equal between configs.
	std::array<u32, 7> outmaps{};
	u16 outputMask;
	u8 outputCount;
	bool usingUbershader;

	// Equality operator (needed, together with a hash function, to use this struct as a std::unordered_map key).
	// NOTE(review): memcmp also compares any padding bytes the compiler inserts into this struct, and the
	// constructor does not explicitly clear them — confirm the layout has no padding, or compare field by field.
	bool operator==(const VertConfig& config) const {
		return std::memcmp(this, &config, sizeof(VertConfig)) == 0;
	}

	// Builds the config from the current vertex shader state and the PICA register file.
	//   shader:          Shader whose code hash, opdesc hash and entrypoint are captured
	//   regs:            PICA internal registers; the output count, output mask and outmap registers are read from here
	//   usingUbershader: Stored verbatim in the config
	VertConfig(PICAShader& shader, const std::array<u32, 0x300>& regs, bool usingUbershader) : usingUbershader(usingUbershader) {
		shaderHash = shader.getCodeHash();
		opdescHash = shader.getOpdescHash();
		entrypoint = shader.entrypoint;

		// The count is masked to 3 bits, so it is at most 7 and can never index past the end of outmaps
		outputCount = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
		outputMask = regs[PICA::InternalRegs::VertexShaderOutputMask];

		// Copy the outmap register of every active output. Previously this loop wrote into outputMask,
		// which clobbered the mask read above and left outmaps permanently zeroed.
		for (int i = 0; i < outputCount; i++) {
			outmaps[i] = regs[PICA::InternalRegs::ShaderOutmap0 + i];
		}
	}
};
} // namespace PICA

View file

@ -3,6 +3,7 @@
#include "PICA/gpu.hpp"
#include "PICA/pica_frag_config.hpp"
#include "PICA/pica_vert_config.hpp"
#include "PICA/regs.hpp"
#include "PICA/shader_gen_types.hpp"
#include "helpers.hpp"
@ -31,7 +32,7 @@ namespace PICA::ShaderGen {
std::string generate(const PICA::FragmentConfig& config);
std::string getDefaultVertexShader();
// For when PICA shader acceleration is enabled. Turns the PICA shader source into a proper vertex shader
std::string getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader);
std::string getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader);
void setTarget(API api, Language language) {
this->api = api;

View file

@ -150,7 +150,19 @@ void GPU::drawArrays(bool indexed) {
}
}
static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
// We need a union here, because unfortunately in CPU shaders we only need to store the vertex shader outputs in the vertex buffer,
// which consist of 8 vec4 attributes, while with GPU shaders we need to pass all the vertex shader inputs to the GPU, which consist
// of 16 vec4 attributes
// Storage for the vertices of a draw call. Both members alias the same memory; which one is
// meaningful depends on whether vertex shaders run on the CPU or on the GPU.
union PICAVertexBuffer {
// Used with CPU shaders
std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
// Used with GPU shaders. We can have up to 16 attributes per vertex, each attribute with 4 floats
std::array<float, Renderer::vertexBufferSize * 16 * 4> vsInputs;
// User-provided empty constructor: it initializes neither member.
// NOTE(review): presumably needed because a member type is not trivially default-constructible — confirm.
PICAVertexBuffer() {}
};
static PICAVertexBuffer vertexBuffer;
template <bool indexed, ShaderExecMode mode>
void GPU::drawArrays() {
@ -158,6 +170,10 @@ void GPU::drawArrays() {
shaderJIT.prepare(shaderUnit.vs);
}
// We can have up to 16 attributes, each one consisting of 4 floats
constexpr u32 maxAttrSizeInFloats = 16 * 4;
auto& vertices = vertexBuffer.vertices;
setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]);
// Base address for vertex attributes
@ -228,7 +244,14 @@ void GPU::drawArrays() {
size_t tag = vertexIndex % vertexCacheSize;
// Cache hit
if (cache.validBits[tag] && cache.ids[tag] == vertexIndex) {
vertices[i] = vertices[cache.bufferPositions[tag]];
if constexpr (mode != ShaderExecMode::Hardware) {
vertices[i] = vertices[cache.bufferPositions[tag]];
} else {
std::memcpy(
&vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cache.bufferPositions[tag] * maxAttrSizeInFloats],
sizeof(float) * maxAttrSizeInFloats
);
}
continue;
}
@ -361,11 +384,11 @@ void GPU::drawArrays() {
}
}
} else { // Using hw shaders and running the shader on the CPU, just write the inputs to the attribute buffer directly
PICA::Vertex& out = vertices[i];
float* out = &vertexBuffer.vsInputs[i * maxAttrSizeInFloats];
for (int j = 0; j < totalAttribCount; j++) {
const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
// Multiply mapping * 4 as mapping refers to a vec4 whereas out.raw is an array of floats
std::memcpy(&out.raw[mapping * 4], &currentAttributes[j], sizeof(vec4f));
// Multiply mapping * 4 as mapping refers to a vec4 whereas out is an array of floats
std::memcpy(&out[mapping * 4], &currentAttributes[j], sizeof(vec4f));
}
}
}

View file

@ -160,7 +160,7 @@ const Function* ShaderDecompiler::findFunction(const AddressRange& range) {
void ShaderDecompiler::writeAttributes() {
decompiledShader += R"(
layout(location = 0) in vec4 inputs[8];
layout(location = 0) in vec4 inputs[16];
layout(std140) uniform PICAShaderUniforms {
vec4 uniform_float[96];
uvec4 uniform_int;
@ -168,7 +168,7 @@ void ShaderDecompiler::writeAttributes() {
};
vec4 tmp_regs[16];
vec4 out_regs[8];
vec4 out_regs[16];
vec4 dummy_vec = vec4(0.0);
bvec2 cmp_reg = bvec2(false);
)";

View file

@ -671,7 +671,28 @@ void FragmentGenerator::compileFog(std::string& shader, const PICA::FragmentConf
shader += "combinerOutput.rgb = mix(fog_color, combinerOutput.rgb, fog_factor);";
}
std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader) {
std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader) {
// First, calculate output register -> Fixed function fragment semantics based on the VAO config
{
uint count = 0;
u16 outputMask = vertConfig.outputMask;
std::array<u8, 16> vsOutputRegisters;
// See which registers are actually enabled and ignore the disabled ones
for (int i = 0; i < 16; i++) {
if (outputMask & 1) {
vsOutputRegisters[count++] = i;
}
outputMask >>= 1;
}
// For the others, map the index to a vs output directly (TODO: What does hw actually do?)
for (; count < 16; count++) {
vsOutputRegisters[count] = count;
}
}
if (usingUbershader) {
Helpers::panic("Unimplemented: GetVertexShaderAccelerated for ubershader");
return picaSource;
@ -704,8 +725,8 @@ void main() {
float a_texcoord0_w = out_regs[2].w;
vec2 a_texcoord1 = out_regs[3].xy;
vec2 a_texcoord2 = out_regs[4].xy;
vec3 a_view = out_regs[5].xyz;
vec4 a_quaternion = out_regs[6];
vec3 a_view = out_regs[2].xyz;
vec4 a_quaternion = out_regs[3];
gl_Position = a_coords;
vec4 colourAbs = abs(a_vertexColour);
@ -722,7 +743,7 @@ void main() {
gl_ClipDistance[1] = dot(clipCoords, a_coords);
#endif
})";
std::cout << ret << "\n";
return ret;
}
}

View file

@ -88,7 +88,7 @@ void RendererGL::initGraphicsContextInternal() {
gl.bindUBO(hwShaderUniformUBO);
glBufferData(GL_UNIFORM_BUFFER, PICAShader::totalUniformSize(), nullptr, GL_DYNAMIC_DRAW);
vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW);
vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize * 2, GL_STREAM_DRAW);
gl.bindVBO(vbo);
// Initialize the VAO used when not using hw shaders
defaultVAO.create();
@ -122,8 +122,8 @@ void RendererGL::initGraphicsContextInternal() {
// Initialize the VAO used for hw shaders
hwShaderVAO.create();
gl.bindVAO(hwShaderVAO);
for (int attr = 0; attr < 8; attr++) {
hwShaderVAO.setAttributeFloat<float>(attr, 4, sizeof(Vertex), attr * sizeof(float) * 4);
for (int attr = 0; attr < 16; attr++) {
hwShaderVAO.setAttributeFloat<float>(attr, 4, sizeof(Vertex) * 2, attr * sizeof(float) * 4);
hwShaderVAO.enableAttribute(attr);
}
@ -495,7 +495,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
setupStencilTest(stencilEnable);
vbo.bufferVertsSub(vertices);
// If we're using hardware shaders, the vertex array works completely differently:
// instead of 8 vec4 attributes, each vertex is 16 vec4 attributes. We use a union + aliasing, which is not ideal for readability.
if (!usingAcceleratedShader) {
vbo.bufferVertsSub(vertices);
} else {
glBufferSubData(GL_ARRAY_BUFFER, 0, vertices.size_bytes() * 2, vertices.data());
}
OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
}
@ -956,12 +963,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
if (usingAcceleratedShader) {
auto shaderCodeHash = shaderUnit.vs.getCodeHash();
auto opdescHash = shaderUnit.vs.getOpdescHash();
auto vertexConfig = PICA::VertConfig{
.shaderHash = shaderCodeHash,
.opdescHash = opdescHash,
.entrypoint = shaderUnit.vs.entrypoint,
.usingUbershader = usingUbershader,
};
PICA::VertConfig vertexConfig(shaderUnit.vs, regs, usingUbershader);
std::optional<OpenGL::Shader>& shader = shaderCache.vertexShaderCache[vertexConfig];
// If the optional is false, we have never tried to recompile the shader before. Try to recompile it and see if it works.
@ -976,7 +978,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
// Empty source means compilation error. If the source is not empty, we convert the recompiled PICA code into a valid shader and upload
// it to the GPU
if (!picaShaderSource.empty()) {
std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, usingUbershader);
std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, vertexConfig, usingUbershader);
shader->create({vertexShaderSource}, OpenGL::Vertex);
}
}