Disgusting changes to handle the fact that hw shader vertices are 2x as big

2025-07-12 18:28:30 +12:00 · 2024-07-28 03:38:23 +03:00 · 2024-07-28 03:38:23 +03:00 · 37d7bad5aa
commit 37d7bad5aa
parent 44705508ff
6 changed files with 89 additions and 24 deletions
--- a/include/PICA/pica_vert_config.hpp
+++ b/include/PICA/pica_vert_config.hpp
@ -6,21 +6,39 @@

 #include "PICA/pica_hash.hpp"
 #include "PICA/regs.hpp"
+#include "PICA/shader.hpp"
 #include "bitfield.hpp"
 #include "helpers.hpp"

 namespace PICA {
-	// Configuration struct used 
+	// Configuration struct used as the key for looking up cached hardware vertex shaders
 	struct VertConfig {
 		PICAHash::HashType shaderHash;
 		PICAHash::HashType opdescHash;
 		u32 entrypoint;
+
+		// PICA registers for configuring shader output->fragment semantic mapping
+		// Entries at index >= outputCount are left zero-initialized
+		std::array<u32, 7> outmaps{};
+		u16 outputMask;
+		u8 outputCount;
 		bool usingUbershader;

 		bool operator==(const VertConfig& config) const {
 			// Hash function and equality operator required by std::unordered_map
+			// This compares the raw object bytes, so every member takes part in the comparison
+			// NOTE(review): any padding bytes in this struct are compared too and are not explicitly initialized - confirm the layout has none
 			return std::memcmp(this, &config, sizeof(VertConfig)) == 0;
 		}
+
+		// Builds the config from the current vertex shader state and the PICA register file
+		//   shader: Vertex shader whose code hash, operand descriptor hash and entrypoint are recorded
+		//   regs: PICA internal registers, used to read the shader output count, output mask and outmap registers
+		//   usingUbershader: Stored as-is in the config
+		VertConfig(PICAShader& shader, const std::array<u32, 0x300>& regs, bool usingUbershader) : usingUbershader(usingUbershader) {
+			shaderHash = shader.getCodeHash();
+			opdescHash = shader.getOpdescHash();
+			entrypoint = shader.entrypoint;
+
+			// Masked with 7, so outputCount never exceeds the size of the outmaps array
+			outputCount = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
+			outputMask = regs[PICA::InternalRegs::VertexShaderOutputMask];
+			// Copy each active outmap register into its own slot. This must not write to outputMask, or the mask read above is lost
+			for (int i = 0; i < outputCount; i++) {
+				outmaps[i] = regs[PICA::InternalRegs::ShaderOutmap0 + i];
+			}
+		}
 	};
 }  // namespace PICA

--- a/include/PICA/shader_gen.hpp
+++ b/include/PICA/shader_gen.hpp
@ -3,6 +3,7 @@

 #include "PICA/gpu.hpp"
 #include "PICA/pica_frag_config.hpp"
+#include "PICA/pica_vert_config.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen_types.hpp"
 #include "helpers.hpp"
@ -31,7 +32,7 @@ namespace PICA::ShaderGen {
 		std::string generate(const PICA::FragmentConfig& config);
 		std::string getDefaultVertexShader();
 		// For when PICA shader acceleration is enabled. Turn the PICA shader source into a proper vertex shader
-		std::string getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader);
+		std::string getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader);

 		void setTarget(API api, Language language) {
 			this->api = api;
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@ -150,7 +150,19 @@ void GPU::drawArrays(bool indexed) {
 	}
 }

-static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
+// We need a union here because the vertex buffer layout depends on where the vertex shader runs: with CPU shaders we only need to
+// store the vertex shader outputs in the vertex buffer, which consist of 8 vec4 attributes, while with GPU shaders we need to pass
+// all the vertex shader inputs to the GPU, which consist of 16 vec4 attributes
+union PICAVertexBuffer {
+	// Used with CPU shaders
+	std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
+	// Used with GPU shaders. We can have up to 16 attributes per vertex, each attribute with 4 floats
+	std::array<float, Renderer::vertexBufferSize * 16 * 4> vsInputs;
+
+	PICAVertexBuffer() {}  // Intentionally empty: does not initialize either member
+};
+
+static PICAVertexBuffer vertexBuffer;

 template <bool indexed, ShaderExecMode mode>
 void GPU::drawArrays() {
@ -158,6 +170,10 @@ void GPU::drawArrays() {
 		shaderJIT.prepare(shaderUnit.vs);
 	}

+	// We can have up to 16 attributes, each one consisting of 4 floats
+	constexpr u32 maxAttrSizeInFloats = 16 * 4;
+	auto& vertices = vertexBuffer.vertices;
+
 	setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]);

 	// Base address for vertex attributes
@ -228,7 +244,14 @@ void GPU::drawArrays() {
 			size_t tag = vertexIndex % vertexCacheSize;
 			// Cache hit
 			if (cache.validBits[tag] && cache.ids[tag] == vertexIndex) {
-				vertices[i] = vertices[cache.bufferPositions[tag]];
+				if constexpr (mode != ShaderExecMode::Hardware) {
+					vertices[i] = vertices[cache.bufferPositions[tag]];
+				} else {
+					std::memcpy(
+						&vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cache.bufferPositions[tag] * maxAttrSizeInFloats],
+						sizeof(float) * maxAttrSizeInFloats
+					);
+				}
 				continue;
 			}

@ -361,11 +384,11 @@ void GPU::drawArrays() {
 				}
 			}
 		} else {  // Using hw shaders and running the shader on the GPU, just write the inputs to the attribute buffer directly
-			PICA::Vertex& out = vertices[i];
+			float* out = &vertexBuffer.vsInputs[i * maxAttrSizeInFloats];
 			for (int j = 0; j < totalAttribCount; j++) {
 				const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
-				// Multiply mapping * 4 as mapping refers to a vec4 whereas out.raw is an array of floats
-				std::memcpy(&out.raw[mapping * 4], &currentAttributes[j], sizeof(vec4f));
+				// Multiply mapping * 4 as mapping refers to a vec4 whereas out is an array of floats
+				std::memcpy(&out[mapping * 4], &currentAttributes[j], sizeof(vec4f));
 			}
 		}
 	}
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@ -160,7 +160,7 @@ const Function* ShaderDecompiler::findFunction(const AddressRange& range) {

 void ShaderDecompiler::writeAttributes() {
 	decompiledShader += R"(
-	layout(location = 0) in vec4 inputs[8];
+	layout(location = 0) in vec4 inputs[16];
 	layout(std140) uniform PICAShaderUniforms {
 		vec4 uniform_float[96];
 		uvec4 uniform_int;
@ -168,7 +168,7 @@ void ShaderDecompiler::writeAttributes() {
 	};

 	vec4 tmp_regs[16];
-	vec4 out_regs[8];
+	vec4 out_regs[16];
 	vec4 dummy_vec = vec4(0.0);
 	bvec2 cmp_reg = bvec2(false);
 )";
--- a/src/core/PICA/shader_gen_glsl.cpp
+++ b/src/core/PICA/shader_gen_glsl.cpp
@ -671,7 +671,28 @@ void FragmentGenerator::compileFog(std::string& shader, const PICA::FragmentConf
 	shader += "combinerOutput.rgb = mix(fog_color, combinerOutput.rgb, fog_factor);";
 }

-std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader) {
+std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader) {
+	// First, calculate output register -> Fixed function fragment semantics based on the VAO config
+	{
+		uint count = 0;
+		u16 outputMask = vertConfig.outputMask;
+		std::array<u8, 16> vsOutputRegisters;
+
+		// See which registers are actually enabled and ignore the disabled ones
+		for (int i = 0; i < 16; i++) {
+			if (outputMask & 1) {
+				vsOutputRegisters[count++] = i;
+			}
+
+			outputMask >>= 1;
+		}
+
+		// For the others, map the index to a vs output directly (TODO: What does hw actually do?)
+		for (; count < 16; count++) {
+			vsOutputRegisters[count] = count;
+		}
+	}
+
 	if (usingUbershader) {
 		Helpers::panic("Unimplemented: GetVertexShaderAccelerated for ubershader");
 		return picaSource;
@ -704,8 +725,8 @@ void main() {
 	float a_texcoord0_w = out_regs[2].w;
 	vec2 a_texcoord1 = out_regs[3].xy;
 	vec2 a_texcoord2 = out_regs[4].xy;
-	vec3 a_view = out_regs[5].xyz;
-	vec4 a_quaternion = out_regs[6];
+	vec3 a_view = out_regs[2].xyz;
+	vec4 a_quaternion = out_regs[3];

 	gl_Position = a_coords;
 	vec4 colourAbs = abs(a_vertexColour);
@ -722,7 +743,7 @@ void main() {
 	gl_ClipDistance[1] = dot(clipCoords, a_coords);
 #endif
 })";
-
+		std::cout << ret << "\n";
 		return ret;
 	}
 }
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@ -88,7 +88,7 @@ void RendererGL::initGraphicsContextInternal() {
 	gl.bindUBO(hwShaderUniformUBO);
 	glBufferData(GL_UNIFORM_BUFFER, PICAShader::totalUniformSize(), nullptr, GL_DYNAMIC_DRAW);

-	vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW);
+	vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize * 2, GL_STREAM_DRAW);
 	gl.bindVBO(vbo);
 	// Initialize the VAO used when not using hw shaders
 	defaultVAO.create();
@ -122,8 +122,8 @@ void RendererGL::initGraphicsContextInternal() {
 	// Initialize the VAO used for hw shaders
 	hwShaderVAO.create();
 	gl.bindVAO(hwShaderVAO);
-	for (int attr = 0; attr < 8; attr++) {
-		hwShaderVAO.setAttributeFloat<float>(attr, 4, sizeof(Vertex), attr * sizeof(float) * 4);
+	for (int attr = 0; attr < 16; attr++) {
+		hwShaderVAO.setAttributeFloat<float>(attr, 4, sizeof(Vertex) * 2, attr * sizeof(float) * 4);
 		hwShaderVAO.enableAttribute(attr);
 	}

@ -495,7 +495,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v

 	setupStencilTest(stencilEnable);

-	vbo.bufferVertsSub(vertices);
+	// If we're using hardware shaders, the vertex array works completely differently:
+	// instead of 8 vec4 attributes, each vertex is 16 vec4 attributes. We use a union + aliasing, which is not ideal for readability.
+	if (!usingAcceleratedShader) {
+		vbo.bufferVertsSub(vertices);
+	} else {
+		glBufferSubData(GL_ARRAY_BUFFER, 0, vertices.size_bytes() * 2, vertices.data());
+	}
+
 	OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
 }

@ -956,12 +963,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
 	if (usingAcceleratedShader) {
 		auto shaderCodeHash = shaderUnit.vs.getCodeHash();
 		auto opdescHash = shaderUnit.vs.getOpdescHash();
-		auto vertexConfig = PICA::VertConfig{
-			.shaderHash = shaderCodeHash,
-			.opdescHash = opdescHash,
-			.entrypoint = shaderUnit.vs.entrypoint,
-			.usingUbershader = usingUbershader,
-		};
+		PICA::VertConfig vertexConfig(shaderUnit.vs, regs, usingUbershader);

 		std::optional<OpenGL::Shader>& shader = shaderCache.vertexShaderCache[vertexConfig];
 		// If the optional is false, we have never tried to recompile the shader before. Try to recompile it and see if it works.
@ -976,7 +978,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
 			// Empty source means compilation error, if the source is not empty then we convert the recompiled PICA code into a valid shader and upload
 			// it to the GPU
 			if (!picaShaderSource.empty()) {
-				std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, usingUbershader);
+				std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, vertexConfig, usingUbershader);
 				shader->create({vertexShaderSource}, OpenGL::Vertex);
 			}
 		}