mirror of
https://github.com/wheremyfoodat/Panda3DS.git
synced 2025-04-11 08:39:48 +12:00
Disgusting changes to handle the fact that hw shader shaders are 2x as big
This commit is contained in:
parent
44705508ff
commit
37d7bad5aa
6 changed files with 89 additions and 24 deletions
|
@ -6,21 +6,39 @@
|
|||
|
||||
#include "PICA/pica_hash.hpp"
|
||||
#include "PICA/regs.hpp"
|
||||
#include "PICA/shader.hpp"
|
||||
#include "bitfield.hpp"
|
||||
#include "helpers.hpp"
|
||||
|
||||
namespace PICA {
|
||||
// Configuration struct used
|
||||
// Configuration struct used
|
||||
struct VertConfig {
|
||||
PICAHash::HashType shaderHash;
|
||||
PICAHash::HashType opdescHash;
|
||||
u32 entrypoint;
|
||||
|
||||
// PICA registers for configuring shader output->fragment semantic mapping
|
||||
std::array<u32, 7> outmaps{};
|
||||
u16 outputMask;
|
||||
u8 outputCount;
|
||||
bool usingUbershader;
|
||||
|
||||
bool operator==(const VertConfig& config) const {
|
||||
// Hash function and equality operator required by std::unordered_map
|
||||
return std::memcmp(this, &config, sizeof(VertConfig)) == 0;
|
||||
}
|
||||
|
||||
VertConfig(PICAShader& shader, const std::array<u32, 0x300>& regs, bool usingUbershader) : usingUbershader(usingUbershader) {
|
||||
shaderHash = shader.getCodeHash();
|
||||
opdescHash = shader.getOpdescHash();
|
||||
entrypoint = shader.entrypoint;
|
||||
|
||||
outputCount = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
|
||||
outputMask = regs[PICA::InternalRegs::VertexShaderOutputMask];
|
||||
for (int i = 0; i < outputCount; i++) {
|
||||
outputMask = regs[PICA::InternalRegs::ShaderOutmap0 + i];
|
||||
}
|
||||
}
|
||||
};
|
||||
} // namespace PICA
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
#include "PICA/gpu.hpp"
|
||||
#include "PICA/pica_frag_config.hpp"
|
||||
#include "PICA/pica_vert_config.hpp"
|
||||
#include "PICA/regs.hpp"
|
||||
#include "PICA/shader_gen_types.hpp"
|
||||
#include "helpers.hpp"
|
||||
|
@ -31,7 +32,7 @@ namespace PICA::ShaderGen {
|
|||
std::string generate(const PICA::FragmentConfig& config);
|
||||
std::string getDefaultVertexShader();
|
||||
// For when PICA shader acceleration is enabled. Turn the PICA shader source into a proper vertex shader
|
||||
std::string getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader);
|
||||
std::string getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader);
|
||||
|
||||
void setTarget(API api, Language language) {
|
||||
this->api = api;
|
||||
|
|
|
@ -150,7 +150,19 @@ void GPU::drawArrays(bool indexed) {
|
|||
}
|
||||
}
|
||||
|
||||
static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
|
||||
// We need a union here, because unfortunately in CPU shaders we only need to store the vertex shader outputs in the vertex buffer,
|
||||
// which consist of 8 vec4 attributes, while with GPU shaders we need to pass all the vertex shader inputs to the GPU, which consist
|
||||
// of 16 vec4 attributes
|
||||
union PICAVertexBuffer {
|
||||
// Used with CPU shaders
|
||||
std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
|
||||
// Used with GPU shaders. We can have up to 16 attributes per vertex, each attribute with 4 floats
|
||||
std::array<float, Renderer::vertexBufferSize * 16 * 4> vsInputs;
|
||||
|
||||
PICAVertexBuffer() {}
|
||||
};
|
||||
|
||||
static PICAVertexBuffer vertexBuffer;
|
||||
|
||||
template <bool indexed, ShaderExecMode mode>
|
||||
void GPU::drawArrays() {
|
||||
|
@ -158,6 +170,10 @@ void GPU::drawArrays() {
|
|||
shaderJIT.prepare(shaderUnit.vs);
|
||||
}
|
||||
|
||||
// We can have up to 16 attributes, each one consisting of 4 floats
|
||||
constexpr u32 maxAttrSizeInFloats = 16 * 4;
|
||||
auto& vertices = vertexBuffer.vertices;
|
||||
|
||||
setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]);
|
||||
|
||||
// Base address for vertex attributes
|
||||
|
@ -228,7 +244,14 @@ void GPU::drawArrays() {
|
|||
size_t tag = vertexIndex % vertexCacheSize;
|
||||
// Cache hit
|
||||
if (cache.validBits[tag] && cache.ids[tag] == vertexIndex) {
|
||||
vertices[i] = vertices[cache.bufferPositions[tag]];
|
||||
if constexpr (mode != ShaderExecMode::Hardware) {
|
||||
vertices[i] = vertices[cache.bufferPositions[tag]];
|
||||
} else {
|
||||
std::memcpy(
|
||||
&vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cache.bufferPositions[tag] * maxAttrSizeInFloats],
|
||||
sizeof(float) * maxAttrSizeInFloats
|
||||
);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -361,11 +384,11 @@ void GPU::drawArrays() {
|
|||
}
|
||||
}
|
||||
} else { // Using hw shaders and running the shader on the GPU, just write the inputs to the attribute buffer directly
|
||||
PICA::Vertex& out = vertices[i];
|
||||
float* out = &vertexBuffer.vsInputs[i * maxAttrSizeInFloats];
|
||||
for (int j = 0; j < totalAttribCount; j++) {
|
||||
const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
|
||||
// Multiply mapping * 4 as mapping refers to a vec4 whereas out.raw is an array of floats
|
||||
std::memcpy(&out.raw[mapping * 4], &currentAttributes[j], sizeof(vec4f));
|
||||
// Multiply mapping * 4 as mapping refers to a vec4 whereas out is an array of floats
|
||||
std::memcpy(&out[mapping * 4], &currentAttributes[j], sizeof(vec4f));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -160,7 +160,7 @@ const Function* ShaderDecompiler::findFunction(const AddressRange& range) {
|
|||
|
||||
void ShaderDecompiler::writeAttributes() {
|
||||
decompiledShader += R"(
|
||||
layout(location = 0) in vec4 inputs[8];
|
||||
layout(location = 0) in vec4 inputs[16];
|
||||
layout(std140) uniform PICAShaderUniforms {
|
||||
vec4 uniform_float[96];
|
||||
uvec4 uniform_int;
|
||||
|
@ -168,7 +168,7 @@ void ShaderDecompiler::writeAttributes() {
|
|||
};
|
||||
|
||||
vec4 tmp_regs[16];
|
||||
vec4 out_regs[8];
|
||||
vec4 out_regs[16];
|
||||
vec4 dummy_vec = vec4(0.0);
|
||||
bvec2 cmp_reg = bvec2(false);
|
||||
)";
|
||||
|
|
|
@ -671,7 +671,28 @@ void FragmentGenerator::compileFog(std::string& shader, const PICA::FragmentConf
|
|||
shader += "combinerOutput.rgb = mix(fog_color, combinerOutput.rgb, fog_factor);";
|
||||
}
|
||||
|
||||
std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader) {
|
||||
std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader) {
|
||||
// First, calculate output register -> Fixed function fragment semantics based on the VAO config
|
||||
{
|
||||
uint count = 0;
|
||||
u16 outputMask = vertConfig.outputMask;
|
||||
std::array<u8, 16> vsOutputRegisters;
|
||||
|
||||
// See which registers are actually enabled and ignore the disabled ones
|
||||
for (int i = 0; i < 16; i++) {
|
||||
if (outputMask & 1) {
|
||||
vsOutputRegisters[count++] = i;
|
||||
}
|
||||
|
||||
outputMask >>= 1;
|
||||
}
|
||||
|
||||
// For the others, map the index to a vs output directly (TODO: What does hw actually do?)
|
||||
for (; count < 16; count++) {
|
||||
vsOutputRegisters[count] = count;
|
||||
}
|
||||
}
|
||||
|
||||
if (usingUbershader) {
|
||||
Helpers::panic("Unimplemented: GetVertexShaderAccelerated for ubershader");
|
||||
return picaSource;
|
||||
|
@ -704,8 +725,8 @@ void main() {
|
|||
float a_texcoord0_w = out_regs[2].w;
|
||||
vec2 a_texcoord1 = out_regs[3].xy;
|
||||
vec2 a_texcoord2 = out_regs[4].xy;
|
||||
vec3 a_view = out_regs[5].xyz;
|
||||
vec4 a_quaternion = out_regs[6];
|
||||
vec3 a_view = out_regs[2].xyz;
|
||||
vec4 a_quaternion = out_regs[3];
|
||||
|
||||
gl_Position = a_coords;
|
||||
vec4 colourAbs = abs(a_vertexColour);
|
||||
|
@ -722,7 +743,7 @@ void main() {
|
|||
gl_ClipDistance[1] = dot(clipCoords, a_coords);
|
||||
#endif
|
||||
})";
|
||||
|
||||
std::cout << ret << "\n";
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -88,7 +88,7 @@ void RendererGL::initGraphicsContextInternal() {
|
|||
gl.bindUBO(hwShaderUniformUBO);
|
||||
glBufferData(GL_UNIFORM_BUFFER, PICAShader::totalUniformSize(), nullptr, GL_DYNAMIC_DRAW);
|
||||
|
||||
vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW);
|
||||
vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize * 2, GL_STREAM_DRAW);
|
||||
gl.bindVBO(vbo);
|
||||
// Initialize the VAO used when not using hw shaders
|
||||
defaultVAO.create();
|
||||
|
@ -122,8 +122,8 @@ void RendererGL::initGraphicsContextInternal() {
|
|||
// Initialize the VAO used for hw shaders
|
||||
hwShaderVAO.create();
|
||||
gl.bindVAO(hwShaderVAO);
|
||||
for (int attr = 0; attr < 8; attr++) {
|
||||
hwShaderVAO.setAttributeFloat<float>(attr, 4, sizeof(Vertex), attr * sizeof(float) * 4);
|
||||
for (int attr = 0; attr < 16; attr++) {
|
||||
hwShaderVAO.setAttributeFloat<float>(attr, 4, sizeof(Vertex) * 2, attr * sizeof(float) * 4);
|
||||
hwShaderVAO.enableAttribute(attr);
|
||||
}
|
||||
|
||||
|
@ -495,7 +495,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
|
|||
|
||||
setupStencilTest(stencilEnable);
|
||||
|
||||
vbo.bufferVertsSub(vertices);
|
||||
// If we're using hardware shaders, the vertex array works completely differently
|
||||
// And instead of 8 vec4 attributes, each vertex is 16 vec4 attributes. We use a union + aliasing which is not ideal for readability.
|
||||
if (!usingAcceleratedShader) {
|
||||
vbo.bufferVertsSub(vertices);
|
||||
} else {
|
||||
glBufferSubData(GL_ARRAY_BUFFER, 0, vertices.size_bytes() * 2, vertices.data());
|
||||
}
|
||||
|
||||
OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
|
||||
}
|
||||
|
||||
|
@ -956,12 +963,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
|
|||
if (usingAcceleratedShader) {
|
||||
auto shaderCodeHash = shaderUnit.vs.getCodeHash();
|
||||
auto opdescHash = shaderUnit.vs.getOpdescHash();
|
||||
auto vertexConfig = PICA::VertConfig{
|
||||
.shaderHash = shaderCodeHash,
|
||||
.opdescHash = opdescHash,
|
||||
.entrypoint = shaderUnit.vs.entrypoint,
|
||||
.usingUbershader = usingUbershader,
|
||||
};
|
||||
PICA::VertConfig vertexConfig(shaderUnit.vs, regs, usingUbershader);
|
||||
|
||||
std::optional<OpenGL::Shader>& shader = shaderCache.vertexShaderCache[vertexConfig];
|
||||
// If the optional is false, we have never tried to recompile the shader before. Try to recompile it and see if it works.
|
||||
|
@ -976,7 +978,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
|
|||
// Empty source means compilation error, if the source is not empty then we convert the recompiled PICA code into a valid shader and upload
|
||||
// it to the GPU
|
||||
if (!picaShaderSource.empty()) {
|
||||
std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, usingUbershader);
|
||||
std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, vertexConfig, usingUbershader);
|
||||
shader->create({vertexShaderSource}, OpenGL::Vertex);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue