diff --git a/include/PICA/gpu.hpp b/include/PICA/gpu.hpp index 2336493c..61020f76 100644 --- a/include/PICA/gpu.hpp +++ b/include/PICA/gpu.hpp @@ -6,6 +6,7 @@ #include "PICA/pica_vertex.hpp" #include "PICA/regs.hpp" #include "PICA/shader_unit.hpp" +#include "compiler_builtins.hpp" #include "config.hpp" #include "helpers.hpp" #include "logger.hpp" @@ -35,6 +36,12 @@ class GPU { std::array immediateModeAttributes; // Vertex attributes uploaded via immediate mode submission std::array immediateModeVertices; + + // vsOutputRegisters[n] points at component 0 of the vertex shader output register that output slot n reads, as arranged after GPUREG_VSH_OUTMAP_MASK is applied (rebuilt by setVsOutputMask) + std::array vsOutputRegisters; + // Low 16 bits of the GPUREG_VSH_OUTMAP_MASK value that vsOutputRegisters was last built for + u32 oldVsOutputMask; + uint immediateModeVertIndex; uint immediateModeAttrIndex; // Index of the immediate mode attribute we're uploading @@ -167,4 +174,28 @@ class GPU { // We have them in the end of the struct for cache locality reasons. Tl;dr we want the more commonly used things to be packed in the start // Of the struct, instead of externalRegs being in the middle ExternalRegisters externalRegs; + + ALWAYS_INLINE void setVsOutputMask(u32 val) { + val &= 0xffff; + + // Only rebuild vsOutputRegisters when the masked value differs from the one the table was last built for + if (oldVsOutputMask != val) [[unlikely]] { + oldVsOutputMask = val; + + uint count = 0; + // Walk the 16 mask bits from bit 0 upwards: a set bit i puts vs output register i into the next free slot, a clear bit is skipped + for (int i = 0; i < 16; i++) { + if (val & 1) { + vsOutputRegisters[count++] = &shaderUnit.vs.outputs[i][0]; + } + + val >>= 1; + } + + // Slots left over after the enabled registers are packed get an identity mapping, slot n -> vs output n (TODO: What does hw actually do?) 
+ for (; count < 16; count++) { + vsOutputRegisters[count] = &shaderUnit.vs.outputs[count][0]; + } + } + } }; diff --git a/include/PICA/regs.hpp b/include/PICA/regs.hpp index 70cecf7b..4342ebe5 100644 --- a/include/PICA/regs.hpp +++ b/include/PICA/regs.hpp @@ -143,6 +143,7 @@ namespace PICA { VertexIntUniform3 = 0x2B4, VertexShaderEntrypoint = 0x2BA, + VertexShaderOutputMask = 0x2BD, VertexShaderTransferEnd = 0x2BF, VertexFloatUniformIndex = 0x2C0, VertexFloatUniformData0 = 0x2C1, diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index c0499382..a777d0a3 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -77,6 +77,9 @@ void GPU::reset() { fixedAttrBuff.fill(0); + oldVsOutputMask = 0; + setVsOutputMask(0xFFFF); + for (auto& e : attributeInfo) { e.offset = 0; e.size = 0; @@ -134,6 +137,8 @@ void GPU::drawArrays() { shaderJIT.prepare(shaderUnit.vs); } + setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]); + // Base address for vertex attributes // The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16; @@ -329,7 +334,7 @@ void GPU::drawArrays() { for (int j = 0; j < 4; j++) { // pls unroll const u32 mapping = (config >> (j * 8)) & 0x1F; - out.raw[mapping] = shaderUnit.vs.outputs[i][j]; + out.raw[mapping] = vsOutputRegisters[i][j]; } } } @@ -338,6 +343,8 @@ void GPU::drawArrays() { } PICA::Vertex GPU::getImmediateModeVertex() { + setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]); + PICA::Vertex v; const int totalAttrCount = (regs[PICA::InternalRegs::VertexShaderAttrNum] & 0xf) + 1; @@ -356,7 +363,7 @@ PICA::Vertex GPU::getImmediateModeVertex() { for (int j = 0; j < 4; j++) { // pls unroll const u32 mapping = (config >> (j * 8)) & 0x1F; - v.raw[mapping] = shaderUnit.vs.outputs[i][j]; + v.raw[mapping] = vsOutputRegisters[i][j]; } }