From 9e32b6d4bf93b2f62808002905e0e59e96aa61b6 Mon Sep 17 00:00:00 2001 From: Wunkolo Date: Mon, 10 Jul 2023 08:55:23 -0700 Subject: [PATCH] Remove OpenGL-specific vector-types Removes dependency on the OpenGL header and rendering backend for its `OpenGL::Vector` type in favor of a more standard array. --- .../PICA/dynapica/shader_rec_emitter_x64.hpp | 36 +++-- include/PICA/shader.hpp | 133 ++++++++--------- src/core/PICA/regs.cpp | 131 ++++++++--------- src/core/PICA/shader_interpreter.cpp | 136 +++++++++--------- src/core/PICA/shader_unit.cpp | 9 +- 5 files changed, 221 insertions(+), 224 deletions(-) diff --git a/include/PICA/dynapica/shader_rec_emitter_x64.hpp b/include/PICA/dynapica/shader_rec_emitter_x64.hpp index ba37595a..109fddac 100644 --- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp +++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp @@ -2,17 +2,17 @@ // Only do anything if we're on an x64 target with JIT support enabled #if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_X64_HOST) -#include "helpers.hpp" -#include "logger.hpp" -#include "PICA/shader.hpp" -#include "xbyak/xbyak.h" -#include "xbyak/xbyak_util.h" -#include "x64_regs.hpp" - #include +#include "PICA/shader.hpp" +#include "helpers.hpp" +#include "logger.hpp" +#include "x64_regs.hpp" +#include "xbyak/xbyak.h" +#include "xbyak/xbyak_util.h" + class ShaderEmitter : public Xbyak::CodeGenerator { - static constexpr size_t executableMemorySize = PICAShader::maxInstructionCount * 96; // How much executable memory to alloc for each shader + static constexpr size_t executableMemorySize = PICAShader::maxInstructionCount * 96; // How much executable memory to alloc for each shader // Allocate some extra space as padding for security purposes in the extremely unlikely occasion we manage to overflow the above size static constexpr size_t allocSize = executableMemorySize + 0x1000; @@ -20,7 +20,7 @@ class ShaderEmitter : public Xbyak::CodeGenerator { static constexpr uint noSwizzle = 0x1B; 
using f24 = Floats::f24; - using vec4f = OpenGL::Vector; + using vec4f = std::array; // An array of labels (incl pointers) to each compiled (to x64) PICA instruction std::array instructionLabels; @@ -33,8 +33,8 @@ class ShaderEmitter : public Xbyak::CodeGenerator { // Vector value of (1.0, 1.0, 1.0, 1.0) for SLT(i)/SGE(i) Label onesVector; - u32 recompilerPC = 0; // PC the recompiler is currently recompiling @ - u32 loopLevel = 0; // The current loop nesting level (0 = not in a loop) + u32 recompilerPC = 0; // PC the recompiler is currently recompiling @ + u32 loopLevel = 0; // The current loop nesting level (0 = not in a loop) bool haveSSE4_1 = false; // Shows if the CPU supports SSE4.1 bool haveAVX = false; // Shows if the CPU supports AVX (NOT AVX2, NOT AVX512. Regular AVX) @@ -105,10 +105,10 @@ class ShaderEmitter : public Xbyak::CodeGenerator { MAKE_LOG_FUNCTION(log, shaderJITLogger) -public: - using InstructionCallback = const void(*)(PICAShader& shaderUnit); // Callback type used for instructions + public: + using InstructionCallback = const void (*)(PICAShader& shaderUnit); // Callback type used for instructions // Callback type used for the JIT prologue. This is what the caller will call - using PrologueCallback = const void(*)(PICAShader& shaderUnit, InstructionCallback cb); + using PrologueCallback = const void (*)(PICAShader& shaderUnit, InstructionCallback cb); PrologueCallback prologueCb = nullptr; // Initialize our emitter with "allocSize" bytes of RWX memory @@ -123,7 +123,7 @@ public: Helpers::panic("This CPU does not support SSE3. Please use the shader interpreter instead"); } } - + void compile(const PICAShader& shaderUnit); // PC must be a valid entrypoint here. 
It doesn't have that much overhead in this case, so we use std::array<>::at() to assert it does @@ -133,9 +133,7 @@ public: return reinterpret_cast(ptr); } - PrologueCallback getPrologueCallback() { - return prologueCb; - } + PrologueCallback getPrologueCallback() { return prologueCb; } }; -#endif // x64 recompiler check \ No newline at end of file +#endif // x64 recompiler check \ No newline at end of file diff --git a/include/PICA/shader.hpp b/include/PICA/shader.hpp index ad1e0e46..06d529c9 100644 --- a/include/PICA/shader.hpp +++ b/include/PICA/shader.hpp @@ -2,14 +2,12 @@ #include #include #include -#include "helpers.hpp" -#include "opengl.hpp" + #include "PICA/float_types.hpp" #include "PICA/pica_hash.hpp" +#include "helpers.hpp" -enum class ShaderType { - Vertex, Geometry -}; +enum class ShaderType { Vertex, Geometry }; namespace ShaderOpcodes { enum : u32 { @@ -46,66 +44,66 @@ namespace ShaderOpcodes { SETEMIT = 0x2B, JMPC = 0x2C, JMPU = 0x2D, - CMP1 = 0x2E, // Both of these instructions are CMP + CMP1 = 0x2E, // Both of these instructions are CMP CMP2 = 0x2F, - MAD = 0x38 // Everything between 0x38-0x3F is a MAD but fuck it + MAD = 0x38 // Everything between 0x38-0x3F is a MAD but fuck it }; } // Note: All PICA f24 vec4 registers must have the alignas(16) specifier to make them easier to access in SSE/NEON code in the JIT class PICAShader { using f24 = Floats::f24; - using vec4f = OpenGL::Vector; + using vec4f = std::array; struct Loop { - u32 startingPC; // PC at the start of the loop - u32 endingPC; // PC at the end of the loop - u32 iterations; // How many iterations of the loop to run - u32 increment; // How much to increment the loop counter after each iteration + u32 startingPC; // PC at the start of the loop + u32 endingPC; // PC at the end of the loop + u32 iterations; // How many iterations of the loop to run + u32 increment; // How much to increment the loop counter after each iteration }; // Info for ifc/ifu stack struct ConditionalInfo { - u32 
endingPC; // PC at the end of the if block (= DST) - u32 newPC; // PC after the if block is done executing (= DST + NUM) + u32 endingPC; // PC at the end of the if block (= DST) + u32 newPC; // PC after the if block is done executing (= DST + NUM) }; struct CallInfo { - u32 endingPC; // PC at the end of the function - u32 returnPC; // PC to return to after the function ends + u32 endingPC; // PC at the end of the function + u32 returnPC; // PC to return to after the function ends }; - int bufferIndex; // Index of the next instruction to overwrite for shader uploads - int opDescriptorIndex; // Index of the next operand descriptor we'll overwrite - u32 floatUniformIndex = 0; // Which float uniform are we writing to? ([0, 95] range) - u32 floatUniformWordCount = 0; // How many words have we buffered for the current uniform transfer? - bool f32UniformTransfer = false; // Are we transferring an f32 uniform or an f24 uniform? + int bufferIndex; // Index of the next instruction to overwrite for shader uploads + int opDescriptorIndex; // Index of the next operand descriptor we'll overwrite + u32 floatUniformIndex = 0; // Which float uniform are we writing to? ([0, 95] range) + u32 floatUniformWordCount = 0; // How many words have we buffered for the current uniform transfer? + bool f32UniformTransfer = false; // Are we transferring an f32 uniform or an f24 uniform? 
- std::array floatUniformBuffer; // Buffer for temporarily caching float uniform data + std::array floatUniformBuffer; // Buffer for temporarily caching float uniform data -public: + public: // These are placed close to the temp registers and co because it helps the JIT generate better code - u32 entrypoint = 0; // Initial shader PC + u32 entrypoint = 0; // Initial shader PC u32 boolUniform; - std::array, 4> intUniforms; + std::array, 4> intUniforms; alignas(16) std::array floatUniforms; - alignas(16) std::array fixedAttributes; // Fixed vertex attributes - alignas(16) std::array inputs; // Attributes passed to the shader + alignas(16) std::array fixedAttributes; // Fixed vertex attributes + alignas(16) std::array inputs; // Attributes passed to the shader alignas(16) std::array outputs; - alignas(16) vec4f dummy = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() }); // Dummy register used by the JIT + alignas(16) vec4f dummy = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()}); // Dummy register used by the JIT -protected: + protected: std::array operandDescriptors; - alignas(16) std::array tempRegisters; // General purpose registers the shader can use for temp values - OpenGL::Vector addrRegister; // Address register - bool cmpRegister[2]; // Comparison registers where the result of CMP is stored in + alignas(16) std::array tempRegisters; // General purpose registers the shader can use for temp values + std::array addrRegister; // Address register + bool cmpRegister[2]; // Comparison registers where the result of CMP is stored in u32 loopCounter; - u32 pc = 0; // Program counter: Index of the next instruction we're going to execute - u32 loopIndex = 0; // The index of our loop stack (0 = empty, 4 = full) - u32 ifIndex = 0; // The index of our IF stack - u32 callIndex = 0; // The index of our CALL stack + u32 pc = 0; // Program counter: Index of the next instruction we're going to execute + u32 loopIndex = 0; // The index of our loop stack (0 = 
empty, 4 = full) + u32 ifIndex = 0; // The index of our IF stack + u32 callIndex = 0; // The index of our CALL stack std::array loopInfo; std::array conditionalInfo; @@ -117,7 +115,7 @@ protected: // Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first using Hash = PICAHash::HashType; - Hash lastCodeHash = 0; // Last hash computed for the shader code (Used for the JIT caching mechanism) + Hash lastCodeHash = 0; // Last hash computed for the shader code (Used for the JIT caching mechanism) Hash lastOpdescHash = 0; // Last hash computed for the operand descriptors (Also used for the JIT) bool codeHashDirty = false; @@ -130,7 +128,7 @@ protected: vec4f getSource(u32 source); vec4f& getDest(u32 dest); -private: + private: // Interpreter functions for the various shader functions void add(u32 instruction); void call(u32 instruction); @@ -171,13 +169,13 @@ private: bool negate; using namespace Helpers; - if constexpr (sourceIndex == 1) { // SRC1 + if constexpr (sourceIndex == 1) { // SRC1 negate = (getBit<4>(opDescriptor)) != 0; compSwizzle = getBits<5, 8>(opDescriptor); - } else if constexpr (sourceIndex == 2) { // SRC2 + } else if constexpr (sourceIndex == 2) { // SRC2 negate = (getBit<13>(opDescriptor)) != 0; compSwizzle = getBits<14, 8>(opDescriptor); - } else if constexpr (sourceIndex == 3) { // SRC3 + } else if constexpr (sourceIndex == 3) { // SRC3 negate = (getBit<22>(opDescriptor)) != 0; compSwizzle = getBits<23, 8>(opDescriptor); } @@ -185,8 +183,8 @@ private: // Iterate through every component of the swizzled vector in reverse order // And get which source component's index to match it with for (int comp = 0; comp < 4; comp++) { - int index = compSwizzle & 3; // Get index for this component - compSwizzle >>= 2; // Move to next component index + int index = compSwizzle & 3; // Get index for this component + compSwizzle >>= 2; // Move to next component index ret[3 - comp] = 
source[index]; } @@ -212,39 +210,33 @@ private: u8 getIndexedSource(u32 source, u32 index); bool isCondTrue(u32 instruction); -public: + public: static constexpr size_t maxInstructionCount = 4096; - std::array loadedShader; // Currently loaded & active shader - std::array bufferedShader; // Shader to be transferred when the SH_CODETRANSFER_END reg gets written to + std::array loadedShader; // Currently loaded & active shader + std::array bufferedShader; // Shader to be transferred when the SH_CODETRANSFER_END reg gets written to PICAShader(ShaderType type) : type(type) {} // Theese functions are in the header to be inlined more easily, though with LTO I hope I'll be able to move them - void finalize() { - std::memcpy(&loadedShader[0], &bufferedShader[0], 4096 * sizeof(u32)); - } + void finalize() { std::memcpy(&loadedShader[0], &bufferedShader[0], 4096 * sizeof(u32)); } - void setBufferIndex(u32 index) { - bufferIndex = index & 0xfff; - } + void setBufferIndex(u32 index) { bufferIndex = index & 0xfff; } - void setOpDescriptorIndex(u32 index) { - opDescriptorIndex = index & 0x7f; - } + void setOpDescriptorIndex(u32 index) { opDescriptorIndex = index & 0x7f; } void uploadWord(u32 word) { if (bufferIndex >= 4095) Helpers::panic("o no, shader upload overflew"); bufferedShader[bufferIndex++] = word; bufferIndex &= 0xfff; - codeHashDirty = true; // Signal the JIT if necessary that the program hash has potentially changed + codeHashDirty = true; // Signal the JIT if necessary that the program hash has potentially changed } void uploadDescriptor(u32 word) { operandDescriptors[opDescriptorIndex++] = word; opDescriptorIndex &= 0x7f; - opdescHashDirty = true; // Signal the JIT if necessary that the program hash has potentially changed + opdescHashDirty = true; // Signal the JIT if necessary that the program hash has potentially changed } void setFloatUniformIndex(u32 word) { @@ -255,23 +247,22 @@ public: void uploadFloatUniform(u32 word) { 
floatUniformBuffer[floatUniformWordCount++] = word; - if (floatUniformIndex >= 96) - Helpers::panic("[PICA] Tried to write float uniform %d", floatUniformIndex); + if (floatUniformIndex >= 96) Helpers::panic("[PICA] Tried to write float uniform %d", floatUniformIndex); if ((f32UniformTransfer && floatUniformWordCount >= 4) || (!f32UniformTransfer && floatUniformWordCount >= 3)) { vec4f& uniform = floatUniforms[floatUniformIndex++]; floatUniformWordCount = 0; if (f32UniformTransfer) { - uniform.x() = f24::fromFloat32(*(float*)&floatUniformBuffer[3]); - uniform.y() = f24::fromFloat32(*(float*)&floatUniformBuffer[2]); - uniform.z() = f24::fromFloat32(*(float*)&floatUniformBuffer[1]); - uniform.w() = f24::fromFloat32(*(float*)&floatUniformBuffer[0]); + uniform[0] = f24::fromFloat32(*(float*)&floatUniformBuffer[3]); + uniform[1] = f24::fromFloat32(*(float*)&floatUniformBuffer[2]); + uniform[2] = f24::fromFloat32(*(float*)&floatUniformBuffer[1]); + uniform[3] = f24::fromFloat32(*(float*)&floatUniformBuffer[0]); } else { - uniform.x() = f24::fromRaw(floatUniformBuffer[2] & 0xffffff); - uniform.y() = f24::fromRaw(((floatUniformBuffer[1] & 0xffff) << 8) | (floatUniformBuffer[2] >> 24)); - uniform.z() = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16)); - uniform.w() = f24::fromRaw(floatUniformBuffer[0] >> 8); + uniform[0] = f24::fromRaw(floatUniformBuffer[2] & 0xffffff); + uniform[1] = f24::fromRaw(((floatUniformBuffer[1] & 0xffff) << 8) | (floatUniformBuffer[2] >> 24)); + uniform[2] = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16)); + uniform[3] = f24::fromRaw(floatUniformBuffer[0] >> 8); } } } @@ -280,10 +271,10 @@ public: using namespace Helpers; auto& u = intUniforms[index]; - u.x() = word & 0xff; - u.y() = getBits<8, 8>(word); - u.z() = getBits<16, 8>(word); - u.w() = getBits<24, 8>(word); + u[0] = word & 0xff; + u[1] = getBits<8, 8>(word); + u[2] = getBits<16, 8>(word); + u[3] = getBits<24, 
8>(word); } void run(); diff --git a/src/core/PICA/regs.cpp b/src/core/PICA/regs.cpp index f62040dd..bbffa99a 100644 --- a/src/core/PICA/regs.cpp +++ b/src/core/PICA/regs.cpp @@ -1,11 +1,12 @@ -#include "PICA/gpu.hpp" #include "PICA/regs.hpp" +#include "PICA/gpu.hpp" + using namespace Floats; using namespace Helpers; u32 GPU::readReg(u32 address) { - if (address >= 0x1EF01000 && address < 0x1EF01C00) { // Internal registers + if (address >= 0x1EF01000 && address < 0x1EF01C00) { // Internal registers const u32 index = (address - 0x1EF01000) / sizeof(u32); return readInternalReg(index); } else { @@ -15,7 +16,7 @@ u32 GPU::readReg(u32 address) { } void GPU::writeReg(u32 address, u32 value) { - if (address >= 0x1EF01000 && address < 0x1EF01C00) { // Internal registers + if (address >= 0x1EF01000 && address < 0x1EF01C00) { // Internal registers const u32 index = (address - 0x1EF01000) / sizeof(u32); writeInternalReg(index, value, 0xffffffff); } else { @@ -59,7 +60,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { } u32 currentValue = regs[index]; - u32 newValue = (currentValue & ~mask) | (value & mask); // Only overwrite the bits specified by "mask" + u32 newValue = (currentValue & ~mask) | (value & mask); // Only overwrite the bits specified by "mask" regs[index] = newValue; // TODO: Figure out if things like the shader index use the unmasked value or the masked one @@ -74,38 +75,38 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { break; case AttribFormatHigh: - totalAttribCount = (value >> 28) + 1; // Total number of vertex attributes - fixedAttribMask = getBits<16, 12>(value); // Determines which vertex attributes are fixed for all vertices + totalAttribCount = (value >> 28) + 1; // Total number of vertex attributes + fixedAttribMask = getBits<16, 12>(value); // Determines which vertex attributes are fixed for all vertices break; case ColourBufferLoc: { u32 loc = (value & 0x0fffffff) << 3; - renderer.setColourBufferLoc(loc); + 
renderer->setColourBufferLoc(loc); break; }; case ColourBufferFormat: { u32 format = getBits<16, 3>(value); - renderer.setColourFormat(static_cast(format)); + renderer->setColourFormat(static_cast(format)); break; } case DepthBufferLoc: { u32 loc = (value & 0x0fffffff) << 3; - renderer.setDepthBufferLoc(loc); + renderer->setDepthBufferLoc(loc); break; } case DepthBufferFormat: { u32 format = value & 0x3; - renderer.setDepthFormat(static_cast(format)); + renderer->setDepthFormat(static_cast(format)); break; } case FramebufferSize: { const u32 width = value & 0x7ff; const u32 height = getBits<12, 10>(value) + 1; - renderer.setFBSize(width, height); + renderer->setFBSize(width, height); break; } @@ -116,7 +117,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { case LightingLUTData4: case LightingLUTData5: case LightingLUTData6: - case LightingLUTData7:{ + case LightingLUTData7: { const uint32_t index = regs[LightingLUTIndex]; // Get full LUT index register const uint32_t lutID = getBits<8, 5>(index); // Get which LUT we're actually writing to uint32_t lutIndex = getBits<0, 8>(index); // And get the index inside the LUT we're writing to @@ -133,15 +134,16 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { break; } - case VertexFloatUniformIndex: - shaderUnit.vs.setFloatUniformIndex(value); - break; + case VertexFloatUniformIndex: shaderUnit.vs.setFloatUniformIndex(value); break; - case VertexFloatUniformData0: case VertexFloatUniformData1: case VertexFloatUniformData2: - case VertexFloatUniformData3: case VertexFloatUniformData4: case VertexFloatUniformData5: - case VertexFloatUniformData6: case VertexFloatUniformData7: - shaderUnit.vs.uploadFloatUniform(value); - break; + case VertexFloatUniformData0: + case VertexFloatUniformData1: + case VertexFloatUniformData2: + case VertexFloatUniformData3: + case VertexFloatUniformData4: + case VertexFloatUniformData5: + case VertexFloatUniformData6: + case VertexFloatUniformData7: 
shaderUnit.vs.uploadFloatUniform(value); break; case FixedAttribIndex: fixedAttribCount = 0; @@ -162,7 +164,9 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { } break; - case FixedAttribData0: case FixedAttribData1: case FixedAttribData2: + case FixedAttribData0: + case FixedAttribData1: + case FixedAttribData2: fixedAttrBuff[fixedAttribCount++] = value; if (fixedAttribCount == 3) { @@ -170,15 +174,15 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { vec4f attr; // These are stored in the reverse order anyone would expect them to be in - attr.x() = f24::fromRaw(fixedAttrBuff[2] & 0xffffff); - attr.y() = f24::fromRaw(((fixedAttrBuff[1] & 0xffff) << 8) | (fixedAttrBuff[2] >> 24)); - attr.z() = f24::fromRaw(((fixedAttrBuff[0] & 0xff) << 16) | (fixedAttrBuff[1] >> 16)); - attr.w() = f24::fromRaw(fixedAttrBuff[0] >> 8); + attr[0] = f24::fromRaw(fixedAttrBuff[2] & 0xffffff); + attr[1] = f24::fromRaw(((fixedAttrBuff[1] & 0xffff) << 8) | (fixedAttrBuff[2] >> 24)); + attr[2] = f24::fromRaw(((fixedAttrBuff[0] & 0xff) << 16) | (fixedAttrBuff[1] >> 16)); + attr[3] = f24::fromRaw(fixedAttrBuff[0] >> 8); // If the fixed attribute index is < 12, we're just writing to one of the fixed attributes if (fixedAttribIndex < 12) [[likely]] { shaderUnit.vs.fixedAttributes[fixedAttribIndex++] = attr; - } else if (fixedAttribIndex == 15) { // Otherwise if it's 15, we're submitting an immediate mode vertex + } else if (fixedAttribIndex == 15) { // Otherwise if it's 15, we're submitting an immediate mode vertex const uint totalAttrCount = (regs[PICA::InternalRegs::VertexShaderAttrNum] & 0xf) + 1; if (totalAttrCount <= immediateModeAttrIndex) { printf("Broken state in the immediate mode vertex submission pipeline. 
Failing silently\n"); @@ -199,13 +203,12 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { // If we've reached 3 verts, issue a draw call // Handle rendering depending on the primitive type if (immediateModeVertIndex == 3) { - renderer.drawVertices(PICA::PrimType::TriangleList, immediateModeVertices); + renderer->drawVertices(PICA::PrimType::TriangleList, immediateModeVertices); switch (primType) { // Triangle or geometry primitive. Draw a triangle and discard all vertices - case 0: case 3: - immediateModeVertIndex = 0; - break; + case 0: + case 3: immediateModeVertIndex = 0; break; // Triangle strip. Draw triangle, discard first vertex and keep the last 2 case 1: @@ -223,54 +226,54 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { } } } - } else { // Writing to fixed attributes 13 and 14 probably does nothing, but we'll see + } else { // Writing to fixed attributes 13 and 14 probably does nothing, but we'll see log("Wrote to invalid fixed vertex attribute %d\n", fixedAttribIndex); } } break; - case VertexShaderOpDescriptorIndex: - shaderUnit.vs.setOpDescriptorIndex(value); - break; + case VertexShaderOpDescriptorIndex: shaderUnit.vs.setOpDescriptorIndex(value); break; - case VertexShaderOpDescriptorData0: case VertexShaderOpDescriptorData1: case VertexShaderOpDescriptorData2: - case VertexShaderOpDescriptorData3: case VertexShaderOpDescriptorData4: case VertexShaderOpDescriptorData5: - case VertexShaderOpDescriptorData6: case VertexShaderOpDescriptorData7: - shaderUnit.vs.uploadDescriptor(value); - break; + case VertexShaderOpDescriptorData0: + case VertexShaderOpDescriptorData1: + case VertexShaderOpDescriptorData2: + case VertexShaderOpDescriptorData3: + case VertexShaderOpDescriptorData4: + case VertexShaderOpDescriptorData5: + case VertexShaderOpDescriptorData6: + case VertexShaderOpDescriptorData7: shaderUnit.vs.uploadDescriptor(value); break; - case VertexBoolUniform: - shaderUnit.vs.boolUniform = value & 0xffff; - break; + case 
VertexBoolUniform: shaderUnit.vs.boolUniform = value & 0xffff; break; - case VertexIntUniform0: case VertexIntUniform1: case VertexIntUniform2: case VertexIntUniform3: - shaderUnit.vs.uploadIntUniform(index - VertexIntUniform0, value); - break; + case VertexIntUniform0: + case VertexIntUniform1: + case VertexIntUniform2: + case VertexIntUniform3: shaderUnit.vs.uploadIntUniform(index - VertexIntUniform0, value); break; - case VertexShaderData0: case VertexShaderData1: case VertexShaderData2: case VertexShaderData3: - case VertexShaderData4: case VertexShaderData5: case VertexShaderData6: case VertexShaderData7: - shaderUnit.vs.uploadWord(value); - break; + case VertexShaderData0: + case VertexShaderData1: + case VertexShaderData2: + case VertexShaderData3: + case VertexShaderData4: + case VertexShaderData5: + case VertexShaderData6: + case VertexShaderData7: shaderUnit.vs.uploadWord(value); break; - case VertexShaderEntrypoint: - shaderUnit.vs.entrypoint = value & 0xffff; - break; + case VertexShaderEntrypoint: shaderUnit.vs.entrypoint = value & 0xffff; break; case VertexShaderTransferEnd: if (value != 0) shaderUnit.vs.finalize(); break; - case VertexShaderTransferIndex: - shaderUnit.vs.setBufferIndex(value); - break; + case VertexShaderTransferIndex: shaderUnit.vs.setBufferIndex(value); break; // Command lists can write to the command processor registers and change the command list stream // Several games are known to do this, including New Super Mario Bros 2 and Super Mario 3D Land case CmdBufTrigger0: case CmdBufTrigger1: { - if (value != 0) { // A non-zero value triggers command list processing - int bufferIndex = index - CmdBufTrigger0; // Index of the command buffer to execute (0 or 1) + if (value != 0) { // A non-zero value triggers command list processing + int bufferIndex = index - CmdBufTrigger0; // Index of the command buffer to execute (0 or 1) u32 addr = (regs[CmdBufAddr0 + bufferIndex] & 0xfffffff) << 3; u32 size = (regs[CmdBufSize0 + bufferIndex] & 
0xfffff) << 3; @@ -285,15 +288,13 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { default: // Vertex attribute registers if (index >= AttribInfoStart && index <= AttribInfoEnd) { - uint attributeIndex = (index - AttribInfoStart) / 3; // Which attribute are we writing to - uint reg = (index - AttribInfoStart) % 3; // Which of this attribute's registers are we writing to? + uint attributeIndex = (index - AttribInfoStart) / 3; // Which attribute are we writing to + uint reg = (index - AttribInfoStart) % 3; // Which of this attribute's registers are we writing to? auto& attr = attributeInfo[attributeIndex]; switch (reg) { - case 0: attr.offset = value & 0xfffffff; break; // Attribute offset - case 1: - attr.config1 = value; - break; + case 0: attr.offset = value & 0xfffffff; break; // Attribute offset + case 1: attr.config1 = value; break; case 2: attr.config2 = value; attr.size = getBits<16, 8>(value); @@ -339,13 +340,13 @@ void GPU::startCommandList(u32 addr, u32 size) { u32 id = header & 0xffff; u32 paramMaskIndex = getBits<16, 4>(header); - u32 paramCount = getBits<20, 8>(header); // Number of additional parameters + u32 paramCount = getBits<20, 8>(header); // Number of additional parameters // Bit 31 tells us whether this command is going to write to multiple sequential registers (if the bit is 1) // Or if all written values will go to the same register (If the bit is 0). It's essentially the value that // gets added to the "id" field after each register write bool consecutiveWritingMode = (header >> 31) != 0; - u32 mask = maskLUT[paramMaskIndex]; // Actual parameter mask + u32 mask = maskLUT[paramMaskIndex]; // Actual parameter mask // Increment the ID by 1 after each write if we're in consecutive mode, or 0 otherwise u32 idIncrement = (consecutiveWritingMode) ? 
1 : 0; diff --git a/src/core/PICA/shader_interpreter.cpp b/src/core/PICA/shader_interpreter.cpp index 7af284e3..28eee3c7 100644 --- a/src/core/PICA/shader_interpreter.cpp +++ b/src/core/PICA/shader_interpreter.cpp @@ -1,6 +1,7 @@ -#include "PICA/shader.hpp" #include +#include "PICA/shader.hpp" + using namespace Helpers; void PICAShader::run() { @@ -11,20 +12,19 @@ void PICAShader::run() { while (true) { const u32 instruction = loadedShader[pc++]; - const u32 opcode = instruction >> 26; // Top 6 bits are the opcode + const u32 opcode = instruction >> 26; // Top 6 bits are the opcode switch (opcode) { case ShaderOpcodes::ADD: add(instruction); break; case ShaderOpcodes::CALL: call(instruction); break; case ShaderOpcodes::CALLC: callc(instruction); break; case ShaderOpcodes::CALLU: callu(instruction); break; - case ShaderOpcodes::CMP1: case ShaderOpcodes::CMP2: - cmp(instruction); - break; + case ShaderOpcodes::CMP1: + case ShaderOpcodes::CMP2: cmp(instruction); break; case ShaderOpcodes::DP3: dp3(instruction); break; case ShaderOpcodes::DP4: dp4(instruction); break; case ShaderOpcodes::DPHI: dphi(instruction); break; - case ShaderOpcodes::END: return; // Stop running shader + case ShaderOpcodes::END: return; // Stop running shader case ShaderOpcodes::EX2: ex2(instruction); break; case ShaderOpcodes::FLR: flr(instruction); break; case ShaderOpcodes::IFC: ifc(instruction); break; @@ -38,31 +38,41 @@ void PICAShader::run() { case ShaderOpcodes::MOV: mov(instruction); break; case ShaderOpcodes::MOVA: mova(instruction); break; case ShaderOpcodes::MUL: mul(instruction); break; - case ShaderOpcodes::NOP: break; // Do nothing + case ShaderOpcodes::NOP: break; // Do nothing case ShaderOpcodes::RCP: rcp(instruction); break; case ShaderOpcodes::RSQ: rsq(instruction); break; case ShaderOpcodes::SGEI: sgei(instruction); break; case ShaderOpcodes::SLT: slt(instruction); break; case ShaderOpcodes::SLTI: slti(instruction); break; - case 0x30: case 0x31: case 0x32: case 0x33: case 
0x34: case 0x35: case 0x36: case 0x37: - madi(instruction); - break; + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: madi(instruction); break; - case 0x38: case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D: case 0x3E: case 0x3F: - mad(instruction); - break; + case 0x38: + case 0x39: + case 0x3A: + case 0x3B: + case 0x3C: + case 0x3D: + case 0x3E: + case 0x3F: mad(instruction); break; - default:Helpers::panic("Unimplemented PICA instruction %08X (Opcode = %02X)", instruction, opcode); + default: Helpers::panic("Unimplemented PICA instruction %08X (Opcode = %02X)", instruction, opcode); } // Handle control flow statements. The ordering is important as the priority goes: LOOP > IF > CALL // Handle loop if (loopIndex != 0) { auto& loop = loopInfo[loopIndex - 1]; - if (pc == loop.endingPC) { // Check if the loop needs to start over + if (pc == loop.endingPC) { // Check if the loop needs to start over loop.iterations -= 1; - if (loop.iterations == 0) // If the loop ended, go one level down on the loop stack + if (loop.iterations == 0) // If the loop ended, go one level down on the loop stack loopIndex -= 1; loopCounter += loop.increment; @@ -73,7 +83,7 @@ void PICAShader::run() { // Handle ifs if (ifIndex != 0) { auto& info = conditionalInfo[ifIndex - 1]; - if (pc == info.endingPC) { // Check if the IF block ended + if (pc == info.endingPC) { // Check if the IF block ended pc = info.newPC; ifIndex -= 1; } @@ -82,7 +92,7 @@ void PICAShader::run() { // Handle calls if (callIndex != 0) { auto& info = callInfo[callIndex - 1]; - if (pc == info.endingPC) { // Check if the CALL block ended + if (pc == info.endingPC) { // Check if the CALL block ended pc = info.returnPC; callIndex -= 1; } @@ -92,15 +102,15 @@ void PICAShader::run() { // Calculate the actual source value using an instruction's source field and it's respective index value // The index value is used to apply relative addressing when index != 0 by adding 
one of the 3 addr registers to the -// source field, but only with the original source field is pointing at a vector uniform register +// source field, but only with the original source field is pointing at a vector uniform register u8 PICAShader::getIndexedSource(u32 source, u32 index) { - if (source < 0x20) // No offset is applied if the source isn't pointing to a vector uniform reg + if (source < 0x20) // No offset is applied if the source isn't pointing to a vector uniform reg return source; switch (index) { - case 0: [[likely]] return u8(source); // No offset applied - case 1: return u8(source + addrRegister.x()); - case 2: return u8(source + addrRegister.y()); + case 0: [[likely]] return u8(source); // No offset applied + case 1: return u8(source + addrRegister[0]); + case 2: return u8(source + addrRegister[1]); case 3: return u8(source + loopCounter); } @@ -117,7 +127,7 @@ PICAShader::vec4f PICAShader::getSource(u32 source) { return floatUniforms[source - 0x20]; else { Helpers::warn("[PICA] Unimplemented source value: %X\n", source); - return vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() }); + return vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()}); } } @@ -136,13 +146,13 @@ bool PICAShader::isCondTrue(u32 instruction) { bool refX = (getBit<25>(instruction)) != 0; switch (condition) { - case 0: // Either cmp register matches + case 0: // Either cmp register matches return cmpRegister[0] == refX || cmpRegister[1] == refY; - case 1: // Both cmp registers match + case 1: // Both cmp registers match return cmpRegister[0] == refX && cmpRegister[1] == refY; - case 2: // At least cmp.x matches + case 2: // At least cmp.x matches return cmpRegister[0] == refX; - default: // At least cmp.y matches + default: // At least cmp.y matches return cmpRegister[1] == refY; } } @@ -150,7 +160,7 @@ bool PICAShader::isCondTrue(u32 instruction) { void PICAShader::add(u32 instruction) { const u32 operandDescriptor = operandDescriptors[instruction & 0x7f]; 
u32 src1 = getBits<12, 7>(instruction); - const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); @@ -171,7 +181,7 @@ void PICAShader::add(u32 instruction) { void PICAShader::mul(u32 instruction) { const u32 operandDescriptor = operandDescriptors[instruction & 0x7f]; u32 src1 = getBits<12, 7>(instruction); - const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); @@ -210,7 +220,7 @@ void PICAShader::flr(u32 instruction) { void PICAShader::max(u32 instruction) { const u32 operandDescriptor = operandDescriptors[instruction & 0x7f]; const u32 src1 = getBits<12, 7>(instruction); - const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); @@ -232,7 +242,7 @@ void PICAShader::max(u32 instruction) { void PICAShader::min(u32 instruction) { const u32 operandDescriptor = operandDescriptors[instruction & 0x7f]; const u32 src1 = getBits<12, 7>(instruction); - const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); @@ -278,16 +288,16 @@ void PICAShader::mova(u32 instruction) { vec4f srcVector = getSourceSwizzled<1>(src, operandDescriptor); u32 componentMask = operandDescriptor & 0xf; - if (componentMask & 0b1000) // x component - addrRegister.x() = 
static_cast(srcVector.x().toFloat32()); - if (componentMask & 0b0100) // y component - addrRegister.y() = static_cast(srcVector.y().toFloat32()); + if (componentMask & 0b1000) // x component + addrRegister[0] = static_cast(srcVector[0].toFloat32()); + if (componentMask & 0b0100) // y component + addrRegister[1] = static_cast(srcVector[1].toFloat32()); } void PICAShader::dp3(u32 instruction) { const u32 operandDescriptor = operandDescriptors[instruction & 0x7f]; u32 src1 = getBits<12, 7>(instruction); - const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); @@ -309,7 +319,7 @@ void PICAShader::dp3(u32 instruction) { void PICAShader::dp4(u32 instruction) { const u32 operandDescriptor = operandDescriptors[instruction & 0x7f]; u32 src1 = getBits<12, 7>(instruction); - const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); @@ -480,7 +490,7 @@ void PICAShader::madi(u32 instruction) { void PICAShader::slt(u32 instruction) { const u32 operandDescriptor = operandDescriptors[instruction & 0x7f]; u32 src1 = getBits<12, 7>(instruction); - const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment const u32 idx = getBits<19, 2>(instruction); const u32 dest = getBits<21, 5>(instruction); @@ -542,11 +552,11 @@ void PICAShader::slti(u32 instruction) { void PICAShader::cmp(u32 instruction) { const u32 operandDescriptor = operandDescriptors[instruction & 0x7f]; const u32 src1 = getBits<12, 7>(instruction); - const u32 src2 = getBits<7, 5>(instruction); // 
src2 coming first because PICA moment + const u32 src2 = getBits<7, 5>(instruction); // src2 coming first because PICA moment const u32 idx = getBits<19, 2>(instruction); const u32 cmpY = getBits<21, 3>(instruction); const u32 cmpX = getBits<24, 3>(instruction); - const u32 cmpOperations[2] = { cmpX, cmpY }; + const u32 cmpOperations[2] = {cmpX, cmpY}; if (idx) Helpers::panic("[PICA] CMP: idx != 0"); vec4f srcVec1 = getSourceSwizzled<1>(src1, operandDescriptor); @@ -554,33 +564,31 @@ void PICAShader::cmp(u32 instruction) { for (int i = 0; i < 2; i++) { switch (cmpOperations[i]) { - case 0: // Equal + case 0: // Equal cmpRegister[i] = srcVec1[i] == srcVec2[i]; break; - case 1: // Not equal + case 1: // Not equal cmpRegister[i] = srcVec1[i] != srcVec2[i]; break; - case 2: // Less than + case 2: // Less than cmpRegister[i] = srcVec1[i] < srcVec2[i]; break; - case 3: // Less than or equal + case 3: // Less than or equal cmpRegister[i] = srcVec1[i] <= srcVec2[i]; break; - case 4: // Greater than + case 4: // Greater than cmpRegister[i] = srcVec1[i] > srcVec2[i]; break; - case 5: // Greater than or equal + case 5: // Greater than or equal cmpRegister[i] = srcVec1[i] >= srcVec2[i]; break; - default: - cmpRegister[i] = true; - break; + default: cmpRegister[i] = true; break; } } } @@ -604,7 +612,7 @@ void PICAShader::ifc(u32 instruction) { void PICAShader::ifu(u32 instruction) { const u32 dest = getBits<10, 12>(instruction); - const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check + const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check if (boolUniform & (1 << bit)) { if (ifIndex >= 8) [[unlikely]] @@ -615,8 +623,7 @@ void PICAShader::ifu(u32 instruction) { auto& block = conditionalInfo[ifIndex++]; block.endingPC = dest; block.newPC = dest + num; - } - else { + } else { pc = dest; } } @@ -637,12 +644,12 @@ void PICAShader::call(u32 instruction) { void PICAShader::callc(u32 instruction) { if (isCondTrue(instruction)) { - 
call(instruction); // Pls inline + call(instruction); // Pls inline } } void PICAShader::callu(u32 instruction) { - const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check + const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check if (boolUniform & (1 << bit)) { if (callIndex >= 4) [[unlikely]] @@ -664,26 +671,25 @@ void PICAShader::loop(u32 instruction) { Helpers::panic("[PICA] Overflowed loop stack"); u32 dest = getBits<10, 12>(instruction); - auto& uniform = intUniforms[getBits<22, 2>(instruction)]; // The uniform we'll get loop info from - loopCounter = uniform.y(); + auto& uniform = intUniforms[getBits<22, 2>(instruction)]; // The uniform we'll get loop info from + loopCounter = uniform[1]; auto& loop = loopInfo[loopIndex++]; loop.startingPC = pc; - loop.endingPC = dest + 1; // Loop is inclusive so we need + 1 here - loop.iterations = uniform.x() + 1; - loop.increment = uniform.z(); + loop.endingPC = dest + 1; // Loop is inclusive so we need + 1 here + loop.iterations = uniform[0] + 1; + loop.increment = uniform[2]; } void PICAShader::jmpc(u32 instruction) { - if (isCondTrue(instruction)) - pc = getBits<10, 12>(instruction); + if (isCondTrue(instruction)) pc = getBits<10, 12>(instruction); } void PICAShader::jmpu(u32 instruction) { - const u32 test = (instruction & 1) ^ 1; // If the LSB is 0 we want to compare to true, otherwise compare to false + const u32 test = (instruction & 1) ^ 1; // If the LSB is 0 we want to compare to true, otherwise compare to false const u32 dest = getBits<10, 12>(instruction); - const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check + const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check - if (((boolUniform >> bit) & 1) == test) // Jump if the bool uniform is the value we want + if (((boolUniform >> bit) & 1) == test) // Jump if the bool uniform is the value we want pc = dest; } \ No newline at end of file diff --git 
a/src/core/PICA/shader_unit.cpp b/src/core/PICA/shader_unit.cpp index 6cbc2693..aa7b4c12 100644 --- a/src/core/PICA/shader_unit.cpp +++ b/src/core/PICA/shader_unit.cpp @@ -1,4 +1,5 @@ #include "PICA/shader_unit.hpp" + #include "cityhash.hpp" void ShaderUnit::reset() { @@ -18,18 +19,18 @@ void PICAShader::reset() { opDescriptorIndex = 0; f32UniformTransfer = false; - const vec4f zero = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() }); + const vec4f zero = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()}); inputs.fill(zero); floatUniforms.fill(zero); outputs.fill(zero); tempRegisters.fill(zero); for (auto& e : intUniforms) { - e.x() = e.y() = e.z() = e.w() = 0; + e[0] = e[1] = e[2] = e[3] = 0; } - addrRegister.x() = 0; - addrRegister.y() = 0; + addrRegister[0] = 0; + addrRegister[1] = 0; loopCounter = 0; codeHashDirty = true;