Remove OpenGL-specific vector-types

Removes the dependency on the OpenGL header and rendering backend for its
`OpenGL::Vector` type in favor of the more standard `std::array`.
This commit is contained in:
Wunkolo 2023-07-10 08:55:23 -07:00
parent 2a1683ba62
commit 9e32b6d4bf
5 changed files with 221 additions and 224 deletions

View file

@ -2,14 +2,14 @@
// Only do anything if we're on an x64 target with JIT support enabled // Only do anything if we're on an x64 target with JIT support enabled
#if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_X64_HOST) #if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_X64_HOST)
#include <vector>
#include "PICA/shader.hpp"
#include "helpers.hpp" #include "helpers.hpp"
#include "logger.hpp" #include "logger.hpp"
#include "PICA/shader.hpp" #include "x64_regs.hpp"
#include "xbyak/xbyak.h" #include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h" #include "xbyak/xbyak_util.h"
#include "x64_regs.hpp"
#include <vector>
class ShaderEmitter : public Xbyak::CodeGenerator { class ShaderEmitter : public Xbyak::CodeGenerator {
static constexpr size_t executableMemorySize = PICAShader::maxInstructionCount * 96; // How much executable memory to alloc for each shader static constexpr size_t executableMemorySize = PICAShader::maxInstructionCount * 96; // How much executable memory to alloc for each shader
@ -20,7 +20,7 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
static constexpr uint noSwizzle = 0x1B; static constexpr uint noSwizzle = 0x1B;
using f24 = Floats::f24; using f24 = Floats::f24;
using vec4f = OpenGL::Vector<f24, 4>; using vec4f = std::array<f24, 4>;
// An array of labels (incl pointers) to each compiled (to x64) PICA instruction // An array of labels (incl pointers) to each compiled (to x64) PICA instruction
std::array<Xbyak::Label, PICAShader::maxInstructionCount> instructionLabels; std::array<Xbyak::Label, PICAShader::maxInstructionCount> instructionLabels;
@ -105,10 +105,10 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
MAKE_LOG_FUNCTION(log, shaderJITLogger) MAKE_LOG_FUNCTION(log, shaderJITLogger)
public: public:
using InstructionCallback = const void(*)(PICAShader& shaderUnit); // Callback type used for instructions using InstructionCallback = const void (*)(PICAShader& shaderUnit); // Callback type used for instructions
// Callback type used for the JIT prologue. This is what the caller will call // Callback type used for the JIT prologue. This is what the caller will call
using PrologueCallback = const void(*)(PICAShader& shaderUnit, InstructionCallback cb); using PrologueCallback = const void (*)(PICAShader& shaderUnit, InstructionCallback cb);
PrologueCallback prologueCb = nullptr; PrologueCallback prologueCb = nullptr;
// Initialize our emitter with "allocSize" bytes of RWX memory // Initialize our emitter with "allocSize" bytes of RWX memory
@ -133,9 +133,7 @@ public:
return reinterpret_cast<InstructionCallback>(ptr); return reinterpret_cast<InstructionCallback>(ptr);
} }
PrologueCallback getPrologueCallback() { PrologueCallback getPrologueCallback() { return prologueCb; }
return prologueCb;
}
}; };
#endif // x64 recompiler check #endif // x64 recompiler check

View file

@ -2,14 +2,12 @@
#include <algorithm> #include <algorithm>
#include <array> #include <array>
#include <cstring> #include <cstring>
#include "helpers.hpp"
#include "opengl.hpp"
#include "PICA/float_types.hpp" #include "PICA/float_types.hpp"
#include "PICA/pica_hash.hpp" #include "PICA/pica_hash.hpp"
#include "helpers.hpp"
enum class ShaderType { enum class ShaderType { Vertex, Geometry };
Vertex, Geometry
};
namespace ShaderOpcodes { namespace ShaderOpcodes {
enum : u32 { enum : u32 {
@ -55,7 +53,7 @@ namespace ShaderOpcodes {
// Note: All PICA f24 vec4 registers must have the alignas(16) specifier to make them easier to access in SSE/NEON code in the JIT // Note: All PICA f24 vec4 registers must have the alignas(16) specifier to make them easier to access in SSE/NEON code in the JIT
class PICAShader { class PICAShader {
using f24 = Floats::f24; using f24 = Floats::f24;
using vec4f = OpenGL::Vector<f24, 4>; using vec4f = std::array<f24, 4>;
struct Loop { struct Loop {
u32 startingPC; // PC at the start of the loop u32 startingPC; // PC at the start of the loop
@ -83,22 +81,22 @@ class PICAShader {
std::array<u32, 4> floatUniformBuffer; // Buffer for temporarily caching float uniform data std::array<u32, 4> floatUniformBuffer; // Buffer for temporarily caching float uniform data
public: public:
// These are placed close to the temp registers and co because it helps the JIT generate better code // These are placed close to the temp registers and co because it helps the JIT generate better code
u32 entrypoint = 0; // Initial shader PC u32 entrypoint = 0; // Initial shader PC
u32 boolUniform; u32 boolUniform;
std::array<OpenGL::Vector<u8, 4>, 4> intUniforms; std::array<std::array<u8, 4>, 4> intUniforms;
alignas(16) std::array<vec4f, 96> floatUniforms; alignas(16) std::array<vec4f, 96> floatUniforms;
alignas(16) std::array<vec4f, 16> fixedAttributes; // Fixed vertex attributes alignas(16) std::array<vec4f, 16> fixedAttributes; // Fixed vertex attributes
alignas(16) std::array<vec4f, 16> inputs; // Attributes passed to the shader alignas(16) std::array<vec4f, 16> inputs; // Attributes passed to the shader
alignas(16) std::array<vec4f, 16> outputs; alignas(16) std::array<vec4f, 16> outputs;
alignas(16) vec4f dummy = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() }); // Dummy register used by the JIT alignas(16) vec4f dummy = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()}); // Dummy register used by the JIT
protected: protected:
std::array<u32, 128> operandDescriptors; std::array<u32, 128> operandDescriptors;
alignas(16) std::array<vec4f, 16> tempRegisters; // General purpose registers the shader can use for temp values alignas(16) std::array<vec4f, 16> tempRegisters; // General purpose registers the shader can use for temp values
OpenGL::Vector<s32, 2> addrRegister; // Address register std::array<s32, 2> addrRegister; // Address register
bool cmpRegister[2]; // Comparison registers where the result of CMP is stored in bool cmpRegister[2]; // Comparison registers where the result of CMP is stored in
u32 loopCounter; u32 loopCounter;
@ -130,7 +128,7 @@ protected:
vec4f getSource(u32 source); vec4f getSource(u32 source);
vec4f& getDest(u32 dest); vec4f& getDest(u32 dest);
private: private:
// Interpreter functions for the various shader functions // Interpreter functions for the various shader functions
void add(u32 instruction); void add(u32 instruction);
void call(u32 instruction); void call(u32 instruction);
@ -212,7 +210,7 @@ private:
u8 getIndexedSource(u32 source, u32 index); u8 getIndexedSource(u32 source, u32 index);
bool isCondTrue(u32 instruction); bool isCondTrue(u32 instruction);
public: public:
static constexpr size_t maxInstructionCount = 4096; static constexpr size_t maxInstructionCount = 4096;
std::array<u32, maxInstructionCount> loadedShader; // Currently loaded & active shader std::array<u32, maxInstructionCount> loadedShader; // Currently loaded & active shader
std::array<u32, maxInstructionCount> bufferedShader; // Shader to be transferred when the SH_CODETRANSFER_END reg gets written to std::array<u32, maxInstructionCount> bufferedShader; // Shader to be transferred when the SH_CODETRANSFER_END reg gets written to
@ -220,17 +218,11 @@ public:
PICAShader(ShaderType type) : type(type) {} PICAShader(ShaderType type) : type(type) {}
// Theese functions are in the header to be inlined more easily, though with LTO I hope I'll be able to move them // Theese functions are in the header to be inlined more easily, though with LTO I hope I'll be able to move them
void finalize() { void finalize() { std::memcpy(&loadedShader[0], &bufferedShader[0], 4096 * sizeof(u32)); }
std::memcpy(&loadedShader[0], &bufferedShader[0], 4096 * sizeof(u32));
}
void setBufferIndex(u32 index) { void setBufferIndex(u32 index) { bufferIndex = index & 0xfff; }
bufferIndex = index & 0xfff;
}
void setOpDescriptorIndex(u32 index) { void setOpDescriptorIndex(u32 index) { opDescriptorIndex = index & 0x7f; }
opDescriptorIndex = index & 0x7f;
}
void uploadWord(u32 word) { void uploadWord(u32 word) {
if (bufferIndex >= 4095) Helpers::panic("o no, shader upload overflew"); if (bufferIndex >= 4095) Helpers::panic("o no, shader upload overflew");
@ -255,23 +247,22 @@ public:
void uploadFloatUniform(u32 word) { void uploadFloatUniform(u32 word) {
floatUniformBuffer[floatUniformWordCount++] = word; floatUniformBuffer[floatUniformWordCount++] = word;
if (floatUniformIndex >= 96) if (floatUniformIndex >= 96) Helpers::panic("[PICA] Tried to write float uniform %d", floatUniformIndex);
Helpers::panic("[PICA] Tried to write float uniform %d", floatUniformIndex);
if ((f32UniformTransfer && floatUniformWordCount >= 4) || (!f32UniformTransfer && floatUniformWordCount >= 3)) { if ((f32UniformTransfer && floatUniformWordCount >= 4) || (!f32UniformTransfer && floatUniformWordCount >= 3)) {
vec4f& uniform = floatUniforms[floatUniformIndex++]; vec4f& uniform = floatUniforms[floatUniformIndex++];
floatUniformWordCount = 0; floatUniformWordCount = 0;
if (f32UniformTransfer) { if (f32UniformTransfer) {
uniform.x() = f24::fromFloat32(*(float*)&floatUniformBuffer[3]); uniform[0] = f24::fromFloat32(*(float*)&floatUniformBuffer[3]);
uniform.y() = f24::fromFloat32(*(float*)&floatUniformBuffer[2]); uniform[1] = f24::fromFloat32(*(float*)&floatUniformBuffer[2]);
uniform.z() = f24::fromFloat32(*(float*)&floatUniformBuffer[1]); uniform[2] = f24::fromFloat32(*(float*)&floatUniformBuffer[1]);
uniform.w() = f24::fromFloat32(*(float*)&floatUniformBuffer[0]); uniform[3] = f24::fromFloat32(*(float*)&floatUniformBuffer[0]);
} else { } else {
uniform.x() = f24::fromRaw(floatUniformBuffer[2] & 0xffffff); uniform[0] = f24::fromRaw(floatUniformBuffer[2] & 0xffffff);
uniform.y() = f24::fromRaw(((floatUniformBuffer[1] & 0xffff) << 8) | (floatUniformBuffer[2] >> 24)); uniform[1] = f24::fromRaw(((floatUniformBuffer[1] & 0xffff) << 8) | (floatUniformBuffer[2] >> 24));
uniform.z() = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16)); uniform[2] = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16));
uniform.w() = f24::fromRaw(floatUniformBuffer[0] >> 8); uniform[3] = f24::fromRaw(floatUniformBuffer[0] >> 8);
} }
} }
} }
@ -280,10 +271,10 @@ public:
using namespace Helpers; using namespace Helpers;
auto& u = intUniforms[index]; auto& u = intUniforms[index];
u.x() = word & 0xff; u[0] = word & 0xff;
u.y() = getBits<8, 8>(word); u[1] = getBits<8, 8>(word);
u.z() = getBits<16, 8>(word); u[2] = getBits<16, 8>(word);
u.w() = getBits<24, 8>(word); u[3] = getBits<24, 8>(word);
} }
void run(); void run();

View file

@ -1,6 +1,7 @@
#include "PICA/gpu.hpp"
#include "PICA/regs.hpp" #include "PICA/regs.hpp"
#include "PICA/gpu.hpp"
using namespace Floats; using namespace Floats;
using namespace Helpers; using namespace Helpers;
@ -80,32 +81,32 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
case ColourBufferLoc: { case ColourBufferLoc: {
u32 loc = (value & 0x0fffffff) << 3; u32 loc = (value & 0x0fffffff) << 3;
renderer.setColourBufferLoc(loc); renderer->setColourBufferLoc(loc);
break; break;
}; };
case ColourBufferFormat: { case ColourBufferFormat: {
u32 format = getBits<16, 3>(value); u32 format = getBits<16, 3>(value);
renderer.setColourFormat(static_cast<PICA::ColorFmt>(format)); renderer->setColourFormat(static_cast<PICA::ColorFmt>(format));
break; break;
} }
case DepthBufferLoc: { case DepthBufferLoc: {
u32 loc = (value & 0x0fffffff) << 3; u32 loc = (value & 0x0fffffff) << 3;
renderer.setDepthBufferLoc(loc); renderer->setDepthBufferLoc(loc);
break; break;
} }
case DepthBufferFormat: { case DepthBufferFormat: {
u32 format = value & 0x3; u32 format = value & 0x3;
renderer.setDepthFormat(static_cast<PICA::DepthFmt>(format)); renderer->setDepthFormat(static_cast<PICA::DepthFmt>(format));
break; break;
} }
case FramebufferSize: { case FramebufferSize: {
const u32 width = value & 0x7ff; const u32 width = value & 0x7ff;
const u32 height = getBits<12, 10>(value) + 1; const u32 height = getBits<12, 10>(value) + 1;
renderer.setFBSize(width, height); renderer->setFBSize(width, height);
break; break;
} }
@ -116,7 +117,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
case LightingLUTData4: case LightingLUTData4:
case LightingLUTData5: case LightingLUTData5:
case LightingLUTData6: case LightingLUTData6:
case LightingLUTData7:{ case LightingLUTData7: {
const uint32_t index = regs[LightingLUTIndex]; // Get full LUT index register const uint32_t index = regs[LightingLUTIndex]; // Get full LUT index register
const uint32_t lutID = getBits<8, 5>(index); // Get which LUT we're actually writing to const uint32_t lutID = getBits<8, 5>(index); // Get which LUT we're actually writing to
uint32_t lutIndex = getBits<0, 8>(index); // And get the index inside the LUT we're writing to uint32_t lutIndex = getBits<0, 8>(index); // And get the index inside the LUT we're writing to
@ -133,15 +134,16 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
break; break;
} }
case VertexFloatUniformIndex: case VertexFloatUniformIndex: shaderUnit.vs.setFloatUniformIndex(value); break;
shaderUnit.vs.setFloatUniformIndex(value);
break;
case VertexFloatUniformData0: case VertexFloatUniformData1: case VertexFloatUniformData2: case VertexFloatUniformData0:
case VertexFloatUniformData3: case VertexFloatUniformData4: case VertexFloatUniformData5: case VertexFloatUniformData1:
case VertexFloatUniformData6: case VertexFloatUniformData7: case VertexFloatUniformData2:
shaderUnit.vs.uploadFloatUniform(value); case VertexFloatUniformData3:
break; case VertexFloatUniformData4:
case VertexFloatUniformData5:
case VertexFloatUniformData6:
case VertexFloatUniformData7: shaderUnit.vs.uploadFloatUniform(value); break;
case FixedAttribIndex: case FixedAttribIndex:
fixedAttribCount = 0; fixedAttribCount = 0;
@ -162,7 +164,9 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
} }
break; break;
case FixedAttribData0: case FixedAttribData1: case FixedAttribData2: case FixedAttribData0:
case FixedAttribData1:
case FixedAttribData2:
fixedAttrBuff[fixedAttribCount++] = value; fixedAttrBuff[fixedAttribCount++] = value;
if (fixedAttribCount == 3) { if (fixedAttribCount == 3) {
@ -170,10 +174,10 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
vec4f attr; vec4f attr;
// These are stored in the reverse order anyone would expect them to be in // These are stored in the reverse order anyone would expect them to be in
attr.x() = f24::fromRaw(fixedAttrBuff[2] & 0xffffff); attr[0] = f24::fromRaw(fixedAttrBuff[2] & 0xffffff);
attr.y() = f24::fromRaw(((fixedAttrBuff[1] & 0xffff) << 8) | (fixedAttrBuff[2] >> 24)); attr[1] = f24::fromRaw(((fixedAttrBuff[1] & 0xffff) << 8) | (fixedAttrBuff[2] >> 24));
attr.z() = f24::fromRaw(((fixedAttrBuff[0] & 0xff) << 16) | (fixedAttrBuff[1] >> 16)); attr[2] = f24::fromRaw(((fixedAttrBuff[0] & 0xff) << 16) | (fixedAttrBuff[1] >> 16));
attr.w() = f24::fromRaw(fixedAttrBuff[0] >> 8); attr[3] = f24::fromRaw(fixedAttrBuff[0] >> 8);
// If the fixed attribute index is < 12, we're just writing to one of the fixed attributes // If the fixed attribute index is < 12, we're just writing to one of the fixed attributes
if (fixedAttribIndex < 12) [[likely]] { if (fixedAttribIndex < 12) [[likely]] {
@ -199,13 +203,12 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
// If we've reached 3 verts, issue a draw call // If we've reached 3 verts, issue a draw call
// Handle rendering depending on the primitive type // Handle rendering depending on the primitive type
if (immediateModeVertIndex == 3) { if (immediateModeVertIndex == 3) {
renderer.drawVertices(PICA::PrimType::TriangleList, immediateModeVertices); renderer->drawVertices(PICA::PrimType::TriangleList, immediateModeVertices);
switch (primType) { switch (primType) {
// Triangle or geometry primitive. Draw a triangle and discard all vertices // Triangle or geometry primitive. Draw a triangle and discard all vertices
case 0: case 3: case 0:
immediateModeVertIndex = 0; case 3: immediateModeVertIndex = 0; break;
break;
// Triangle strip. Draw triangle, discard first vertex and keep the last 2 // Triangle strip. Draw triangle, discard first vertex and keep the last 2
case 1: case 1:
@ -230,40 +233,40 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
break; break;
case VertexShaderOpDescriptorIndex: case VertexShaderOpDescriptorIndex: shaderUnit.vs.setOpDescriptorIndex(value); break;
shaderUnit.vs.setOpDescriptorIndex(value);
break;
case VertexShaderOpDescriptorData0: case VertexShaderOpDescriptorData1: case VertexShaderOpDescriptorData2: case VertexShaderOpDescriptorData0:
case VertexShaderOpDescriptorData3: case VertexShaderOpDescriptorData4: case VertexShaderOpDescriptorData5: case VertexShaderOpDescriptorData1:
case VertexShaderOpDescriptorData6: case VertexShaderOpDescriptorData7: case VertexShaderOpDescriptorData2:
shaderUnit.vs.uploadDescriptor(value); case VertexShaderOpDescriptorData3:
break; case VertexShaderOpDescriptorData4:
case VertexShaderOpDescriptorData5:
case VertexShaderOpDescriptorData6:
case VertexShaderOpDescriptorData7: shaderUnit.vs.uploadDescriptor(value); break;
case VertexBoolUniform: case VertexBoolUniform: shaderUnit.vs.boolUniform = value & 0xffff; break;
shaderUnit.vs.boolUniform = value & 0xffff;
break;
case VertexIntUniform0: case VertexIntUniform1: case VertexIntUniform2: case VertexIntUniform3: case VertexIntUniform0:
shaderUnit.vs.uploadIntUniform(index - VertexIntUniform0, value); case VertexIntUniform1:
break; case VertexIntUniform2:
case VertexIntUniform3: shaderUnit.vs.uploadIntUniform(index - VertexIntUniform0, value); break;
case VertexShaderData0: case VertexShaderData1: case VertexShaderData2: case VertexShaderData3: case VertexShaderData0:
case VertexShaderData4: case VertexShaderData5: case VertexShaderData6: case VertexShaderData7: case VertexShaderData1:
shaderUnit.vs.uploadWord(value); case VertexShaderData2:
break; case VertexShaderData3:
case VertexShaderData4:
case VertexShaderData5:
case VertexShaderData6:
case VertexShaderData7: shaderUnit.vs.uploadWord(value); break;
case VertexShaderEntrypoint: case VertexShaderEntrypoint: shaderUnit.vs.entrypoint = value & 0xffff; break;
shaderUnit.vs.entrypoint = value & 0xffff;
break;
case VertexShaderTransferEnd: case VertexShaderTransferEnd:
if (value != 0) shaderUnit.vs.finalize(); if (value != 0) shaderUnit.vs.finalize();
break; break;
case VertexShaderTransferIndex: case VertexShaderTransferIndex: shaderUnit.vs.setBufferIndex(value); break;
shaderUnit.vs.setBufferIndex(value);
break;
// Command lists can write to the command processor registers and change the command list stream // Command lists can write to the command processor registers and change the command list stream
// Several games are known to do this, including New Super Mario Bros 2 and Super Mario 3D Land // Several games are known to do this, including New Super Mario Bros 2 and Super Mario 3D Land
@ -291,9 +294,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
switch (reg) { switch (reg) {
case 0: attr.offset = value & 0xfffffff; break; // Attribute offset case 0: attr.offset = value & 0xfffffff; break; // Attribute offset
case 1: case 1: attr.config1 = value; break;
attr.config1 = value;
break;
case 2: case 2:
attr.config2 = value; attr.config2 = value;
attr.size = getBits<16, 8>(value); attr.size = getBits<16, 8>(value);

View file

@ -1,6 +1,7 @@
#include "PICA/shader.hpp"
#include <cmath> #include <cmath>
#include "PICA/shader.hpp"
using namespace Helpers; using namespace Helpers;
void PICAShader::run() { void PICAShader::run() {
@ -18,9 +19,8 @@ void PICAShader::run() {
case ShaderOpcodes::CALL: call(instruction); break; case ShaderOpcodes::CALL: call(instruction); break;
case ShaderOpcodes::CALLC: callc(instruction); break; case ShaderOpcodes::CALLC: callc(instruction); break;
case ShaderOpcodes::CALLU: callu(instruction); break; case ShaderOpcodes::CALLU: callu(instruction); break;
case ShaderOpcodes::CMP1: case ShaderOpcodes::CMP2: case ShaderOpcodes::CMP1:
cmp(instruction); case ShaderOpcodes::CMP2: cmp(instruction); break;
break;
case ShaderOpcodes::DP3: dp3(instruction); break; case ShaderOpcodes::DP3: dp3(instruction); break;
case ShaderOpcodes::DP4: dp4(instruction); break; case ShaderOpcodes::DP4: dp4(instruction); break;
case ShaderOpcodes::DPHI: dphi(instruction); break; case ShaderOpcodes::DPHI: dphi(instruction); break;
@ -45,15 +45,25 @@ void PICAShader::run() {
case ShaderOpcodes::SLT: slt(instruction); break; case ShaderOpcodes::SLT: slt(instruction); break;
case ShaderOpcodes::SLTI: slti(instruction); break; case ShaderOpcodes::SLTI: slti(instruction); break;
case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: case 0x35: case 0x36: case 0x37: case 0x30:
madi(instruction); case 0x31:
break; case 0x32:
case 0x33:
case 0x34:
case 0x35:
case 0x36:
case 0x37: madi(instruction); break;
case 0x38: case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D: case 0x3E: case 0x3F: case 0x38:
mad(instruction); case 0x39:
break; case 0x3A:
case 0x3B:
case 0x3C:
case 0x3D:
case 0x3E:
case 0x3F: mad(instruction); break;
default:Helpers::panic("Unimplemented PICA instruction %08X (Opcode = %02X)", instruction, opcode); default: Helpers::panic("Unimplemented PICA instruction %08X (Opcode = %02X)", instruction, opcode);
} }
// Handle control flow statements. The ordering is important as the priority goes: LOOP > IF > CALL // Handle control flow statements. The ordering is important as the priority goes: LOOP > IF > CALL
@ -99,8 +109,8 @@ u8 PICAShader::getIndexedSource(u32 source, u32 index) {
switch (index) { switch (index) {
case 0: [[likely]] return u8(source); // No offset applied case 0: [[likely]] return u8(source); // No offset applied
case 1: return u8(source + addrRegister.x()); case 1: return u8(source + addrRegister[0]);
case 2: return u8(source + addrRegister.y()); case 2: return u8(source + addrRegister[1]);
case 3: return u8(source + loopCounter); case 3: return u8(source + loopCounter);
} }
@ -117,7 +127,7 @@ PICAShader::vec4f PICAShader::getSource(u32 source) {
return floatUniforms[source - 0x20]; return floatUniforms[source - 0x20];
else { else {
Helpers::warn("[PICA] Unimplemented source value: %X\n", source); Helpers::warn("[PICA] Unimplemented source value: %X\n", source);
return vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() }); return vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()});
} }
} }
@ -279,9 +289,9 @@ void PICAShader::mova(u32 instruction) {
u32 componentMask = operandDescriptor & 0xf; u32 componentMask = operandDescriptor & 0xf;
if (componentMask & 0b1000) // x component if (componentMask & 0b1000) // x component
addrRegister.x() = static_cast<s32>(srcVector.x().toFloat32()); addrRegister[0] = static_cast<s32>(srcVector[0].toFloat32());
if (componentMask & 0b0100) // y component if (componentMask & 0b0100) // y component
addrRegister.y() = static_cast<s32>(srcVector.y().toFloat32()); addrRegister[1] = static_cast<s32>(srcVector[1].toFloat32());
} }
void PICAShader::dp3(u32 instruction) { void PICAShader::dp3(u32 instruction) {
@ -546,7 +556,7 @@ void PICAShader::cmp(u32 instruction) {
const u32 idx = getBits<19, 2>(instruction); const u32 idx = getBits<19, 2>(instruction);
const u32 cmpY = getBits<21, 3>(instruction); const u32 cmpY = getBits<21, 3>(instruction);
const u32 cmpX = getBits<24, 3>(instruction); const u32 cmpX = getBits<24, 3>(instruction);
const u32 cmpOperations[2] = { cmpX, cmpY }; const u32 cmpOperations[2] = {cmpX, cmpY};
if (idx) Helpers::panic("[PICA] CMP: idx != 0"); if (idx) Helpers::panic("[PICA] CMP: idx != 0");
vec4f srcVec1 = getSourceSwizzled<1>(src1, operandDescriptor); vec4f srcVec1 = getSourceSwizzled<1>(src1, operandDescriptor);
@ -578,9 +588,7 @@ void PICAShader::cmp(u32 instruction) {
cmpRegister[i] = srcVec1[i] >= srcVec2[i]; cmpRegister[i] = srcVec1[i] >= srcVec2[i];
break; break;
default: default: cmpRegister[i] = true; break;
cmpRegister[i] = true;
break;
} }
} }
} }
@ -615,8 +623,7 @@ void PICAShader::ifu(u32 instruction) {
auto& block = conditionalInfo[ifIndex++]; auto& block = conditionalInfo[ifIndex++];
block.endingPC = dest; block.endingPC = dest;
block.newPC = dest + num; block.newPC = dest + num;
} } else {
else {
pc = dest; pc = dest;
} }
} }
@ -665,18 +672,17 @@ void PICAShader::loop(u32 instruction) {
u32 dest = getBits<10, 12>(instruction); u32 dest = getBits<10, 12>(instruction);
auto& uniform = intUniforms[getBits<22, 2>(instruction)]; // The uniform we'll get loop info from auto& uniform = intUniforms[getBits<22, 2>(instruction)]; // The uniform we'll get loop info from
loopCounter = uniform.y(); loopCounter = uniform[1];
auto& loop = loopInfo[loopIndex++]; auto& loop = loopInfo[loopIndex++];
loop.startingPC = pc; loop.startingPC = pc;
loop.endingPC = dest + 1; // Loop is inclusive so we need + 1 here loop.endingPC = dest + 1; // Loop is inclusive so we need + 1 here
loop.iterations = uniform.x() + 1; loop.iterations = uniform[0] + 1;
loop.increment = uniform.z(); loop.increment = uniform[2];
} }
void PICAShader::jmpc(u32 instruction) { void PICAShader::jmpc(u32 instruction) {
if (isCondTrue(instruction)) if (isCondTrue(instruction)) pc = getBits<10, 12>(instruction);
pc = getBits<10, 12>(instruction);
} }
void PICAShader::jmpu(u32 instruction) { void PICAShader::jmpu(u32 instruction) {

View file

@ -1,4 +1,5 @@
#include "PICA/shader_unit.hpp" #include "PICA/shader_unit.hpp"
#include "cityhash.hpp" #include "cityhash.hpp"
void ShaderUnit::reset() { void ShaderUnit::reset() {
@ -18,18 +19,18 @@ void PICAShader::reset() {
opDescriptorIndex = 0; opDescriptorIndex = 0;
f32UniformTransfer = false; f32UniformTransfer = false;
const vec4f zero = vec4f({ f24::zero(), f24::zero(), f24::zero(), f24::zero() }); const vec4f zero = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()});
inputs.fill(zero); inputs.fill(zero);
floatUniforms.fill(zero); floatUniforms.fill(zero);
outputs.fill(zero); outputs.fill(zero);
tempRegisters.fill(zero); tempRegisters.fill(zero);
for (auto& e : intUniforms) { for (auto& e : intUniforms) {
e.x() = e.y() = e.z() = e.w() = 0; e[0] = e[1] = e[2] = e[3] = 0;
} }
addrRegister.x() = 0; addrRegister[0] = 0;
addrRegister.y() = 0; addrRegister[1] = 0;
loopCounter = 0; loopCounter = 0;
codeHashDirty = true; codeHashDirty = true;