// Panda3DS/src/core/PICA/gpu.cpp

#include "PICA/gpu.hpp"
#include <array>
#include <bitset>
#include <cstddef>
#include <cstdio>
#include "PICA/float_types.hpp"
#include "PICA/regs.hpp"
#include "renderer_null/renderer_null.hpp"
#ifdef PANDA3DS_ENABLE_OPENGL
#include "renderer_gl/renderer_gl.hpp"
#endif
using namespace Floats;
// Note: When we have multiple backends, the GL state manager can stay here, and the constructor for the Vulkan-or-whatever renderer can simply ignore it.
// Thus, having our GLStateManager here does not negatively impact renderer-agnosticism.
GPU::GPU(Memory& mem, EmulatorConfig& config) : mem(mem), config(config) {
    vram = new u8[vramSize];
    mem.setVRAM(vram);  // Give the bus a pointer to our VRAM

    switch (config.rendererType) {
        case RendererType::Null: {
            renderer.reset(new RendererNull(*this, regs));
            break;
        }

#ifdef PANDA3DS_ENABLE_OPENGL
        case RendererType::OpenGL: {
            renderer.reset(new RendererGL(*this, regs));
            break;
        }
#endif

        default: {
            Helpers::panic("Invalid rendering backend index: %d", static_cast<s8>(config.rendererType));
            break;
        }
    }
}

void GPU::reset() {
    regs.fill(0);
    shaderUnit.reset();
    shaderJIT.reset();
    std::memset(vram, 0, vramSize);
    lightingLUT.fill(0);
    lightingLUTDirty = true;

    totalAttribCount = 0;
    fixedAttribMask = 0;
    fixedAttribIndex = 0;
    fixedAttribCount = 0;
    immediateModeAttrIndex = 0;
    immediateModeVertIndex = 0;
    fixedAttrBuff.fill(0);

    for (auto& e : attributeInfo) {
        e.offset = 0;
        e.size = 0;
        e.config1 = 0;
        e.config2 = 0;
    }

    renderer->reset();
}

// Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
// And whether we are going to use the shader JIT (second template parameter)
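// Lowering the runtime flags into template parameters here means the hot per-vertex loop below is
// specialized at compile time, instead of re-checking these booleans once per vertex.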
void GPU::drawArrays(bool indexed) {
    const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;

    if (indexed) {
        if (shaderJITEnabled)
            drawArrays<true, true>();
        else
            drawArrays<true, false>();
    } else {
        if (shaderJITEnabled)
            drawArrays<false, true>();
        else
            drawArrays<false, false>();
    }
}

static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;

template <bool indexed, bool useShaderJIT>
void GPU::drawArrays() {
    if constexpr (useShaderJIT) {
        shaderJIT.prepare(shaderUnit.vs);
    }

    // Base address for vertex attributes
    // The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible
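    // The register appears to store the base physical address in 8-byte units: the expression below is
    // equivalent to (reg << 3) with bit 0 of the register dropped, so e.g. a register value of 0x0300000
    // decodes to physical address 0x1800000, always 16-byte aligned.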
    const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16;
    const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];  // Total # of vertices to transfer

    // Configures the type of primitive and the number of vertex shader outputs
    const u32 primConfig = regs[PICA::InternalRegs::PrimitiveConfig];
    const PICA::PrimType primType = static_cast<PICA::PrimType>(Helpers::getBits<8, 2>(primConfig));
    if (vertexCount > Renderer::vertexBufferSize) Helpers::panic("[PICA] vertexCount > vertexBufferSize");

    if ((primType == PICA::PrimType::TriangleList && vertexCount % 3) || (primType == PICA::PrimType::TriangleStrip && vertexCount < 3) ||
        (primType == PICA::PrimType::TriangleFan && vertexCount < 3)) {
        Helpers::panic("Invalid vertex count for primitive. Type: %d, vert count: %d\n", static_cast<int>(primType), vertexCount);
    }

    // Get the configuration for the index buffer, used only for indexed drawing
    u32 indexBufferConfig = regs[PICA::InternalRegs::IndexBufferConfig];
    u32 indexBufferPointer = vertexBase + (indexBufferConfig & 0xfffffff);
    bool shortIndex = Helpers::getBit<31>(indexBufferConfig);  // Indicates whether vert indices are 16-bit or 8-bit

    // Stuff the global attribute config registers in one u64 to make attr parsing easier
    // TODO: Cache this when the vertex attribute format registers are written to
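    // Each attribute gets a 4-bit descriptor in this word, matching the decode in the fetch loop below:
    // the low 2 bits select the component type (0 = signed byte, 1 = unsigned byte, 2 = short, 3 = float)
    // and the high 2 bits hold the component count minus one. E.g. a descriptor of 0b1111 describes a
    // 4-component float attribute, such as a position vector.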
    u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32);

    if constexpr (!indexed) {
        u32 offset = regs[PICA::InternalRegs::VertexOffsetReg];
        log("PICA::DrawArrays(vertex count = %d, vertexOffset = %d)\n", vertexCount, offset);
    } else {
        log("PICA::DrawElements(vertex count = %d, index buffer config = %08X)\n", vertexCount, indexBufferConfig);
    }

    // Total number of input attributes to shader. Differs between GS and VS. Currently stubbed to the VS one, as we don't have geometry shaders.
    const u32 inputAttrCount = (regs[PICA::InternalRegs::VertexShaderInputBufferCfg] & 0xf) + 1;
    const u64 inputAttrCfg = getVertexShaderInputConfig();

    // When doing indexed rendering, we have a cache of vertices to avoid processing attributes and shaders for a single vertex many times
    constexpr bool vertexCacheEnabled = true;
    constexpr size_t vertexCacheSize = 64;
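    // The cache is effectively direct-mapped: a vertex index lands in slot (vertexIndex % vertexCacheSize),
    // so e.g. indices 3 and 67 compete for slot 3, and a collision simply evicts the older entry.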
    struct {
        std::bitset<vertexCacheSize> validBits{0};         // Shows which tags are valid. If the corresponding bit is 1, then there's an entry
        std::array<u32, vertexCacheSize> ids;              // IDs (ie indices of the cached vertices in the 3DS vertex buffer)
        std::array<u32, vertexCacheSize> bufferPositions;  // Positions of the cached vertices in our own vertex buffer
    } vertexCache;

    for (u32 i = 0; i < vertexCount; i++) {
        u32 vertexIndex;  // Index of the vertex in the VBO for indexed rendering

        if constexpr (!indexed) {
            vertexIndex = i + regs[PICA::InternalRegs::VertexOffsetReg];
        } else {
            if (shortIndex) {
                auto ptr = getPointerPhys<u16>(indexBufferPointer);
                vertexIndex = *ptr;  // TODO: This is very unsafe
                indexBufferPointer += 2;
            } else {
                auto ptr = getPointerPhys<u8>(indexBufferPointer);
                vertexIndex = *ptr;  // TODO: This is also very unsafe
                indexBufferPointer += 1;
            }
        }

        // Check if the vertex corresponding to the index is in cache
        if constexpr (indexed && vertexCacheEnabled) {
            auto& cache = vertexCache;
            size_t tag = vertexIndex % vertexCacheSize;

            // Cache hit
            if (cache.validBits[tag] && cache.ids[tag] == vertexIndex) {
                vertices[i] = vertices[cache.bufferPositions[tag]];
                continue;
            }

            // Cache miss. Set cache entry, fetch attributes and run shaders as normal
            else {
                cache.validBits[tag] = true;
                cache.ids[tag] = vertexIndex;
                cache.bufferPositions[tag] = i;
            }
        }
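
        // Attribute fetch: walk the attribute list, copying fixed attributes from the shader unit's
        // fixed-attribute registers and streaming everything else out of the interleaved vertex buffers in memory.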
        int attrCount = 0;
        int buffer = 0;  // Vertex buffer index for non-fixed attributes

        while (attrCount < totalAttribCount) {
            // Check if attribute is fixed or not
            if (fixedAttribMask & (1 << attrCount)) {  // Fixed attribute
                vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount];  // TODO: Is this how it works?
                vec4f& inputAttr = currentAttributes[attrCount];
                std::memcpy(&inputAttr, &fixedAttr, sizeof(vec4f));  // Copy fixed attr to input attr
                attrCount++;
            } else {  // Non-fixed attribute
                auto& attr = attributeInfo[buffer];  // Get information for this attribute
                u64 attrCfg = attr.getConfigFull();  // Get config1 | (config2 << 32)
                u32 attrAddress = vertexBase + attr.offset + (vertexIndex * attr.size);

                for (int j = 0; j < attr.componentCount; j++) {
                    uint index = (attrCfg >> (j * 4)) & 0xf;  // Get index of attribute in vertexCfg

                    // Vertex attributes used as padding
                    // 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively
                    if (index >= 12) [[unlikely]] {
                        // Align attribute address up to a 4 byte boundary
                        attrAddress = (attrAddress + 3) & -4;
                        attrAddress += (index - 11) << 2;
                        continue;
                    }

                    u32 attribInfo = (vertexCfg >> (index * 4)) & 0xf;
                    u32 attribType = attribInfo & 0x3;  // Type of attribute (sbyte/ubyte/short/float)
                    u32 size = (attribInfo >> 2) + 1;   // Total number of components
                    // printf("vertex_attribute_strides[%d] = %d\n", attrCount, attr.size);

                    vec4f& attribute = currentAttributes[attrCount];
                    uint component;  // Current component

                    switch (attribType) {
                        case 0: {  // Signed byte
                            s8* ptr = getPointerPhys<s8>(attrAddress);
                            for (component = 0; component < size; component++) {
                                float val = static_cast<float>(*ptr++);
                                attribute[component] = f24::fromFloat32(val);
                            }
                            attrAddress += size * sizeof(s8);
                            break;
                        }

                        case 1: {  // Unsigned byte
                            u8* ptr = getPointerPhys<u8>(attrAddress);
                            for (component = 0; component < size; component++) {
                                float val = static_cast<float>(*ptr++);
                                attribute[component] = f24::fromFloat32(val);
                            }
                            attrAddress += size * sizeof(u8);
                            break;
                        }

                        case 2: {  // Short
                            s16* ptr = getPointerPhys<s16>(attrAddress);
                            for (component = 0; component < size; component++) {
                                float val = static_cast<float>(*ptr++);
                                attribute[component] = f24::fromFloat32(val);
                            }
                            attrAddress += size * sizeof(s16);
                            break;
                        }

                        case 3: {  // Float
                            float* ptr = getPointerPhys<float>(attrAddress);
                            for (component = 0; component < size; component++) {
                                float val = *ptr++;
                                attribute[component] = f24::fromFloat32(val);
                            }
                            attrAddress += size * sizeof(float);
                            break;
                        }

                        default: Helpers::panic("[PICA] Unimplemented attribute type %d", attribType);
                    }

                    // Fill the remaining attribute lanes with default parameters (1.0 for alpha/w, 0.0 for everything else)
                    // Corgi does this although I'm not sure if it's actually needed for anything.
                    // TODO: Find out
                    while (component < 4) {
                        attribute[component] = (component == 3) ? f24::fromFloat32(1.0f) : f24::fromFloat32(0.0f);
                        component++;
                    }

                    attrCount++;
                }

                buffer++;
            }
        }

        // Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
        // based on the SH_ATTRIBUTES_PERMUTATION registers.
        // Ie it might map attribute #0 to v2, #1 to v7, etc
        for (int j = 0; j < totalAttribCount; j++) {
            const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
            std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
        }

        if constexpr (useShaderJIT) {
            shaderJIT.run(shaderUnit.vs);
        } else {
            shaderUnit.vs.run();
        }

        PICA::Vertex& out = vertices[i];

        // Map shader outputs to fixed function properties
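        // Each output map register packs four byte-wide selectors, one per lane of shader output register i;
        // the selector picks which slot of the output vertex (position, colour, texcoord, ...) that lane feeds.
        // E.g. a config of 0x03020100 routes lanes x/y/z/w of the output straight to vertex slots 0-3.
        // A selector of 0x1F reportedly marks an unused lane on hardware; this code doesn't special-case it yet.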
        const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
        for (u32 i = 0; i < totalShaderOutputs; i++) {
            const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i];

            for (int j = 0; j < 4; j++) {  // pls unroll
                const u32 mapping = (config >> (j * 8)) & 0x1F;
                out.raw[mapping] = shaderUnit.vs.outputs[i][j];
            }
        }
    }
renderer->drawVertices(primType, std::span(vertices).first(vertexCount));
}

PICA::Vertex GPU::getImmediateModeVertex() {
    PICA::Vertex v;
    const int totalAttrCount = (regs[PICA::InternalRegs::VertexShaderAttrNum] & 0xf) + 1;

    // Copy immediate mode attributes to vertex shader unit
    for (int i = 0; i < totalAttrCount; i++) {
        shaderUnit.vs.inputs[i] = immediateModeAttributes[i];
    }

    // Run VS and return vertex data. TODO: Don't hardcode offsets for each attribute
    shaderUnit.vs.run();
    std::memcpy(&v.s.positions, &shaderUnit.vs.outputs[0], sizeof(vec4f));
    std::memcpy(&v.s.colour, &shaderUnit.vs.outputs[1], sizeof(vec4f));
    std::memcpy(&v.s.texcoord0, &shaderUnit.vs.outputs[2], 2 * sizeof(f24));

    printf(
        "(x, y, z, w) = (%f, %f, %f, %f)\n", (double)v.s.positions[0], (double)v.s.positions[1], (double)v.s.positions[2], (double)v.s.positions[3]
    );
    printf("(r, g, b, a) = (%f, %f, %f, %f)\n", (double)v.s.colour[0], (double)v.s.colour[1], (double)v.s.colour[2], (double)v.s.colour[3]);
    printf("(u, v) = (%f, %f)\n", (double)v.s.texcoord0[0], (double)v.s.texcoord0[1]);

    return v;
}

void GPU::fireDMA(u32 dest, u32 source, u32 size) {
    log("[GPU] DMA of %08X bytes from %08X to %08X\n", size, source, dest);
    constexpr u32 vramStart = VirtualAddrs::VramStart;
    constexpr u32 vramSize = VirtualAddrs::VramSize;

    const u32 fcramStart = mem.getLinearHeapVaddr();
    constexpr u32 fcramSize = VirtualAddrs::FcramTotalSize;

    // Shows whether this transfer is an FCRAM->VRAM transfer that's trivially optimizable
    bool cpuToVRAM = true;

    if (dest - vramStart >= vramSize || size > (vramSize - (dest - vramStart))) [[unlikely]] {
        cpuToVRAM = false;
        Helpers::panic("GPU DMA does not target VRAM");
    }

    if (source - fcramStart >= fcramSize || size > (fcramSize - (source - fcramStart))) [[unlikely]] {
        cpuToVRAM = false;
        // Helpers::panic("GPU DMA does not have FCRAM as its source");
    }
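
    // The range checks above lean on unsigned wraparound: if an address sits below the region's start, the
    // subtraction wraps to a huge value and fails the >= size test, so a single comparison rejects addresses
    // on either side of the region; the size check then guards against the copy running past its end.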

    if (cpuToVRAM) [[likely]] {
        // Valid, optimized FCRAM->VRAM DMA. TODO: Is VRAM->VRAM DMA allowed?
        u8* fcram = mem.getFCRAM();
        std::memcpy(&vram[dest - vramStart], &fcram[source - fcramStart], size);
    } else {
        printf("Non-trivially optimizable GPU DMA. Falling back to byte-by-byte transfer\n");

        for (u32 i = 0; i < size; i++) {
            mem.write8(dest + i, mem.read8(source + i));
        }
    }
}