mirror of
https://github.com/wheremyfoodat/Panda3DS.git
synced 2025-04-06 06:05:40 +12:00
* Renderer: Add prepareForDraw callback * Add fmt submodule and port shader decompiler instructions to it * Add shader acceleration setting * Hook up vertex shaders to shader cache * Shader decompiler: Fix redundant compilations * Shader Decompiler: Fix vertex attribute upload * Shader compiler: Simplify generated code for reading and faster compilation * Further simplify shader decompiler output * Shader decompiler: More smallen-ing * Shader decompiler: Get PICA uniforms uploaded to the GPU * Shader decompiler: Readd clipping * Shader decompiler: Actually `break` on control flow instructions * Shader decompiler: More control flow handling * Shader decompiler: Fix desitnation mask * Shader Decomp: Remove pair member capture in lambda (unsupported on NDK) * Disgusting changes to handle the fact that hw shader shaders are 2x as big * Shader decompiler: Implement proper output semantic mapping * Moar instructions * Shader decompiler: Add FLR/SLT/SLTI/SGE/SGEI * Shader decompiler: Add register indexing * Shader decompiler: Optimize mova with both x and y masked * Shader decompiler: Add DPH/DPHI * Fix shader caching being broken * PICA decompiler: Cache VS uniforms * Simply vertex cache code * Simplify vertex cache code * Shader decompiler: Add loops * Shader decompiler: Implement safe multiplication * Shader decompiler: Implement LG2/EX2 * Shader decompiler: More control flow * Shader decompiler: Fix JMPU condition * Shader decompiler: Convert main function to void * PICA: Start implementing GPU vertex fetch * More hw VAO work * More hw VAO work * More GPU vertex fetch code * Add GL Stream Buffer from Duckstation * GL: Actually upload data to stream buffers * GPU: Cleanup immediate mode handling * Get first renders working with accelerated draws * Shader decompiler: Fix control flow analysis bugs * HW shaders: Accelerate indexed draws * Shader decompiler: Add support for compilation errors * GLSL decompiler: Fall back for LITP * Add Renderdoc scope classes * Fix 
control flow analysis bug * HW shaders: Fix attribute fetch * Rewriting hw vertex fetch * Stream buffer: Fix copy-paste mistake * HW shaders: Fix indexed rendering * HW shaders: Add padding attributes * HW shaders: Avoid redundant glVertexAttrib4f calls * HW shaders: Fix loops * HW shaders: Make generated shaders slightly smaller * Fix libretro build * HW shaders: Fix android * Remove redundant ubershader checks * Set accelerate shader default to true * Shader decompiler: Don't declare VS input attributes as an array * Change ubuntu-latest to Ubuntu 24.04 because Microsoft screwed up their CI again * fix merge conflict bug
213 lines
7.9 KiB
C++
213 lines
7.9 KiB
C++
#pragma once
|
|
#include <array>
|
|
|
|
#include "PICA/draw_acceleration.hpp"
|
|
#include "PICA/dynapica/shader_rec.hpp"
|
|
#include "PICA/float_types.hpp"
|
|
#include "PICA/pica_vertex.hpp"
|
|
#include "PICA/regs.hpp"
|
|
#include "PICA/shader_unit.hpp"
|
|
#include "compiler_builtins.hpp"
|
|
#include "config.hpp"
|
|
#include "helpers.hpp"
|
|
#include "logger.hpp"
|
|
#include "memory.hpp"
|
|
#include "renderer.hpp"
|
|
|
|
// Selects the backend used to execute guest (PICA) shaders.
// Enumerator order is significant: the implicit values are 0, 1 and 2.
enum class ShaderExecMode {
    Interpreter,  // Interpret shaders on the CPU
    JIT,          // Recompile shaders to CPU machine code
    Hardware,     // Recompile shaders to host shaders and run them on the GPU
};
|
|
|
|
class GPU {
|
|
static constexpr u32 regNum = 0x300;
|
|
static constexpr u32 extRegNum = 0x1000;
|
|
|
|
using vec4f = std::array<Floats::f24, 4>;
|
|
using Registers = std::array<u32, regNum>; // Internal registers (named registers in short since they're the main ones)
|
|
using ExternalRegisters = std::array<u32, extRegNum>;
|
|
|
|
Memory& mem;
|
|
EmulatorConfig& config;
|
|
ShaderUnit shaderUnit;
|
|
ShaderJIT shaderJIT; // Doesn't do anything if JIT is disabled or not supported
|
|
|
|
u8* vram = nullptr;
|
|
MAKE_LOG_FUNCTION(log, gpuLogger)
|
|
|
|
static constexpr u32 maxAttribCount = 12; // Up to 12 vertex attributes
|
|
static constexpr u32 vramSize = u32(6_MB);
|
|
Registers regs; // GPU internal registers
|
|
std::array<vec4f, 16> currentAttributes; // Vertex attributes before being passed to the shader
|
|
|
|
std::array<vec4f, 16> immediateModeAttributes; // Vertex attributes uploaded via immediate mode submission
|
|
std::array<PICA::Vertex, 3> immediateModeVertices;
|
|
|
|
// Pointers for the output registers as arranged after GPUREG_VSH_OUTMAP_MASK is applied
|
|
std::array<Floats::f24*, 16> vsOutputRegisters;
|
|
// Previous value for GPUREG_VSH_OUTMAP_MASK
|
|
u32 oldVsOutputMask;
|
|
|
|
uint immediateModeVertIndex;
|
|
uint immediateModeAttrIndex; // Index of the immediate mode attribute we're uploading
|
|
|
|
template <bool indexed, ShaderExecMode mode>
|
|
void drawArrays();
|
|
|
|
// Silly method of avoiding linking problems. TODO: Change to something less silly
|
|
void drawArrays(bool indexed);
|
|
|
|
struct AttribInfo {
|
|
u32 offset = 0; // Offset from base vertex array
|
|
int size = 0; // Bytes per vertex
|
|
u32 config1 = 0;
|
|
u32 config2 = 0;
|
|
u32 componentCount = 0; // Number of components for the attribute
|
|
|
|
u64 getConfigFull() { return u64(config1) | (u64(config2) << 32); }
|
|
};
|
|
|
|
u64 getVertexShaderInputConfig() {
|
|
return u64(regs[PICA::InternalRegs::VertexShaderInputCfgLow]) | (u64(regs[PICA::InternalRegs::VertexShaderInputCfgHigh]) << 32);
|
|
}
|
|
|
|
std::array<AttribInfo, maxAttribCount> attributeInfo; // Info for each of the 12 attributes
|
|
u32 totalAttribCount = 0; // Number of vertex attributes to send to VS
|
|
u32 fixedAttribMask = 0; // Which attributes are fixed?
|
|
|
|
u32 fixedAttribIndex = 0; // Which fixed attribute are we writing to ([0, 11] range)
|
|
u32 fixedAttribCount = 0; // How many attribute components have we written? When we get to 4 the attr will actually get submitted
|
|
std::array<u32, 3> fixedAttrBuff; // Buffer to hold fixed attributes in until they get submitted
|
|
|
|
// Command processor pointers for GPU command lists
|
|
u32* cmdBuffStart = nullptr;
|
|
u32* cmdBuffEnd = nullptr;
|
|
u32* cmdBuffCurr = nullptr;
|
|
|
|
std::unique_ptr<Renderer> renderer;
|
|
PICA::Vertex getImmediateModeVertex();
|
|
|
|
void getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed);
|
|
public:
|
|
// 256 entries per LUT with each LUT as its own row forming a 2D image 256 * LUT_COUNT
|
|
// Encoded in PICA native format
|
|
static constexpr size_t LightingLutSize = PICA::Lights::LUT_Count * 256;
|
|
std::array<uint32_t, LightingLutSize> lightingLUT;
|
|
|
|
// Used to prevent uploading the lighting_lut on every draw call
|
|
// Set to true when the CPU writes to the lighting_lut
|
|
// Set to false by the renderer when the lighting_lut is uploaded ot the GPU
|
|
bool lightingLUTDirty = false;
|
|
|
|
bool fogLUTDirty = false;
|
|
std::array<uint32_t, 128> fogLUT;
|
|
|
|
GPU(Memory& mem, EmulatorConfig& config);
|
|
void display() { renderer->display(); }
|
|
void screenshot(const std::string& name) { renderer->screenshot(name); }
|
|
void deinitGraphicsContext() { renderer->deinitGraphicsContext(); }
|
|
|
|
#if defined(PANDA3DS_FRONTEND_SDL)
|
|
void initGraphicsContext(SDL_Window* window) { renderer->initGraphicsContext(window); }
|
|
#elif defined(PANDA3DS_FRONTEND_QT)
|
|
void initGraphicsContext(GL::Context* context) { renderer->initGraphicsContext(context); }
|
|
#endif
|
|
|
|
void fireDMA(u32 dest, u32 source, u32 size);
|
|
void reset();
|
|
|
|
Registers& getRegisters() { return regs; }
|
|
ExternalRegisters& getExtRegisters() { return externalRegs; }
|
|
void startCommandList(u32 addr, u32 size);
|
|
|
|
// Used by the GSP GPU service for readHwRegs/writeHwRegs/writeHwRegsMasked
|
|
u32 readReg(u32 address);
|
|
void writeReg(u32 address, u32 value);
|
|
|
|
u32 readExternalReg(u32 index);
|
|
void writeExternalReg(u32 index, u32 value);
|
|
|
|
// Used when processing GPU command lists
|
|
u32 readInternalReg(u32 index);
|
|
void writeInternalReg(u32 index, u32 value, u32 mask);
|
|
|
|
// Used for setting the size of the window we'll be outputting graphics to
|
|
void setOutputSize(u32 width, u32 height) { renderer->setOutputSize(width, height); }
|
|
|
|
// TODO: Emulate the transfer engine & its registers
|
|
// Then this can be emulated by just writing the appropriate values there
|
|
void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) { renderer->clearBuffer(startAddress, endAddress, value, control); }
|
|
|
|
// TODO: Emulate the transfer engine & its registers
|
|
// Then this can be emulated by just writing the appropriate values there
|
|
void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) {
|
|
renderer->displayTransfer(inputAddr, outputAddr, inputSize, outputSize, flags);
|
|
}
|
|
|
|
void textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) {
|
|
renderer->textureCopy(inputAddr, outputAddr, totalBytes, inputSize, outputSize, flags);
|
|
}
|
|
|
|
// Read a value of type T from physical address paddr
|
|
// This is necessary because vertex attribute fetching uses physical addresses
|
|
template <typename T>
|
|
T readPhysical(u32 paddr) {
|
|
if (paddr >= PhysicalAddrs::FCRAM && paddr <= PhysicalAddrs::FCRAMEnd) {
|
|
u8* fcram = mem.getFCRAM();
|
|
u32 index = paddr - PhysicalAddrs::FCRAM;
|
|
|
|
return *(T*)&fcram[index];
|
|
} else {
|
|
Helpers::panic("[PICA] Read unimplemented paddr %08X", paddr);
|
|
}
|
|
}
|
|
|
|
// Get a pointer of type T* to the data starting from physical address paddr
|
|
template <typename T>
|
|
T* getPointerPhys(u32 paddr, u32 size = 0) {
|
|
if (paddr >= PhysicalAddrs::FCRAM && paddr + size <= PhysicalAddrs::FCRAMEnd) {
|
|
u8* fcram = mem.getFCRAM();
|
|
u32 index = paddr - PhysicalAddrs::FCRAM;
|
|
|
|
return (T*)&fcram[index];
|
|
} else if (paddr >= PhysicalAddrs::VRAM && paddr + size <= PhysicalAddrs::VRAMEnd) {
|
|
u32 index = paddr - PhysicalAddrs::VRAM;
|
|
return (T*)&vram[index];
|
|
} else [[unlikely]] {
|
|
Helpers::warn("[GPU] Tried to access unknown physical address: %08X", paddr);
|
|
return nullptr;
|
|
}
|
|
}
|
|
|
|
Renderer* getRenderer() { return renderer.get(); }
|
|
private:
|
|
// GPU external registers
|
|
// We have them in the end of the struct for cache locality reasons. Tl;dr we want the more commonly used things to be packed in the start
|
|
// Of the struct, instead of externalRegs being in the middle
|
|
ExternalRegisters externalRegs;
|
|
|
|
ALWAYS_INLINE void setVsOutputMask(u32 val) {
|
|
val &= 0xffff;
|
|
|
|
// Avoid recomputing this if not necessary
|
|
if (oldVsOutputMask != val) [[unlikely]] {
|
|
oldVsOutputMask = val;
|
|
|
|
uint count = 0;
|
|
// See which registers are actually enabled and ignore the disabled ones
|
|
for (int i = 0; i < 16; i++) {
|
|
if (val & 1) {
|
|
vsOutputRegisters[count++] = &shaderUnit.vs.outputs[i][0];
|
|
}
|
|
|
|
val >>= 1;
|
|
}
|
|
|
|
// For the others, map the index to a vs output directly (TODO: What does hw actually do?)
|
|
for (; count < 16; count++) {
|
|
vsOutputRegisters[count] = &shaderUnit.vs.outputs[count][0];
|
|
}
|
|
}
|
|
}
|
|
};
|