#include "services/gsp_gpu.hpp"

#include "PICA/regs.hpp"
#include "ipc.hpp"
#include "kernel.hpp"

// Commands used with SendSyncRequest targeted to the GSP::GPU service
namespace ServiceCommands {
	enum : u32 {
		SetAxiConfigQoSMode = 0x00100040,
		AcquireRight = 0x00160042,
		RegisterInterruptRelayQueue = 0x00130042,
		WriteHwRegs = 0x00010082,
		WriteHwRegsWithMask = 0x00020084,
		SetBufferSwap = 0x00050200,
		FlushDataCache = 0x00080082,
		SetLCDForceBlack = 0x000B0040,
		TriggerCmdReqQueue = 0x000C0000,
		SetInternalPriorities = 0x001E0080,
		StoreDataCache = 0x001F0082
	};
}

// Commands written to shared memory and processed by TriggerCmdReqQueue
namespace GXCommands {
	enum : u32 {
		TriggerDMARequest = 0,
		ProcessCommandList = 1,
		MemoryFill = 2,
		TriggerDisplayTransfer = 3,
		TriggerTextureCopy = 4,
		FlushCacheRegions = 5
	};
}

// Restore the service to its initial state: no privileged process, no interrupt event,
// no registered GSP threads and no shared memory mapping.
void GPUService::reset() {
	privilegedProcess = 0xFFFFFFFF;  // Set the privileged process to an invalid handle
	interruptEvent = std::nullopt;
	gspThreadCount = 0;
	sharedMem = nullptr;
}

// Dispatch an IPC request. The command header is the first word of the message at messagePointer.
// Panics on any command that is not handled below.
void GPUService::handleSyncRequest(u32 messagePointer) {
	const u32 command = mem.read32(messagePointer);
	switch (command) {
		case ServiceCommands::AcquireRight: acquireRight(messagePointer); break;
		case ServiceCommands::FlushDataCache: flushDataCache(messagePointer); break;
		case ServiceCommands::RegisterInterruptRelayQueue: registerInterruptRelayQueue(messagePointer); break;
		case ServiceCommands::SetAxiConfigQoSMode: setAxiConfigQoSMode(messagePointer); break;
		case ServiceCommands::SetBufferSwap: setBufferSwap(messagePointer); break;
		case ServiceCommands::SetInternalPriorities: setInternalPriorities(messagePointer); break;
		case ServiceCommands::SetLCDForceBlack: setLCDForceBlack(messagePointer); break;
		case ServiceCommands::StoreDataCache: storeDataCache(messagePointer); break;
		case ServiceCommands::TriggerCmdReqQueue: [[likely]] triggerCmdReqQueue(messagePointer); break;
		case ServiceCommands::WriteHwRegs: writeHwRegs(messagePointer); break;
		case ServiceCommands::WriteHwRegsWithMask: writeHwRegsWithMask(messagePointer); break;
		default: Helpers::panic("GPU service requested. Command: %08X\n", command);
	}
}

// Record which process holds GPU rights.
// Reads the flag from message offset 4 and the process handle from offset 12.
// A non-zero flag is not supported and panics.
void GPUService::acquireRight(u32 messagePointer) {
	const u32 flag = mem.read32(messagePointer + 4);
	const u32 pid = mem.read32(messagePointer + 12);
	log("GSP::GPU::AcquireRight (flag = %X, pid = %X)\n", flag, pid);

	if (flag != 0) {
		Helpers::panic("GSP::GPU::acquireRight with flag != 0 needs to perform additional initialization");
	}

	// The "current process" pseudo-handle is resolved to the actual PID before being stored
	if (pid == KernelHandles::CurrentProcess) {
		privilegedProcess = currentPID;
	} else {
		privilegedProcess = pid;
	}

	mem.write32(messagePointer, IPC::responseHeader(0x16, 1, 0));
	mem.write32(messagePointer + 4, Result::Success);
}

// TODO: What is the flags field meant to be?
// What is the "GSP module thread index" meant to be?
// How does the shared memory handle thing work?
//
// Register the event that gets signalled on GPU interrupts (handle at message offset 12) and
// reply with the GSP shared memory handle. Panics if called more than once or if the handle
// does not refer to an event object.
void GPUService::registerInterruptRelayQueue(u32 messagePointer) {
	// Detect if this function is called a 2nd time because we'll likely need to impl threads properly for the GSP
	if (gspThreadCount >= 1) {
		Helpers::panic("RegisterInterruptRelayQueue called a second time. Need to implement GSP threads properly");
	}
	gspThreadCount += 1;

	const u32 flags = mem.read32(messagePointer + 4);
	const u32 eventHandle = mem.read32(messagePointer + 12);
	log("GSP::GPU::RegisterInterruptRelayQueue (flags = %X, event handle = %X)\n", flags, eventHandle);

	const auto event = kernel.getObject(eventHandle, KernelObjectType::Event);
	if (event == nullptr) {  // Check if interrupt event is invalid
		Helpers::panic("Invalid event passed to GSP::GPU::RegisterInterruptRelayQueue");
	} else {
		interruptEvent = eventHandle;
	}

	mem.write32(messagePointer, IPC::responseHeader(0x13, 2, 2));
	mem.write32(messagePointer + 4, Result::GSP::SuccessRegisterIRQ);  // First init returns a unique result
	mem.write32(messagePointer + 8, 0);                                // TODO: GSP module thread index
	mem.write32(messagePointer + 12, 0);                               // Translation descriptor
	mem.write32(messagePointer + 16, KernelHandles::GSPSharedMemHandle);
}

// Queue an interrupt of the given type in GSP shared memory and signal the registered interrupt event.
// For VBlank interrupts, also applies any pending framebuffer update found in shared memory.
// Does nothing beyond signalling DSP events if shared memory has not been set up.
void GPUService::requestInterrupt(GPUInterrupt type) {
	// HACK: Signal DSP events on GPU interrupt for now until we have the DSP since games need DSP events
	// Maybe there's a better alternative?
	kernel.signalDSPEvents();

	if (sharedMem == nullptr) [[unlikely]] {  // Shared memory hasn't been set up yet
		return;
	}

	// TODO: Add support for multiple GSP threads
	u8 index = sharedMem[0];  // The interrupt block is normally located at sharedMem + processGSPIndex*0x40
	u8& interruptCount = sharedMem[1];
	u8 flagIndex = (index + interruptCount) % 0x34;
	interruptCount++;

	sharedMem[2] = 0;                                    // Set error code to 0
	sharedMem[0xC + flagIndex] = static_cast<u8>(type);  // Write interrupt type to queue

	// Update framebuffer info in shared memory
	// Most new games check to make sure that the "flag" byte of the framebuffer info header is set to 0
	// Not emulating this causes Yoshi's Wooly World, Captain Toad, Metroid 2 et al to hang
	if (type == GPUInterrupt::VBlank0 || type == GPUInterrupt::VBlank1) {
		int screen = static_cast<int>(type) - static_cast<int>(GPUInterrupt::VBlank0);  // 0 for top screen, 1 for bottom
		// TODO: Offset depends on GSP thread being triggered
		FramebufferUpdate* update = reinterpret_cast<FramebufferUpdate*>(&sharedMem[0x200 + screen * sizeof(FramebufferUpdate)]);

		if (update->dirtyFlag & 1) {
			setBufferSwapImpl(screen, update->framebufferInfo[update->index]);
			update->dirtyFlag &= ~1;
		}
	}

	// Signal interrupt event
	if (interruptEvent.has_value()) {
		kernel.signalEvent(interruptEvent.value());
	}
}

// Write a sequence of words from guest memory to consecutive GPU registers.
// Message layout: offset 4 = register offset, offset 8 = size in bytes, offset 16 = data pointer.
// Panics on misaligned arguments, on sizes above 0x80 bytes and on offsets >= 0x420000.
void GPUService::writeHwRegs(u32 messagePointer) {
	u32 ioAddr = mem.read32(messagePointer + 4);      // GPU address based at 0x1EB00000, word aligned
	const u32 size = mem.read32(messagePointer + 8);  // Size in bytes
	u32 dataPointer = mem.read32(messagePointer + 16);
	log("GSP::GPU::writeHwRegs (GPU address = %08X, size = %X, data address = %08X)\n", ioAddr, size, dataPointer);

	// Check for alignment
	if ((size & 3) || (ioAddr & 3) || (dataPointer & 3)) {
		Helpers::panic("GSP::GPU::writeHwRegs misalignment");
	}

	if (size > 0x80) {
		Helpers::panic("GSP::GPU::writeHwRegs size too big");
	}

	if (ioAddr >= 0x420000) {
		Helpers::panic("GSP::GPU::writeHwRegs offset too big");
	}

	ioAddr += 0x1EB00000;
	for (u32 i = 0; i < size; i += 4) {
		const u32 value = mem.read32(dataPointer);
		gpu.writeReg(ioAddr, value);
		dataPointer += 4;
		ioAddr += 4;
	}

	mem.write32(messagePointer, IPC::responseHeader(0x1, 1, 0));
	mem.write32(messagePointer + 4, Result::Success);
}

// Update sequential GPU registers using an array of data and mask values using this formula
// GPU register = (register & ~mask) | (data & mask).
// Message layout: offset 4 = register offset, offset 8 = size in bytes, offset 16 = data pointer,
// offset 24 = mask pointer. Performs the same argument validation as writeHwRegs.
void GPUService::writeHwRegsWithMask(u32 messagePointer) {
	u32 ioAddr = mem.read32(messagePointer + 4);        // GPU address based at 0x1EB00000, word aligned
	const u32 size = mem.read32(messagePointer + 8);    // Size in bytes
	u32 dataPointer = mem.read32(messagePointer + 16);  // Data pointer
	u32 maskPointer = mem.read32(messagePointer + 24);  // Mask pointer

	log("GSP::GPU::writeHwRegsWithMask (GPU address = %08X, size = %X, data address = %08X, mask address = %08X)\n", ioAddr, size, dataPointer,
		maskPointer);

	// Check for alignment
	if ((size & 3) || (ioAddr & 3) || (dataPointer & 3) || (maskPointer & 3)) {
		Helpers::panic("GSP::GPU::writeHwRegsWithMask misalignment");
	}

	if (size > 0x80) {
		Helpers::panic("GSP::GPU::writeHwRegsWithMask size too big");
	}

	if (ioAddr >= 0x420000) {
		Helpers::panic("GSP::GPU::writeHwRegsWithMask offset too big");
	}

	ioAddr += 0x1EB00000;
	for (u32 i = 0; i < size; i += 4) {
		const u32 current = gpu.readReg(ioAddr);
		const u32 data = mem.read32(dataPointer);
		const u32 mask = mem.read32(maskPointer);

		// Only the bits selected by the mask are replaced; the rest keep their current value
		u32 newValue = (current & ~mask) | (data & mask);
		gpu.writeReg(ioAddr, newValue);

		maskPointer += 4;
		dataPointer += 4;
		ioAddr += 4;
	}

	mem.write32(messagePointer, IPC::responseHeader(0x2, 1, 0));
	mem.write32(messagePointer + 4, Result::Success);
}

// Stubbed: logs the request and reports success without touching any cache state.
void GPUService::flushDataCache(u32 messagePointer) {
	const u32 address = mem.read32(messagePointer + 4);
	const u32 size = mem.read32(messagePointer + 8);
	// Only a local copy of the process handle is needed for logging. Previously this was a chained
	// assignment that also overwrote the non-local `handle` with a guest-supplied value.
	const u32 processHandle = mem.read32(messagePointer + 16);
	log("GSP::GPU::FlushDataCache(address = %08X, size = %X, process = %X)\n", address, size, processHandle);

	mem.write32(messagePointer, IPC::responseHeader(0x8, 1, 0));
	mem.write32(messagePointer + 4, Result::Success);
}

// Stubbed: logs the request and reports success without touching any cache state.
void GPUService::storeDataCache(u32 messagePointer) {
	const u32 address = mem.read32(messagePointer + 4);
	const u32 size = mem.read32(messagePointer + 8);
	// See flushDataCache: the process handle is read into a local only, `handle` is left untouched
	const u32 processHandle = mem.read32(messagePointer + 16);
	log("GSP::GPU::StoreDataCache(address = %08X, size = %X, process = %X)\n", address, size, processHandle);

	mem.write32(messagePointer, IPC::responseHeader(0x1F, 1, 0));
	mem.write32(messagePointer + 4, Result::Success);
}

// Stubbed: only reports (via printf) that the LCDs were blacked out when the flag is non-zero.
void GPUService::setLCDForceBlack(u32 messagePointer) {
	u32 flag = mem.read32(messagePointer + 4);
	log("GSP::GPU::SetLCDForceBlack(flag = %d)\n", flag);

	if (flag != 0) {
		printf("Filled both LCDs with black\n");
	}

	mem.write32(messagePointer, IPC::responseHeader(0xB, 1, 0));
	mem.write32(messagePointer + 4, Result::Success);
}

// Process all GX commands currently queued in shared memory, then report success.
void GPUService::triggerCmdReqQueue(u32 messagePointer) {
	processCommandBuffer();
	mem.write32(messagePointer, IPC::responseHeader(0xC, 1, 0));
	mem.write32(messagePointer + 4, Result::Success);
}

// Seems to be completely undocumented, probably not very important or useful
void GPUService::setAxiConfigQoSMode(u32 messagePointer) {
	log("GSP::GPU::SetAxiConfigQoSMode\n");
	mem.write32(messagePointer, IPC::responseHeader(0x10, 1, 0));
	mem.write32(messagePointer + 4, Result::Success);
}

// Read a FramebufferInfo structure out of the IPC message and apply it to the selected screen.
// Currently panics before applying since this path is untested.
void GPUService::setBufferSwap(u32 messagePointer) {
	FramebufferInfo info{};
	const u32 screenId = mem.read32(messagePointer + 4);  // Selects either PDC0 or PDC1
	info.activeFb = mem.read32(messagePointer + 8);
	info.leftFramebufferVaddr = mem.read32(messagePointer + 12);
	info.rightFramebufferVaddr = mem.read32(messagePointer + 16);
	info.stride = mem.read32(messagePointer + 20);
	info.format = mem.read32(messagePointer + 24);
	info.displayFb = mem.read32(messagePointer + 28);  // Selects either framebuffer A or B

	log("GSP::GPU::SetBufferSwap\n");
	Helpers::panic("Untested GSP::GPU::SetBufferSwap call");

	setBufferSwapImpl(screenId, info);
	mem.write32(messagePointer, IPC::responseHeader(0x05, 1, 0));
	mem.write32(messagePointer + 4, Result::Success);
}

// Seems to also be completely undocumented
void GPUService::setInternalPriorities(u32 messagePointer) {
	log("GSP::GPU::SetInternalPriorities\n");
	mem.write32(messagePointer, IPC::responseHeader(0x1E, 1, 0));
	mem.write32(messagePointer + 4, Result::Success);
}

// Drain the GX command queue stored in GSP shared memory, dispatching each command by its ID
// (low byte of the command's first word). Panics on unknown command IDs.
void GPUService::processCommandBuffer() {
	if (sharedMem == nullptr) [[unlikely]] {  // Shared memory hasn't been set up yet
		return;
	}

	// Layout of a per-thread command buffer, in bytes
	constexpr u32 commandBufferSize = 0x200;
	constexpr u32 commandHeaderSize = 0x20;
	constexpr u32 commandSize = 0x20;
	constexpr u32 wordsPerCommand = commandSize / sizeof(u32);
	// Number of command slots that fit in a buffer after its header
	constexpr u32 commandSlots = (commandBufferSize - commandHeaderSize) / commandSize;

	constexpr int threadCount = 1;  // TODO: More than 1 thread can have GSP commands at a time
	for (int t = 0; t < threadCount; t++) {
		u8* cmdBuffer = &sharedMem[0x800 + t * commandBufferSize];
		u8& commandsLeft = cmdBuffer[1];
		// Commands start at byte 0x20 of the command buffer, each being 0x20 bytes long
		u32* const firstCmd = reinterpret_cast<u32*>(&cmdBuffer[commandHeaderSize]);

		log("Processing %d GPU commands\n", commandsLeft);

		// Walk the slots in order starting from the first one, so that each queued command is executed
		// once instead of re-running the first slot. The slot index wraps around so that a bogus command
		// count can never make us read past the end of this thread's command buffer.
		// NOTE(review): the header's "current command index" byte is not consulted here — confirm
		// whether the queue should start from it instead of from slot 0.
		u32 slot = 0;
		while (commandsLeft != 0) {
			u32* cmd = firstCmd + slot * wordsPerCommand;
			const u32 cmdID = cmd[0] & 0xff;
			switch (cmdID) {
				case GXCommands::ProcessCommandList: processCommandList(cmd); break;
				case GXCommands::MemoryFill: memoryFill(cmd); break;
				case GXCommands::TriggerDisplayTransfer: triggerDisplayTransfer(cmd); break;
				case GXCommands::TriggerDMARequest: triggerDMARequest(cmd); break;
				case GXCommands::TriggerTextureCopy: triggerTextureCopy(cmd); break;
				case GXCommands::FlushCacheRegions: flushCacheRegions(cmd); break;
				default: Helpers::panic("GSP::GPU::ProcessCommands: Unknown cmd ID %d", cmdID);
			}

			commandsLeft--;
			slot = (slot + 1) % commandSlots;
		}
	}
}

// Translate a virtual address in VRAM or in either linear heap mapping to its physical address.
// A null address maps to null. Any other address produces a warning and a garbage sentinel value.
static u32 VaddrToPaddr(u32 addr) {
	if (addr >= VirtualAddrs::VramStart && addr < (VirtualAddrs::VramStart + VirtualAddrs::VramSize)) [[likely]] {
		return addr - VirtualAddrs::VramStart + PhysicalAddrs::VRAM;
	}

	else if (addr >= VirtualAddrs::LinearHeapStartOld && addr < VirtualAddrs::LinearHeapEndOld) {
		return addr - VirtualAddrs::LinearHeapStartOld + PhysicalAddrs::FCRAM;
	}

	else if (addr >= VirtualAddrs::LinearHeapStartNew && addr < VirtualAddrs::LinearHeapEndNew) {
		return addr - VirtualAddrs::LinearHeapStartNew + PhysicalAddrs::FCRAM;
	}

	else if (addr == 0) {
		return 0;
	}

	Helpers::warn("[GSP::GPU VaddrToPaddr] Unknown virtual address %08X", addr);
	// Obviously garbage address
	return 0xF3310932;
}

// Fill 2 GPU framebuffers, buf0 and buf1, using a specific word value
// Each buffer that has a non-zero start address is filled and raises its own PSC interrupt.
void GPUService::memoryFill(u32* cmd) {
	u32 control = cmd[7];

	// buf0 parameters
	u32 start0 = cmd[1];  // Start address for the fill. If 0, don't fill anything
	u32 value0 = cmd[2];  // Value to fill the framebuffer with
	u32 end0 = cmd[3];    // End address for the fill
	u32 control0 = control & 0xffff;

	// buf1 parameters
	u32 start1 = cmd[4];
	u32 value1 = cmd[5];
	u32 end1 = cmd[6];
	u32 control1 = control >> 16;

	if (start0 != 0) {
		gpu.clearBuffer(VaddrToPaddr(start0), VaddrToPaddr(end0), value0, control0);
		requestInterrupt(GPUInterrupt::PSC0);
	}

	if (start1 != 0) {
		gpu.clearBuffer(VaddrToPaddr(start1), VaddrToPaddr(end1), value1, control1);
		requestInterrupt(GPUInterrupt::PSC1);
	}
}

// Forward a display transfer command to the GPU (addresses are translated to physical first)
// and raise the PPF interrupt afterwards.
void GPUService::triggerDisplayTransfer(u32* cmd) {
	const u32 inputAddr = VaddrToPaddr(cmd[1]);
	const u32 outputAddr = VaddrToPaddr(cmd[2]);
	const u32 inputSize = cmd[3];
	const u32 outputSize = cmd[4];
	const u32 flags = cmd[5];

	log("GSP::GPU::TriggerDisplayTransfer (Stubbed)\n");
	gpu.displayTransfer(inputAddr, outputAddr, inputSize, outputSize, flags);
	requestInterrupt(GPUInterrupt::PPF);  // Send "Display transfer finished" interrupt
}

// Forward a DMA request to the GPU and raise the DMA interrupt afterwards.
// Source and destination are passed on as-is, without address translation.
void GPUService::triggerDMARequest(u32* cmd) {
	const u32 source = cmd[1];
	const u32 dest = cmd[2];
	const u32 size = cmd[3];
	[[maybe_unused]] const bool flush = cmd[7] == 1;  // Read but currently not acted upon

	log("GSP::GPU::TriggerDMARequest (source = %08X, dest = %08X, size = %08X)\n", source, dest, size);
	gpu.fireDMA(dest, source, size);
	requestInterrupt(GPUInterrupt::DMA);
}

// Stubbed: the command's parameters are ignored.
void GPUService::flushCacheRegions([[maybe_unused]] u32* cmd) { log("GSP::GPU::FlushCacheRegions (Stubbed)\n"); }

// Write the framebuffer addresses, format, selected framebuffer and stride of the given screen
// into the GPU's external registers.
// NOTE(review): screenId and info.activeFb are used as table indices without range checks, so they
// are presumably expected to be 0 or 1 — confirm against callers.
void GPUService::setBufferSwapImpl(u32 screenId, const FramebufferInfo& info) {
	using namespace PICA::ExternalRegs;

	// Indexed by activeFb * 4 + screenId * 2 (+1 for the right framebuffer)
	static constexpr std::array<u32, 8> fbAddresses = {
		Framebuffer0AFirstAddr,  Framebuffer0BFirstAddr,  Framebuffer1AFirstAddr,  Framebuffer1BFirstAddr,
		Framebuffer0ASecondAddr, Framebuffer0BSecondAddr, Framebuffer1ASecondAddr, Framebuffer1BSecondAddr,
	};

	auto& regs = gpu.getExtRegisters();

	const u32 fbIndex = info.activeFb * 4 + screenId * 2;
	regs[fbAddresses[fbIndex]] = VaddrToPaddr(info.leftFramebufferVaddr);
	regs[fbAddresses[fbIndex + 1]] = VaddrToPaddr(info.rightFramebufferVaddr);

	// Three config registers per screen: format, framebuffer select, stride
	static constexpr std::array<u32, 6> configAddresses = {
		Framebuffer0Config, Framebuffer0Select, Framebuffer0Stride, Framebuffer1Config, Framebuffer1Select, Framebuffer1Stride,
	};

	const u32 configIndex = screenId * 3;
	regs[configAddresses[configIndex]] = info.format;
	regs[configAddresses[configIndex + 1]] = info.displayFb;
	regs[configAddresses[configIndex + 2]] = info.stride;
}

// Actually send command list (aka display list) to GPU
void GPUService::processCommandList(u32* cmd) {
	const u32 address = cmd[1] & ~7;                        // Buffer address
	const u32 size = cmd[2] & ~3;                           // Buffer size in bytes
	[[maybe_unused]] const bool updateGas = cmd[3] == 1;    // Update gas additive blend results (0 = don't update, 1 = update)
	[[maybe_unused]] const bool flushBuffer = cmd[7] == 1;  // Flush buffer (0 = don't flush, 1 = flush)

	log("GPU::GSP::processCommandList. Address: %08X, size in bytes: %08X\n", address, size);
	gpu.startCommandList(address, size);
	requestInterrupt(GPUInterrupt::P3D);  // Send an IRQ when command list processing is over
}

// TODO: Emulate the transfer engine & its registers
// Then this can be emulated by just writing the appropriate values there
void GPUService::triggerTextureCopy(u32* cmd) {
	const u32 inputAddr = VaddrToPaddr(cmd[1]);
	const u32 outputAddr = VaddrToPaddr(cmd[2]);
	const u32 totalBytes = cmd[3];
	const u32 inputSize = cmd[4];
	const u32 outputSize = cmd[5];
	const u32 flags = cmd[6];

	log("GSP::GPU::TriggerTextureCopy (Stubbed)\n");
	gpu.textureCopy(inputAddr, outputAddr, totalBytes, inputSize, outputSize, flags);
	// This uses the transfer engine and thus needs to fire a PPF interrupt.
	// NSMB2 relies on this
	requestInterrupt(GPUInterrupt::PPF);
}