gpu: Implement basic texcopy

* Improves rendering in FE:A but the screens will still not show
  because it requires surface validation
This commit is contained in:
GPUCode 2023-08-06 14:25:32 +03:00
parent 297afd20d7
commit 82d9511993
15 changed files with 139 additions and 16 deletions

View file

@ -117,6 +117,10 @@ class GPU {
renderer->displayTransfer(inputAddr, outputAddr, inputSize, outputSize, flags);
}
// Forward a texture copy request to the active renderer backend.
// Every argument is passed through unchanged; this wrapper does no decoding or validation of its own.
// (RendererGL::textureCopy treats inputSize/outputSize as packed words: low 16 bits = line width,
// high 16 bits = gap between lines, both in 16-byte units.)
void textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) {
renderer->textureCopy(inputAddr, outputAddr, totalBytes, inputSize, outputSize, flags);
}
// Read a value of type T from physical address paddr
// This is necessary because vertex attribute fetching uses physical addresses
template <typename T>

View file

@ -7,6 +7,7 @@
#include <sstream>
#include <string>
#include <vector>
#include <memory>
#include "termcolor.hpp"
@ -30,6 +31,17 @@ using s32 = std::int32_t;
using s64 = std::int64_t;
namespace Helpers {
// Render a printf-style format string and its arguments into a std::string.
// Returns an empty string when snprintf reports a failure (negative return value).
template <class... Args>
std::string format(const std::string& fmt, Args&&... args) {
	// Measuring pass: with a null buffer snprintf only reports how many characters are needed
	const int length = std::snprintf(nullptr, 0, fmt.c_str(), args...);
	if (length < 0) {
		return {};
	}

	// Rendering pass: the scratch buffer has one extra slot for the null terminator
	const std::size_t capacity = static_cast<std::size_t>(length) + 1;
	const auto scratch = std::make_unique<char[]>(capacity);
	std::snprintf(scratch.get(), capacity, fmt.c_str(), args...);

	// Copy out everything except the terminator
	return std::string(scratch.get(), static_cast<std::size_t>(length));
}
// Unconditional panic, unlike panicDev which does not panic on user builds
template <class... Args>
[[noreturn]] static void panic(const char* fmt, Args&&... args) {

View file

@ -49,6 +49,7 @@ class Renderer {
virtual void initGraphicsContext(SDL_Window* window) = 0; // Initialize graphics context
virtual void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) = 0; // Clear a GPU buffer in VRAM
virtual void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) = 0; // Perform display transfer
virtual void textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) = 0;
virtual void drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) = 0; // Draw the given vertices
virtual void screenshot(const std::string& name) = 0;

View file

@ -77,9 +77,10 @@ class RendererGL final : public Renderer {
void initGraphicsContext(SDL_Window* window) override; // Initialize graphics context
void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) override; // Clear a GPU buffer in VRAM
void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) override; // Perform display transfer
void textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) override;
void drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) override; // Draw the given vertices
ColourBuffer getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height);
std::optional<ColourBuffer> getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true);
// Take a screenshot of the screen and store it in a file
void screenshot(const std::string& name) override;

View file

@ -76,6 +76,16 @@ public:
size++;
// Find an existing surface we completely invalidate and overwrite it with the new surface
for (auto& e : buffer) {
if (e.valid && e.range.lower() >= surface.range.lower() && e.range.upper() <= surface.range.upper()) {
e.free();
e = surface;
e.allocate();
return e;
}
}
// Find an invalid entry in the cache and overwrite it with the new surface
for (auto& e : buffer) {
if (!e.valid) {

View file

@ -65,7 +65,8 @@ struct ColourBuffer {
texture.setMagFilter(OpenGL::Linear);
glBindTexture(GL_TEXTURE_2D, prevTexture);
//Helpers::panic("Creating FBO: %d, %d\n", size.x(), size.y());
const auto name = Helpers::format("Surface %dx%d %s from 0x%08X", size.x(), size.y(), PICA::textureFormatToString(format), location);
OpenGL::setObjectLabel(GL_TEXTURE, texture.handle(), name.c_str());
fbo.createWithDrawTexture(texture);
fbo.bind(OpenGL::DrawAndReadFramebuffer);

View file

@ -12,6 +12,7 @@ class RendererNull final : public Renderer {
void initGraphicsContext(SDL_Window* window) override;
void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) override;
void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) override;
void textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) override;
void drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) override;
void screenshot(const std::string& name) override;
};

View file

@ -12,6 +12,7 @@ class RendererSw final : public Renderer {
void initGraphicsContext(SDL_Window* window) override;
void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) override;
void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) override;
void textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) override;
void drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) override;
void screenshot(const std::string& name) override;
};
};

View file

@ -52,6 +52,7 @@ class RendererVK final : public Renderer {
void initGraphicsContext(SDL_Window* window) override;
void clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) override;
void displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) override;
void textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) override;
void drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) override;
void screenshot(const std::string& name) override;
};
};

View file

@ -599,12 +599,11 @@ void RendererGL::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u
u32 outputWidth = outputSize & 0xffff;
u32 outputHeight = outputSize >> 16;
if (inputWidth != outputWidth) {
// Helpers::warn("Strided display transfer is not handled correctly!\n");
}
OpenGL::DebugScope scope("DisplayTransfer inputAddr 0x%08X outputAddr 0x%08X inputWidth %d outputWidth %d inputWidth %d outputHeight %d",
inputAddr, outputAddr, inputWidth, outputWidth, inputHeight, outputHeight);
auto srcFramebuffer = getColourBuffer(inputAddr, inputFormat, inputWidth, inputHeight);
Math::Rect<u32> srcRect = srcFramebuffer.getSubRect(inputAddr, outputWidth, outputHeight);
auto srcFramebuffer = getColourBuffer(inputAddr, inputFormat, inputWidth, outputHeight);
Math::Rect<u32> srcRect = srcFramebuffer->getSubRect(inputAddr, outputWidth, outputHeight);
// Apply scaling for the destination rectangle.
if (scaling == PICA::Scaling::X || scaling == PICA::Scaling::XY) {
@ -616,18 +615,88 @@ void RendererGL::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u
}
auto destFramebuffer = getColourBuffer(outputAddr, outputFormat, outputWidth, outputHeight);
Math::Rect<u32> destRect = destFramebuffer.getSubRect(outputAddr, outputWidth, outputHeight);
Math::Rect<u32> destRect = destFramebuffer->getSubRect(outputAddr, outputWidth, outputHeight);
if (inputWidth != outputWidth) {
// Helpers::warn("Strided display transfer is not handled correctly!\n");
}
// Blit the framebuffers
srcFramebuffer.fbo.bind(OpenGL::ReadFramebuffer);
destFramebuffer.fbo.bind(OpenGL::DrawFramebuffer);
srcFramebuffer->fbo.bind(OpenGL::ReadFramebuffer);
destFramebuffer->fbo.bind(OpenGL::DrawFramebuffer);
glBlitFramebuffer(
srcRect.left, srcRect.top, srcRect.right, srcRect.bottom, destRect.left, destRect.top, destRect.right, destRect.bottom, GL_COLOR_BUFFER_BIT,
GL_LINEAR
);
}
ColourBuffer RendererGL::getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height) {
// Accelerated texture copy: emulates the PICA raw memory copy with a framebuffer blit between cached surfaces.
//
// inputAddr / outputAddr: addresses used to look up the source and destination surfaces in the colour buffer cache.
// totalBytes: number of bytes to copy; rounded down to a multiple of 16.
// inputSize / outputSize: packed words. Low 16 bits = line width, high 16 bits = gap between lines, both in 16-byte units.
// flags: not used by this implementation.
//
// The copy is skipped (after logging) when the aligned size is zero, when the input and output widths differ,
// when the width/size would produce an empty region, or when no cached surface contains inputAddr.
void RendererGL::textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) {
	// Texture copy size is aligned to 16 byte units
	const u32 copySize = totalBytes & ~0xf;
	if (copySize == 0) {
		printf("TextureCopy total bytes less than 16!\n");
		return;
	}

	// The width and gap are provided in 16-byte units.
	const u32 inputWidth = (inputSize & 0xffff) << 4;
	const u32 inputGap = (inputSize >> 16) << 4;
	const u32 outputWidth = (outputSize & 0xffff) << 4;
	const u32 outputGap = (outputSize >> 16) << 4;

	OpenGL::DebugScope scope("TextureCopy inputAddr 0x%08X outputAddr 0x%08X totalBytes %d inputWidth %d inputGap %d outputWidth %d outputGap %d",
		inputAddr, outputAddr, totalBytes, inputWidth, inputGap, outputWidth, outputGap);

	if (inputGap != 0 || outputGap != 0) {
		Helpers::warn("Strided texture copy\n");
	}

	if (inputWidth != outputWidth) {
		Helpers::warn("Input width does not match output width, cannot accelerate texture copy!\n");
		return;
	}

	// A zero width cannot be turned into a copy rectangle, and it is used as a divisor below.
	if (inputWidth == 0) {
		Helpers::warn("Texture copy with zero line width, cannot accelerate texture copy!\n");
		return;
	}

	// Texture copy is a raw data copy in PICA, which means no format or tiling information is provided to the engine.
	// Depending if the target surface is linear or tiled, games set inputWidth to either the width of the texture or
	// the width multiplied by eight (because tiles are stored linearly in memory).
	// To properly accelerate this we must examine each surface individually. For now we assume the most common case
	// of tiled surface with RGBA8 format. If our assumption does not hold true, we abort the texture copy as inserting
	// that surface is not correct.

	// We assume the source surface is tiled and RGBA8. inputWidth is in bytes so divide it
	// by eight * sizePerPixel(RGBA8) to convert it to a useable width.
	const u32 bpp = sizePerPixel(PICA::ColorFmt::RGBA8);
	const u32 copyStride = (inputWidth + inputGap) / (8 * bpp);
	const u32 copyWidth = inputWidth / (8 * bpp);

	// inputHeight/outputHeight are typically set to zero so they cannot be used to get the height of the copy region
	// in contrast to display transfer. Compute height manually by dividing the copy size with the copy width. The result
	// is the number of vertical tiles so multiply that by eight to get the actual copy height.
	const u32 copyHeight = (copySize / inputWidth) * 8;

	// Bail out on empty regions: otherwise a zero-sized destination surface would be created and cached below.
	if (copyWidth == 0 || copyHeight == 0) {
		Helpers::warn("Texture copy region is empty, cannot accelerate texture copy!\n");
		return;
	}

	// Find the source surface. Do not create one if it is missing: we have no real format information for it.
	auto srcFramebuffer = getColourBuffer(inputAddr, PICA::ColorFmt::RGBA8, copyStride, copyHeight, false);
	if (!srcFramebuffer) {
		printf("TextureCopy failed to locate src framebuffer!\n");
		return;
	}

	Math::Rect<u32> srcRect = srcFramebuffer->getSubRect(inputAddr, copyWidth, copyHeight);

	// Assume the destination surface has the same format. Unless the surfaces have the same block width,
	// texture copy does not make sense.
	// getColourBuffer is called with its default createIfnotFound = true here, so a surface is created on a cache miss.
	auto destFramebuffer = getColourBuffer(outputAddr, srcFramebuffer->format, copyWidth, copyHeight);
	Math::Rect<u32> destRect = destFramebuffer->getSubRect(outputAddr, copyWidth, copyHeight);

	// Blit the framebuffers
	srcFramebuffer->fbo.bind(OpenGL::ReadFramebuffer);
	destFramebuffer->fbo.bind(OpenGL::DrawFramebuffer);

	glBlitFramebuffer(
		srcRect.left, srcRect.top, srcRect.right, srcRect.bottom, destRect.left, destRect.top, destRect.right, destRect.bottom, GL_COLOR_BUFFER_BIT,
		GL_LINEAR
	);
}
std::optional<ColourBuffer> RendererGL::getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound) {
// Try to find an already existing buffer that contains the provided address
// This is a more relaxed check compared to getColourFBO as display transfer/texcopy may refer to
// subrect of a surface and in case of texcopy we don't know the format of the surface.
@ -636,6 +705,10 @@ ColourBuffer RendererGL::getColourBuffer(u32 addr, PICA::ColorFmt format, u32 wi
return buffer.value().get();
}
if (!createIfnotFound) {
return std::nullopt;
}
// Otherwise create and cache a new buffer.
ColourBuffer sampleBuffer(addr, format, width, height);
return colourBufferCache.add(sampleBuffer);

View file

@ -9,6 +9,9 @@ void Texture::allocate() {
texture.create(size.u(), size.v(), GL_RGBA8);
texture.bind();
const auto name = Helpers::format("Surface %dx%d %s from 0x%08X", size.x(), size.y(), PICA::textureFormatToString(format), location);
OpenGL::setObjectLabel(GL_TEXTURE, texture.handle(), name.c_str());
setNewConfig(config);
}

View file

@ -9,5 +9,6 @@ void RendererNull::display() {}
// The RendererNull definitions below all have empty bodies: each call is accepted and its arguments are discarded.
void RendererNull::initGraphicsContext(SDL_Window* window) {}
void RendererNull::clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 control) {}
void RendererNull::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) {}
// Texture copy requests are likewise ignored by this backend.
void RendererNull::textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) {}
void RendererNull::drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) {}
void RendererNull::screenshot(const std::string& name) {}

View file

@ -14,8 +14,12 @@ void RendererSw::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u
printf("RendererSW: Unimplemented displayTransfer call\n");
}
// Software renderer stub: texture copy is not implemented, so the call only reports itself on stdout.
void RendererSw::textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) {
	fputs("RendererSW: Unimplemented textureCopy call\n", stdout);
}
// Software renderer stub: drawing is not implemented, so the call only reports itself on stdout.
void RendererSw::drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) {
	fputs("RendererSW: Unimplemented drawVertices call\n", stdout);
}
void RendererSw::screenshot(const std::string& name) { printf("RendererSW: Unimplemented screenshot call\n"); }
void RendererSw::screenshot(const std::string& name) { printf("RendererSW: Unimplemented screenshot call\n"); }

View file

@ -542,6 +542,8 @@ void RendererVK::clearBuffer(u32 startAddress, u32 endAddress, u32 value, u32 co
// The RendererVK definitions below have empty bodies: these calls currently do nothing in this backend.
void RendererVK::displayTransfer(u32 inputAddr, u32 outputAddr, u32 inputSize, u32 outputSize, u32 flags) {}
void RendererVK::textureCopy(u32 inputAddr, u32 outputAddr, u32 totalBytes, u32 inputSize, u32 outputSize, u32 flags) {}
void RendererVK::drawVertices(PICA::PrimType primType, std::span<const PICA::Vertex> vertices) {}
void RendererVK::screenshot(const std::string& name) {}
void RendererVK::screenshot(const std::string& name) {}

View file

@ -441,7 +441,15 @@ void GPUService::processCommandList(u32* cmd) {
// TODO: Emulate the transfer engine & its registers
// Then this can be emulated by just writing the appropriate values there
void GPUService::triggerTextureCopy(u32* cmd) {
Helpers::warn("GSP::GPU::TriggerTextureCopy (unimplemented)\n");
const u32 inputAddr = VaddrToPaddr(cmd[1]);
const u32 outputAddr = VaddrToPaddr(cmd[2]);
const u32 totalBytes = cmd[3];
const u32 inputSize = cmd[4];
const u32 outputSize = cmd[5];
const u32 flags = cmd[6];
log("GSP::GPU::TriggerTextureCopy (Stubbed)\n");
gpu.textureCopy(inputAddr, outputAddr, totalBytes, inputSize, outputSize, flags);
// This uses the transfer engine and thus needs to fire a PPF interrupt.
// NSMB2 relies on this
requestInterrupt(GPUInterrupt::PPF);