Mirror of https://github.com/wheremyfoodat/Panda3DS.git
Moar shader decompiler (#559)
* Renderer: Add prepareForDraw callback
* Add fmt submodule and port shader decompiler instructions to it
* Add shader acceleration setting
* Hook up vertex shaders to shader cache
* Shader decompiler: Fix redundant compilations
* Shader Decompiler: Fix vertex attribute upload
* Shader compiler: Simplify generated code for reading and faster compilation
* Further simplify shader decompiler output
* Shader decompiler: More smallen-ing
* Shader decompiler: Get PICA uniforms uploaded to the GPU
* Shader decompiler: Readd clipping
* Shader decompiler: Actually `break` on control flow instructions
* Shader decompiler: More control flow handling
* Shader decompiler: Fix destination mask
* Shader Decomp: Remove pair member capture in lambda (unsupported on NDK)
* Disgusting changes to handle the fact that hw shader shaders are 2x as big
* Shader decompiler: Implement proper output semantic mapping
* Moar instructions
* Shader decompiler: Add FLR/SLT/SLTI/SGE/SGEI
* Shader decompiler: Add register indexing
* Shader decompiler: Optimize mova with both x and y masked
* Shader decompiler: Add DPH/DPHI
* Fix shader caching being broken
* PICA decompiler: Cache VS uniforms
* Simply vertex cache code
* Simplify vertex cache code
* Shader decompiler: Add loops
* Shader decompiler: Implement safe multiplication
* Shader decompiler: Implement LG2/EX2
* Shader decompiler: More control flow
* Shader decompiler: Fix JMPU condition
* Shader decompiler: Convert main function to void
* PICA: Start implementing GPU vertex fetch
* More hw VAO work
* More hw VAO work
* More GPU vertex fetch code
* Add GL Stream Buffer from Duckstation
* GL: Actually upload data to stream buffers
* GPU: Cleanup immediate mode handling
* Get first renders working with accelerated draws
* Shader decompiler: Fix control flow analysis bugs
* HW shaders: Accelerate indexed draws
* Shader decompiler: Add support for compilation errors
* GLSL decompiler: Fall back for LITP
* Add Renderdoc scope classes
* Fix control flow analysis bug
* HW shaders: Fix attribute fetch
* Rewriting hw vertex fetch
* Stream buffer: Fix copy-paste mistake
* HW shaders: Fix indexed rendering
* HW shaders: Add padding attributes
* HW shaders: Avoid redundant glVertexAttrib4f calls
* HW shaders: Fix loops
* HW shaders: Make generated shaders slightly smaller
* Fix libretro build
* HW shaders: Fix android
* Remove redundant ubershader checks
* Set accelerate shader default to true
* Shader decompiler: Don't declare VS input attributes as an array
* Change ubuntu-latest to Ubuntu 24.04 because Microsoft screwed up their CI again
* Fix merge conflict bug
This commit is contained in: parent afaf18f124, commit 49a94a13c5.
34 changed files with 1877 additions and 253 deletions.
.github/workflows/Android_Build.yml (vendored, 4 changes)

@@ -8,7 +8,7 @@ on:
 jobs:
   x64:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

     strategy:
       matrix:
@@ -73,7 +73,7 @@ jobs:
           ./src/pandroid/app/build/outputs/apk/${{ env.BUILD_TYPE }}/app-${{ env.BUILD_TYPE }}.apk

   arm64:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

     strategy:
       matrix:
.github/workflows/HTTP_Build.yml (vendored, 2 changes)

@@ -16,7 +16,7 @@ jobs:
     # well on Windows or Mac. You can convert this to a matrix build if you need
     # cross-platform coverage.
     # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

     steps:
       - uses: actions/checkout@v4
.github/workflows/Hydra_Build.yml (vendored, 4 changes)

@@ -98,7 +98,7 @@ jobs:
           ${{github.workspace}}/docs/libretro/panda3ds_libretro.info

   Linux:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

     steps:
       - uses: actions/checkout@v4
@@ -151,7 +151,7 @@ jobs:
           ${{github.workspace}}/docs/libretro/panda3ds_libretro.info

   Android-x64:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

     steps:
       - uses: actions/checkout@v4
.github/workflows/Linux_AppImage_Build.yml (vendored, 2 changes)

@@ -16,7 +16,7 @@ jobs:
     # well on Windows or Mac. You can convert this to a matrix build if you need
     # cross-platform coverage.
     # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

     steps:
       - uses: actions/checkout@v4
.github/workflows/Linux_Build.yml (vendored, 2 changes)

@@ -16,7 +16,7 @@ jobs:
     # well on Windows or Mac. You can convert this to a matrix build if you need
     # cross-platform coverage.
     # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

     steps:
       - uses: actions/checkout@v4
.github/workflows/Qt_Build.yml (vendored, 2 changes)

@@ -96,7 +96,7 @@ jobs:
           path: 'Alber.zip'

   Linux:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04

     steps:
       - uses: actions/checkout@v4
.gitmodules (vendored, 3 changes)

@@ -76,6 +76,9 @@
 [submodule "third_party/metal-cpp"]
 	path = third_party/metal-cpp
 	url = https://github.com/Panda3DS-emu/metal-cpp
+[submodule "third_party/fmt"]
+	path = third_party/fmt
+	url = https://github.com/fmtlib/fmt
 [submodule "third_party/fdk-aac"]
 	path = third_party/fdk-aac
 	url = https://github.com/Panda3DS-emu/fdk-aac/
CMakeLists.txt

@@ -146,11 +146,13 @@ if (NOT ANDROID)
 	target_link_libraries(AlberCore PUBLIC SDL2-static)
 endif()

+add_subdirectory(third_party/fmt)
 add_subdirectory(third_party/toml11)
 include_directories(${SDL2_INCLUDE_DIR})
 include_directories(third_party/toml11)
 include_directories(third_party/glm)
 include_directories(third_party/renderdoc)
+include_directories(third_party/duckstation)

 add_subdirectory(third_party/cmrc)

@@ -263,7 +265,7 @@ set(PICA_SOURCE_FILES src/core/PICA/gpu.cpp src/core/PICA/regs.cpp src/core/PICA
 	src/core/PICA/shader_interpreter.cpp src/core/PICA/dynapica/shader_rec.cpp
 	src/core/PICA/dynapica/shader_rec_emitter_x64.cpp src/core/PICA/pica_hash.cpp
 	src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp src/core/PICA/shader_gen_glsl.cpp
-	src/core/PICA/shader_decompiler.cpp
+	src/core/PICA/shader_decompiler.cpp src/core/PICA/draw_acceleration.cpp
 )

 set(LOADER_SOURCE_FILES src/core/loader/elf.cpp src/core/loader/ncsd.cpp src/core/loader/ncch.cpp src/core/loader/3dsx.cpp src/core/loader/lz77.cpp)

@@ -315,7 +317,8 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
 	include/audio/miniaudio_device.hpp include/ring_buffer.hpp include/bitfield.hpp include/audio/dsp_shared_mem.hpp
 	include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp
 	include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
-	include/sdl_sensors.hpp include/renderdoc.hpp include/audio/aac_decoder.hpp
+	include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp include/renderdoc.hpp
+	include/align.hpp include/audio/aac_decoder.hpp
 )

 cmrc_add_resource_library(

@@ -348,7 +351,6 @@ if(ENABLE_LUAJIT AND NOT ANDROID)
 endif()

 if(ENABLE_QT_GUI)
-	include_directories(third_party/duckstation)
 	set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/window_info.cpp third_party/duckstation/gl/context.cpp)

 	if(APPLE)

@@ -391,6 +393,8 @@ if(ENABLE_OPENGL)
 		src/host_shaders/opengl_fragment_shader.frag
 	)

+	set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/gl/stream_buffer.cpp)
+
 	set(HEADER_FILES ${HEADER_FILES} ${RENDERER_GL_INCLUDE_FILES})
 	source_group("Source Files\\Core\\OpenGL Renderer" FILES ${RENDERER_GL_SOURCE_FILES})

@@ -480,7 +484,7 @@ set(ALL_SOURCES ${SOURCE_FILES} ${FS_SOURCE_FILES} ${CRYPTO_SOURCE_FILES} ${KERN
 target_sources(AlberCore PRIVATE ${ALL_SOURCES})

 target_link_libraries(AlberCore PRIVATE dynarmic cryptopp glad resources_console_fonts teakra fdk-aac)
-target_link_libraries(AlberCore PUBLIC glad capstone)
+target_link_libraries(AlberCore PUBLIC glad capstone fmt::fmt)

 if(ENABLE_DISCORD_RPC AND NOT ANDROID)
 	target_compile_definitions(AlberCore PUBLIC "PANDA3DS_ENABLE_DISCORD_RPC=1")
include/PICA/draw_acceleration.hpp (new file, 45 lines)

#pragma once

#include <array>

#include "helpers.hpp"

namespace PICA {
	struct DrawAcceleration {
		static constexpr u32 maxAttribCount = 16;
		static constexpr u32 maxLoaderCount = 12;

		struct AttributeInfo {
			u32 offset;
			u32 stride;

			u8 type;
			u8 componentCount;

			std::array<float, 4> fixedValue;  // For fixed attributes
		};

		struct Loader {
			// Data to upload for this loader
			u8* data;
			usize size;
		};

		u8* indexBuffer;

		// Minimum and maximum index in the index buffer for a draw call
		u16 minimumIndex, maximumIndex;
		u32 totalAttribCount;
		u32 totalLoaderCount;
		u32 enabledAttributeMask;
		u32 fixedAttributes;
		u32 vertexDataSize;

		std::array<AttributeInfo, maxAttribCount> attributeInfo;
		std::array<Loader, maxLoaderCount> loaders;

		bool canBeAccelerated;
		bool indexed;
		bool useShortIndices;
	};
}  // namespace PICA
include/PICA/gpu.hpp

@@ -1,6 +1,7 @@
 #pragma once
 #include <array>

+#include "PICA/draw_acceleration.hpp"
 #include "PICA/dynapica/shader_rec.hpp"
 #include "PICA/float_types.hpp"
 #include "PICA/pica_vertex.hpp"
@@ -13,6 +14,12 @@
 #include "memory.hpp"
 #include "renderer.hpp"

+enum class ShaderExecMode {
+	Interpreter,  // Interpret shaders on the CPU
+	JIT,          // Recompile shaders to CPU machine code
+	Hardware,     // Recompile shaders to host shaders and run them on the GPU
+};
+
 class GPU {
 	static constexpr u32 regNum = 0x300;
 	static constexpr u32 extRegNum = 0x1000;
@@ -45,7 +52,7 @@ class GPU {
 	uint immediateModeVertIndex;
 	uint immediateModeAttrIndex;  // Index of the immediate mode attribute we're uploading

-	template <bool indexed, bool useShaderJIT>
+	template <bool indexed, ShaderExecMode mode>
 	void drawArrays();

 	// Silly method of avoiding linking problems. TODO: Change to something less silly
@@ -81,6 +88,7 @@ class GPU {
 	std::unique_ptr<Renderer> renderer;
 	PICA::Vertex getImmediateModeVertex();

+	void getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed);
   public:
 	// 256 entries per LUT with each LUT as its own row forming a 2D image 256 * LUT_COUNT
 	// Encoded in PICA native format
include/PICA/pica_vert_config.hpp (new file, 57 lines)

#pragma once
#include <array>
#include <cassert>
#include <cstring>
#include <type_traits>
#include <unordered_map>

#include "PICA/pica_hash.hpp"
#include "PICA/regs.hpp"
#include "PICA/shader.hpp"
#include "bitfield.hpp"
#include "helpers.hpp"

namespace PICA {
	// Configuration struct used as the key for the vertex shader cache
	struct VertConfig {
		PICAHash::HashType shaderHash;
		PICAHash::HashType opdescHash;
		u32 entrypoint;

		// PICA registers for configuring shader output->fragment semantic mapping
		std::array<u32, 7> outmaps{};
		u16 outputMask;
		u8 outputCount;
		bool usingUbershader;

		// Pad to 56 bytes so that the compiler won't insert unnecessary padding, which in turn would affect our unordered_map lookup,
		// as the padding would get hashed and memcmp'd...
		u32 pad{};

		bool operator==(const VertConfig& config) const {
			// Hash function and equality operator required by std::unordered_map
			return std::memcmp(this, &config, sizeof(VertConfig)) == 0;
		}

		VertConfig(PICAShader& shader, const std::array<u32, 0x300>& regs, bool usingUbershader) : usingUbershader(usingUbershader) {
			shaderHash = shader.getCodeHash();
			opdescHash = shader.getOpdescHash();
			entrypoint = shader.entrypoint;

			outputCount = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
			outputMask = regs[PICA::InternalRegs::VertexShaderOutputMask];
			for (int i = 0; i < outputCount; i++) {
				// Mask out unused bits
				outmaps[i] = regs[PICA::InternalRegs::ShaderOutmap0 + i] & 0x1F1F1F1F;
			}
		}
	};
}  // namespace PICA

static_assert(sizeof(PICA::VertConfig) == 56);

// Override std::hash for our vertex config class
template <>
struct std::hash<PICA::VertConfig> {
	std::size_t operator()(const PICA::VertConfig& config) const noexcept { return PICAHash::computeHash((const char*)&config, sizeof(config)); }
};
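Taken together, operator==, the explicit padding, and the std::hash specialization make VertConfig usable directly as an unordered_map key. A minimal consumption sketch (CompiledShader and the lookup function are illustrative placeholders; the real cache lives in RendererGL further down):

	struct CompiledShader { /* e.g. an OpenGL::Shader handle */ };
	std::unordered_map<PICA::VertConfig, CompiledShader> cache;

	CompiledShader& lookup(PICAShader& vs, const std::array<u32, 0x300>& regs, bool ubershader) {
		// Key construction hashes the shader code + operand descriptors and snapshots the outmap registers
		const PICA::VertConfig key(vs, regs, ubershader);
		// Lookup uses std::hash<PICA::VertConfig>, then operator== (a memcmp over all 56 bytes, padding included)
		return cache[key];  // default-constructs on miss; a real cache would compile the shader here
	}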
include/PICA/shader.hpp

@@ -107,6 +107,11 @@ class PICAShader {
 	alignas(16) std::array<vec4f, 16> inputs;  // Attributes passed to the shader
 	alignas(16) std::array<vec4f, 16> outputs;
 	alignas(16) vec4f dummy = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()});  // Dummy register used by the JIT

+	// We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
+	// We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
+	// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
+	using Hash = PICAHash::HashType;
+
   protected:
 	std::array<u32, 128> operandDescriptors;
@@ -125,14 +130,13 @@ class PICAShader {
 	std::array<CallInfo, 4> callInfo;
 	ShaderType type;

-	// We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
-	// We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
-	// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
-	using Hash = PICAHash::HashType;
-
 	Hash lastCodeHash = 0;    // Last hash computed for the shader code (Used for the JIT caching mechanism)
 	Hash lastOpdescHash = 0;  // Last hash computed for the operand descriptors (Also used for the JIT)

+  public:
+	bool uniformsDirty = false;
+
+  protected:
 	bool codeHashDirty = false;
 	bool opdescHashDirty = false;
@@ -284,6 +288,7 @@ class PICAShader {
 				uniform[2] = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16));
 				uniform[3] = f24::fromRaw(floatUniformBuffer[0] >> 8);
 			}
+			uniformsDirty = true;
 		}
 	}
@@ -295,6 +300,12 @@ class PICAShader {
 		u[1] = getBits<8, 8>(word);
 		u[2] = getBits<16, 8>(word);
 		u[3] = getBits<24, 8>(word);
+		uniformsDirty = true;
 	}

+	void uploadBoolUniform(u32 value) {
+		boolUniform = value;
+		uniformsDirty = true;
+	}
+
 	void run();
@@ -302,6 +313,10 @@ class PICAShader {

 	Hash getCodeHash();
 	Hash getOpdescHash();

+	// Returns how big the PICA uniforms are combined. Used for hw accelerated shaders where we upload the uniforms to our GPU.
+	static constexpr usize totalUniformSize() { return sizeof(floatUniforms) + sizeof(intUniforms) + sizeof(boolUniform); }
+	void* getUniformPointer() { return static_cast<void*>(&floatUniforms); }
 };

 static_assert(
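The uniformsDirty flag plus totalUniformSize()/getUniformPointer() suggest the intended upload pattern on the renderer side. A hedged sketch (the glBufferSubData call is an assumption about the upload path, not code shown in this diff):

	// Sketch: re-upload the PICA uniforms to the hw-shader UBO only when they changed
	if (shaderUnit.vs.uniformsDirty) {
		shaderUnit.vs.uniformsDirty = false;
		glBindBuffer(GL_UNIFORM_BUFFER, hwShaderUniformUBO);
		glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer());
	}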
include/PICA/shader_decompiler.hpp

@@ -1,8 +1,11 @@
 #pragma once
+#include <fmt/format.h>
+
+#include <map>
 #include <set>
 #include <string>
 #include <tuple>
-#include <map>
 #include <utility>
 #include <vector>

 #include "PICA/shader.hpp"
@@ -41,9 +44,12 @@ namespace PICA::ShaderGen {
 		explicit Function(u32 start, u32 end) : start(start), end(end) {}
 		bool operator<(const Function& other) const { return AddressRange(start, end) < AddressRange(other.start, other.end); }

-		std::string getIdentifier() const { return "func_" + std::to_string(start) + "_to_" + std::to_string(end); }
-		std::string getForwardDecl() const { return "void " + getIdentifier() + "();\n"; }
-		std::string getCallStatement() const { return getIdentifier() + "()"; }
+		std::string getIdentifier() const { return fmt::format("fn_{}_{}", start, end); }
+		// To handle weird control flow, we have to return from each function a bool that indicates whether or not the shader reached an end
+		// instruction and should thus terminate. This is necessary for games like Rayman and Gravity Falls, which have "END" instructions called
+		// from within functions deep in the callstack
+		std::string getForwardDecl() const { return fmt::format("bool fn_{}_{}();\n", start, end); }
+		std::string getCallStatement() const { return fmt::format("fn_{}_{}()", start, end); }
 	};

 	std::set<Function> functions{};
@@ -93,9 +99,11 @@ namespace PICA::ShaderGen {

 	API api;
 	Language language;
+	bool compilationError = false;

 	void compileInstruction(u32& pc, bool& finished);
-	void compileRange(const AddressRange& range);
+	// Compiles the given range and returns the end PC, plus whether we're "finished" with the program (ie whether an END instruction was hit)
+	std::pair<u32, bool> compileRange(const AddressRange& range);
 	void callFunction(const Function& function);
 	const Function* findFunction(const AddressRange& range);
@@ -105,6 +113,7 @@ namespace PICA::ShaderGen {
 	std::string getDest(u32 dest) const;
 	std::string getSwizzlePattern(u32 swizzle) const;
 	std::string getDestSwizzle(u32 destinationMask) const;
+	const char* getCondition(u32 cond, u32 refX, u32 refY);

 	void setDest(u32 operandDescriptor, const std::string& dest, const std::string& value);
 	// Returns if the instruction uses the typical register encodings most instructions use
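Given the bool-returning functions described above, each call site in the generated GLSL presumably has to propagate the "shader ended" flag up the call stack. A hypothetical emission sketch (the exact string callFunction() produces is not shown in this diff):

	// Emit: if (fn_X_Y()) { return true; }  -- bubble an END hit out of nested calls
	void callFunctionSketch(std::string& out, const PICA::ShaderGen::Function& func) {
		out += fmt::format("if ({}) {{ return true; }}\n", func.getCallStatement());
	}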
include/PICA/shader_gen.hpp

@@ -3,6 +3,7 @@

 #include "PICA/gpu.hpp"
 #include "PICA/pica_frag_config.hpp"
+#include "PICA/pica_vert_config.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen_types.hpp"
 #include "helpers.hpp"
@@ -31,6 +32,8 @@ namespace PICA::ShaderGen {
 	FragmentGenerator(API api, Language language) : api(api), language(language) {}
 	std::string generate(const PICA::FragmentConfig& config, void* driverInfo = nullptr);
 	std::string getDefaultVertexShader();
+	// For when PICA shader acceleration is enabled. Turn the PICA shader source into a proper vertex shader
+	std::string getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader);

 	void setTarget(API api, Language language) {
 		this->api = api;
include/PICA/shader_unit.hpp

@@ -2,10 +2,9 @@
 #include "PICA/shader.hpp"

 class ShaderUnit {
-
-  public:
-	PICAShader vs;  // Vertex shader
-	PICAShader gs;  // Geometry shader
+  public:
+	PICAShader vs;  // Vertex shader
+	PICAShader gs;  // Geometry shader

 	ShaderUnit() : vs(ShaderType::Vertex), gs(ShaderType::Geometry) {}
 	void reset();
include/align.hpp (new file, 99 lines)

// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)

#pragma once

#include <cstdlib>

#include "helpers.hpp"

#ifdef _MSC_VER
#include <malloc.h>
#endif

namespace Common {
	template <typename T>
	constexpr bool isAligned(T value, unsigned int alignment) {
		return (value % static_cast<T>(alignment)) == 0;
	}

	template <typename T>
	constexpr T alignUp(T value, unsigned int alignment) {
		return (value + static_cast<T>(alignment - 1)) / static_cast<T>(alignment) * static_cast<T>(alignment);
	}

	template <typename T>
	constexpr T alignDown(T value, unsigned int alignment) {
		return value / static_cast<T>(alignment) * static_cast<T>(alignment);
	}

	template <typename T>
	constexpr bool isAlignedPow2(T value, unsigned int alignment) {
		return (value & static_cast<T>(alignment - 1)) == 0;
	}

	template <typename T>
	constexpr T alignUpPow2(T value, unsigned int alignment) {
		return (value + static_cast<T>(alignment - 1)) & static_cast<T>(~static_cast<T>(alignment - 1));
	}

	template <typename T>
	constexpr T alignDownPow2(T value, unsigned int alignment) {
		return value & static_cast<T>(~static_cast<T>(alignment - 1));
	}

	template <typename T>
	constexpr bool isPow2(T value) {
		return (value & (value - 1)) == 0;
	}

	template <typename T>
	constexpr T previousPow2(T value) {
		if (value == static_cast<T>(0)) return 0;

		value |= (value >> 1);
		value |= (value >> 2);
		value |= (value >> 4);
		if constexpr (sizeof(T) >= 16) value |= (value >> 8);
		if constexpr (sizeof(T) >= 32) value |= (value >> 16);
		if constexpr (sizeof(T) >= 64) value |= (value >> 32);
		return value - (value >> 1);
	}

	template <typename T>
	constexpr T nextPow2(T value) {
		// https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
		if (value == static_cast<T>(0)) return 0;

		value--;
		value |= (value >> 1);
		value |= (value >> 2);
		value |= (value >> 4);
		if constexpr (sizeof(T) >= 16) value |= (value >> 8);
		if constexpr (sizeof(T) >= 32) value |= (value >> 16);
		if constexpr (sizeof(T) >= 64) value |= (value >> 32);
		value++;
		return value;
	}

	ALWAYS_INLINE static void* alignedMalloc(size_t size, size_t alignment) {
#ifdef _MSC_VER
		return _aligned_malloc(size, alignment);
#else
		// Unaligned sizes are slow on macOS.
#ifdef __APPLE__
		if (isPow2(alignment)) size = (size + alignment - 1) & ~(alignment - 1);
#endif
		void* ret = nullptr;
		return (posix_memalign(&ret, alignment, size) == 0) ? ret : nullptr;
#endif
	}

	ALWAYS_INLINE static void alignedFree(void* ptr) {
#ifdef _MSC_VER
		_aligned_free(ptr);
#else
		free(ptr);
#endif
	}
}  // namespace Common
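A quick usage sketch of the alignment helpers. One caveat worth flagging: the `sizeof(T) >= 16/32/64` guards in previousPow2/nextPow2 read like bit widths, but sizeof() is measured in bytes, so for the common u32/u64 cases the `>> 8` and wider folds are skipped; this is carried over verbatim from the Duckstation source and appears to limit correct results to values below 256. The snippet below only relies on the plain alignment helpers:

	#include "align.hpp"

	// Align a vertex data size up to a 4-byte boundary, matching what
	// draw_acceleration.cpp does manually with (bytes + 3) & ~3
	const u32 bytes = 37;
	const u32 aligned = Common::alignUp(bytes, 4);      // 40
	const bool ok = Common::isAlignedPow2(aligned, 4);  // true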
include/config.hpp

@@ -20,11 +20,13 @@ struct EmulatorConfig {
 #else
 	static constexpr bool ubershaderDefault = true;
 #endif

+	static constexpr bool accelerateShadersDefault = true;
+
 	bool shaderJitEnabled = shaderJitDefault;
-	bool discordRpcEnabled = false;
 	bool useUbershaders = ubershaderDefault;
+	bool accelerateShaders = accelerateShadersDefault;
 	bool accurateShaderMul = false;
+	bool discordRpcEnabled = false;

 	// Toggles whether to force shadergen when there's more than N lights active and we're using the ubershader, for better performance
 	bool forceShadergenForLights = true;
include/renderdoc.hpp

@@ -35,4 +35,35 @@ namespace Renderdoc {
 	static void setOutputDir(const std::string& path, const std::string& prefix) {}
 	static constexpr bool isSupported() { return false; }
 }  // namespace Renderdoc
 #endif
 #endif

+namespace Renderdoc {
+	// RAII scope class that encloses a Renderdoc capture, as long as it's triggered by triggerCapture
+	struct Scope {
+		Scope() { Renderdoc::startCapture(); }
+		~Scope() { Renderdoc::endCapture(); }
+
+		Scope(const Scope&) = delete;
+		Scope& operator=(const Scope&) = delete;
+
+		Scope(Scope&&) = delete;
+		Scope& operator=(const Scope&&) = delete;
+	};
+
+	// RAII scope class that encloses a Renderdoc capture. Unlike regular Scope it doesn't wait for a trigger, it will always issue the capture
+	// trigger on its own and take a capture
+	struct InstantScope {
+		InstantScope() {
+			Renderdoc::triggerCapture();
+			Renderdoc::startCapture();
+		}
+
+		~InstantScope() { Renderdoc::endCapture(); }
+
+		InstantScope(const InstantScope&) = delete;
+		InstantScope& operator=(const InstantScope&) = delete;
+
+		InstantScope(InstantScope&&) = delete;
+		InstantScope& operator=(const InstantScope&&) = delete;
+	};
+}  // namespace Renderdoc
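These scope types make captures composable with early returns. A usage sketch (the surrounding function and renderFrame() are illustrative):

	void debugDrawOnce() {
		// Captures everything submitted until end of scope; with Scope the capture only
		// happens if one was previously requested via Renderdoc::triggerCapture()
		Renderdoc::Scope scope;
		renderFrame();  // hypothetical draw submission
	}  // ~Scope() ends the capture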
include/renderer.hpp

@@ -1,9 +1,10 @@
 #pragma once
 #include <array>
+#include <optional>
 #include <span>
 #include <string>
-#include <optional>

+#include "PICA/draw_acceleration.hpp"
 #include "PICA/pica_vertex.hpp"
 #include "PICA/regs.hpp"
 #include "helpers.hpp"
@@ -21,9 +22,11 @@ enum class RendererType : s8 {
 };

 struct EmulatorConfig;
-class GPU;
 struct SDL_Window;

+class GPU;
+class ShaderUnit;
+
 class Renderer {
   protected:
 	GPU& gpu;
@@ -77,7 +80,11 @@ class Renderer {
 	virtual std::string getUbershader() { return ""; }
 	virtual void setUbershader(const std::string& shader) {}

-	virtual void setUbershaderSetting(bool value) {}
+	// This function is called on every draw call before parsing vertex data.
+	// It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between
+	// ubershaders and shadergen, and so on.
+	// Returns whether this draw is eligible for using hardware-accelerated shaders or if shaders should run on the CPU
+	virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) { return false; }

 	// Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window
 #ifdef PANDA3DS_FRONTEND_QT
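The prepareForDraw contract described in that comment lends itself to a minimal override sketch (the body below is illustrative, not the actual RendererGL implementation; constructor and other overrides omitted):

	class MyRenderer final : public Renderer {
		bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) override {
			if (accel == nullptr || !accel->canBeAccelerated) {
				return false;  // run shaders on the CPU for this draw
			}
			// Look up or recompile the vertex/fragment shaders for the current PICA state here...
			return true;  // draw is eligible for hw-accelerated shaders
		}
	};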
include/renderer_gl/gl_state.hpp

@@ -38,7 +38,6 @@ struct GLStateManager {

 	GLuint stencilMask;
 	GLuint boundVAO;
-	GLuint boundVBO;
 	GLuint currentProgram;
 	GLuint boundUBO;
@@ -173,13 +172,6 @@ struct GLStateManager {
 		}
 	}

-	void bindVBO(GLuint handle) {
-		if (boundVBO != handle) {
-			boundVBO = handle;
-			glBindBuffer(GL_ARRAY_BUFFER, handle);
-		}
-	}
-
 	void useProgram(GLuint handle) {
 		if (currentProgram != handle) {
 			currentProgram = handle;
@@ -195,7 +187,6 @@ struct GLStateManager {
 	}

 	void bindVAO(const OpenGL::VertexArray& vao) { bindVAO(vao.handle()); }
-	void bindVBO(const OpenGL::VertexBuffer& vbo) { bindVBO(vbo.handle()); }
 	void useProgram(const OpenGL::Program& program) { useProgram(program.handle()); }

 	void setColourMask(bool r, bool g, bool b, bool a) {
include/renderer_gl/renderer_gl.hpp

@@ -3,15 +3,20 @@
 #include <array>
 #include <cstring>
 #include <functional>
+#include <memory>
+#include <optional>
 #include <span>
 #include <unordered_map>
+#include <utility>

 #include "PICA/float_types.hpp"
 #include "PICA/pica_frag_config.hpp"
 #include "PICA/pica_hash.hpp"
+#include "PICA/pica_vert_config.hpp"
 #include "PICA/pica_vertex.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen.hpp"
+#include "gl/stream_buffer.h"
 #include "gl_driver.hpp"
 #include "gl_state.hpp"
 #include "helpers.hpp"
@@ -29,9 +34,11 @@ class RendererGL final : public Renderer {
 	OpenGL::Program triangleProgram;
 	OpenGL::Program displayProgram;

-	OpenGL::VertexArray vao;
+	// VAO for when not using accelerated vertex shaders. Contains attribute declarations matching to the PICA fixed function fragment attributes
+	OpenGL::VertexArray defaultVAO;
+	// VAO for when using accelerated vertex shaders. The PICA vertex shader inputs are passed as attributes without CPU processing.
+	OpenGL::VertexArray hwShaderVAO;
 	OpenGL::VertexBuffer vbo;
 	bool enableUbershader = true;

 	// Data
 	struct {
@@ -54,6 +61,21 @@ class RendererGL final : public Renderer {
 	float oldDepthScale = -1.0;
 	float oldDepthOffset = 0.0;
 	bool oldDepthmapEnable = false;
+	// Set by prepareForDraw, tells us whether the current draw is using hw-accelerated shader
+	bool usingAcceleratedShader = false;
+	bool performIndexedRender = false;
+	bool usingShortIndices = false;
+
+	// Set by prepareForDraw, metadata for indexed renders
+	GLuint minimumIndex = 0;
+	GLuint maximumIndex = 0;
+	void* hwIndexBufferOffset = nullptr;
+
+	// When doing hw shaders, we cache which attributes are enabled in our VAO to avoid having to enable/disable all attributes on each draw
+	u32 previousAttributeMask = 0;
+
+	// Cached pointer to the current vertex shader when using HW accelerated shaders
+	OpenGL::Shader* generatedVertexShader = nullptr;

 	SurfaceCache<DepthBuffer, 16, true> depthBufferCache;
 	SurfaceCache<ColourBuffer, 16, true> colourBufferCache;
@@ -71,12 +93,51 @@ class RendererGL final : public Renderer {
 	// We can compile this once and then link it with all other generated fragment shaders
 	OpenGL::Shader defaultShadergenVs;
 	GLuint shadergenFragmentUBO;
+	// UBO for uploading the PICA uniforms when using hw shaders
+	GLuint hwShaderUniformUBO;
+
+	using StreamBuffer = OpenGLStreamBuffer;
+	std::unique_ptr<StreamBuffer> hwVertexBuffer;
+	std::unique_ptr<StreamBuffer> hwIndexBuffer;
+
+	// Cache of fixed attribute values so that we don't do any duplicate updates
+	std::array<std::array<float, 4>, 16> fixedAttrValues;

 	// Cached recompiled fragment shader
 	struct CachedProgram {
 		OpenGL::Program program;
 	};
-	std::unordered_map<PICA::FragmentConfig, CachedProgram> shaderCache;

+	struct ShaderCache {
+		std::unordered_map<PICA::VertConfig, std::optional<OpenGL::Shader>> vertexShaderCache;
+		std::unordered_map<PICA::FragmentConfig, OpenGL::Shader> fragmentShaderCache;
+
+		// Program cache indexed by GLuints for the vertex and fragment shader to use
+		// Top 32 bits are the vertex shader GLuint, bottom 32 bits are the fs GLuint
+		std::unordered_map<u64, CachedProgram> programCache;
+
+		void clear() {
+			for (auto& it : programCache) {
+				CachedProgram& cachedProgram = it.second;
+				cachedProgram.program.free();
+			}
+
+			for (auto& it : vertexShaderCache) {
+				if (it.second.has_value()) {
+					it.second->free();
+				}
+			}
+
+			for (auto& it : fragmentShaderCache) {
+				it.second.free();
+			}
+
+			programCache.clear();
+			vertexShaderCache.clear();
+			fragmentShaderCache.clear();
+		}
+	};
+	ShaderCache shaderCache;

 	OpenGL::Framebuffer getColourFBO();
 	OpenGL::Texture getTexture(Texture& tex);
@@ -95,6 +156,8 @@ class RendererGL final : public Renderer {
 	void updateFogLUT();
 	void initGraphicsContextInternal();

+	void accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel);
+
   public:
 	RendererGL(GPU& gpu, const std::array<u32, regNum>& internalRegs, const std::array<u32, extRegNum>& externalRegs)
 		: Renderer(gpu, internalRegs, externalRegs), fragShaderGen(PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL) {}
@@ -112,15 +175,13 @@ class RendererGL final : public Renderer {
 	virtual bool supportsShaderReload() override { return true; }
 	virtual std::string getUbershader() override;
 	virtual void setUbershader(const std::string& shader) override;

-	virtual void setUbershaderSetting(bool value) override { enableUbershader = value; }
+	virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) override;

 	std::optional<ColourBuffer> getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true);

 	// Note: The caller is responsible for deleting the currently bound FBO before calling this
 	void setFBO(uint handle) { screenFramebuffer.m_handle = handle; }
 	void resetStateManager() { gl.reset(); }
 	void clearShaderCache();
 	void initUbershader(OpenGL::Program& program);

 #ifdef PANDA3DS_FRONTEND_QT
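The "top 32 bits vertex, bottom 32 bits fragment" program-cache comment implies key construction along these lines (a sketch; the actual lookup helper isn't shown in this header):

	// Pack two GL shader handles into one 64-bit program-cache key
	u64 makeProgramKey(GLuint vs, GLuint fs) {
		return (u64(vs) << 32) | u64(fs);
	}
	// Usage: CachedProgram& prog = shaderCache.programCache[makeProgramKey(vsHandle, fsHandle)];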
src/config.cpp

@@ -67,6 +67,7 @@ void EmulatorConfig::load() {
 	vsyncEnabled = toml::find_or<toml::boolean>(gpu, "EnableVSync", true);
 	useUbershaders = toml::find_or<toml::boolean>(gpu, "UseUbershaders", ubershaderDefault);
 	accurateShaderMul = toml::find_or<toml::boolean>(gpu, "AccurateShaderMultiplication", false);
+	accelerateShaders = toml::find_or<toml::boolean>(gpu, "AccelerateShaders", accelerateShadersDefault);

 	forceShadergenForLights = toml::find_or<toml::boolean>(gpu, "ForceShadergenForLighting", true);
 	lightShadergenThreshold = toml::find_or<toml::integer>(gpu, "ShadergenLightThreshold", 1);
@@ -141,6 +142,7 @@ void EmulatorConfig::save() {
 	data["GPU"]["UseUbershaders"] = useUbershaders;
 	data["GPU"]["ForceShadergenForLighting"] = forceShadergenForLights;
 	data["GPU"]["ShadergenLightThreshold"] = lightShadergenThreshold;
+	data["GPU"]["AccelerateShaders"] = accelerateShaders;
 	data["GPU"]["EnableRenderdoc"] = enableRenderdoc;

 	data["Audio"]["DSPEmulation"] = std::string(Audio::DSPCore::typeToString(dspType));
src/core/PICA/draw_acceleration.cpp (new file, 148 lines)

#include "PICA/draw_acceleration.hpp"

#include <bit>
#include <limits>

#include "PICA/gpu.hpp"
#include "PICA/regs.hpp"

void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
	accel.indexed = indexed;
	accel.totalAttribCount = totalAttribCount;
	accel.enabledAttributeMask = 0;

	const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16;
	const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];  // Total # of vertices to transfer

	if (indexed) {
		u32 indexBufferConfig = regs[PICA::InternalRegs::IndexBufferConfig];
		u32 indexBufferPointer = vertexBase + (indexBufferConfig & 0xfffffff);

		u8* indexBuffer = getPointerPhys<u8>(indexBufferPointer);
		u16 minimumIndex = std::numeric_limits<u16>::max();
		u16 maximumIndex = 0;

		// Check whether the index buffer uses u16 indices or u8
		accel.useShortIndices = Helpers::getBit<31>(indexBufferConfig);  // Indicates whether vert indices are 16-bit or 8-bit

		// Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them
		if (accel.useShortIndices) {
			u16* indexBuffer16 = reinterpret_cast<u16*>(indexBuffer);

			for (int i = 0; i < vertexCount; i++) {
				u16 index = indexBuffer16[i];
				minimumIndex = std::min(minimumIndex, index);
				maximumIndex = std::max(maximumIndex, index);
			}
		} else {
			for (int i = 0; i < vertexCount; i++) {
				u16 index = u16(indexBuffer[i]);
				minimumIndex = std::min(minimumIndex, index);
				maximumIndex = std::max(maximumIndex, index);
			}
		}

		accel.indexBuffer = indexBuffer;
		accel.minimumIndex = minimumIndex;
		accel.maximumIndex = maximumIndex;
	} else {
		accel.indexBuffer = nullptr;
		accel.minimumIndex = regs[PICA::InternalRegs::VertexOffsetReg];
		accel.maximumIndex = accel.minimumIndex + vertexCount - 1;
	}

	const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32);
	const u64 inputAttrCfg = getVertexShaderInputConfig();

	u32 attrCount = 0;
	u32 loaderOffset = 0;
	accel.vertexDataSize = 0;
	accel.totalLoaderCount = 0;

	for (int i = 0; i < PICA::DrawAcceleration::maxLoaderCount; i++) {
		auto& loaderData = attributeInfo[i];  // Get information for this attribute loader

		// This loader is empty, skip it
		if (loaderData.componentCount == 0 || loaderData.size == 0) {
			continue;
		}

		auto& loader = accel.loaders[accel.totalLoaderCount++];

		// The size of the loader in bytes is equal to the bytes supplied for 1 vertex, multiplied by the number of vertices we'll be uploading
		// Which is equal to maximumIndex - minimumIndex + 1
		const u32 bytes = loaderData.size * (accel.maximumIndex - accel.minimumIndex + 1);
		loader.size = bytes;

		// Add it to the total vertex data size, aligned to 4 bytes.
		accel.vertexDataSize += (bytes + 3) & ~3;

		// Get a pointer to the data where this loader's data is stored
		const u32 loaderAddress = vertexBase + loaderData.offset + (accel.minimumIndex * loaderData.size);
		loader.data = getPointerPhys<u8>(loaderAddress);

		u64 attrCfg = loaderData.getConfigFull();  // Get config1 | (config2 << 32)
		u32 attributeOffset = 0;

		for (int component = 0; component < loaderData.componentCount; component++) {
			uint attributeIndex = (attrCfg >> (component * 4)) & 0xf;  // Get index of attribute in vertexCfg

			// Vertex attributes used as padding
			// 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively
			if (attributeIndex >= 12) [[unlikely]] {
				// Align attribute address up to a 4 byte boundary
				attributeOffset = (attributeOffset + 3) & -4;
				attributeOffset += (attributeIndex - 11) << 2;
				continue;
			}

			const u32 attribInfo = (vertexCfg >> (attributeIndex * 4)) & 0xf;
			const u32 attribType = attribInfo & 0x3;  // Type of attribute (sbyte/ubyte/short/float)
			const u32 size = (attribInfo >> 2) + 1;   // Total number of components

			// Size of each component based on the attribute type
			static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
			const u32 inputReg = (inputAttrCfg >> (attributeIndex * 4)) & 0xf;
			// Mark the attribute as enabled
			accel.enabledAttributeMask |= 1 << inputReg;

			auto& attr = accel.attributeInfo[inputReg];
			attr.componentCount = size;
			attr.offset = attributeOffset + loaderOffset;
			attr.stride = loaderData.size;
			attr.type = attribType;
			attributeOffset += size * sizePerComponent[attribType];
		}

		loaderOffset += loader.size;
	}

	u32 fixedAttributes = fixedAttribMask;
	accel.fixedAttributes = 0;

	// Fetch values for all fixed attributes, using CTZ (std::countr_zero) on the fixed attribute mask to find the attributes that are actually fixed
	while (fixedAttributes != 0) {
		// Get index of next fixed attribute and turn it off
		const u32 index = std::countr_zero<u32>(fixedAttributes);
		const u32 mask = 1u << index;
		fixedAttributes ^= mask;

		// PICA register this fixed attribute is meant to go to
		const u32 inputReg = (inputAttrCfg >> (index * 4)) & 0xf;
		const u32 inputRegMask = 1u << inputReg;

		// If this input reg is already used for a non-fixed attribute then it will not be replaced by a fixed attribute
		if ((accel.enabledAttributeMask & inputRegMask) == 0) {
			vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[index];
			auto& attr = accel.attributeInfo[inputReg];

			accel.fixedAttributes |= inputRegMask;

			for (int i = 0; i < 4; i++) {
				attr.fixedValue[i] = fixedAttr[i].toFloat32();
			}
		}
	}

	accel.canBeAccelerated = true;
}
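The min/max scan above exists so that only the vertex range [minimumIndex, maximumIndex] has to be uploaded; note that loaderAddress already bakes in minimumIndex * stride, so vertex 0 of the upload corresponds to original index minimumIndex. A hedged sketch of how a GL backend could consume this (the actual RendererGL draw path is not shown in this diff; the basevertex approach is one plausible option):

	// Rebase indices at draw time to match the minimumIndex-based upload
	const GLenum indexType = accel.useShortIndices ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE;
	glDrawElementsBaseVertex(GL_TRIANGLES, vertexCount, indexType, hwIndexBufferOffset, -GLint(accel.minimumIndex));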
src/core/PICA/gpu.cpp

@@ -117,37 +117,62 @@ void GPU::reset() {
 	externalRegs[Framebuffer1Config] = static_cast<u32>(PICA::ColorFmt::RGB8);
 	externalRegs[Framebuffer1Select] = 0;

-	renderer->setUbershaderSetting(config.useUbershaders);
 	renderer->reset();
 }

-// Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
-// And whether we are going to use the shader JIT (second template parameter)
-void GPU::drawArrays(bool indexed) {
-	const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
-
-	if (indexed) {
-		if (shaderJITEnabled)
-			drawArrays<true, true>();
-		else
-			drawArrays<true, false>();
-	} else {
-		if (shaderJITEnabled)
-			drawArrays<false, true>();
-		else
-			drawArrays<false, false>();
-	}
-}
-
 static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;

-template <bool indexed, bool useShaderJIT>
-void GPU::drawArrays() {
-	if constexpr (useShaderJIT) {
-		shaderJIT.prepare(shaderUnit.vs);
+// Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
+// And whether we are going to use the shader JIT (second template parameter)
+void GPU::drawArrays(bool indexed) {
+	PICA::DrawAcceleration accel;
+
+	if (config.accelerateShaders) {
+		// If we are potentially going to use hw shaders, gather the info necessary to do vertex fetch, index buffering, etc on the GPU
+		// This includes parsing which vertices to upload, getting pointers to the index buffer data & vertex data, and so on
+		getAcceleratedDrawInfo(accel, indexed);
+	}
+
+	setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]);
+	const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel);
+
+	if (hwShaders) {
+		// Hardware shaders have their own accelerated code path for draws, so they skip everything here
+		const PICA::PrimType primType = static_cast<PICA::PrimType>(Helpers::getBits<8, 2>(regs[PICA::InternalRegs::PrimitiveConfig]));
+		// Total # of vertices to render
+		const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];
+
+		// Note: In the hardware shader path the vertices span shouldn't actually be used as the renderer will perform its own attribute fetching
+		renderer->drawVertices(primType, std::span(vertices).first(vertexCount));
+	} else {
+		const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
+
+		if (indexed) {
+			if (shaderJITEnabled) {
+				drawArrays<true, ShaderExecMode::JIT>();
+			} else {
+				drawArrays<true, ShaderExecMode::Interpreter>();
+			}
+		} else {
+			if (shaderJITEnabled) {
+				drawArrays<false, ShaderExecMode::JIT>();
+			} else {
+				drawArrays<false, ShaderExecMode::Interpreter>();
+			}
+		}
+	}
+}
+
+template <bool indexed, ShaderExecMode mode>
+void GPU::drawArrays() {
+	if constexpr (mode == ShaderExecMode::JIT) {
+		shaderJIT.prepare(shaderUnit.vs);
+	} else if constexpr (mode == ShaderExecMode::Hardware) {
+		// Hardware shaders have their own accelerated code path for draws, so they're not meant to take this path
+		Helpers::panic("GPU::DrawArrays: Hardware shaders shouldn't take this path!");
 	}

 	// We can have up to 16 attributes, each one consisting of 4 floats
 	constexpr u32 maxAttrSizeInFloats = 16 * 4;

 	// Base address for vertex attributes
 	// The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible
@@ -312,8 +337,6 @@ void GPU::drawArrays() {
 	}

 	// Fill the remaining attribute lanes with default parameters (1.0 for alpha/w, 0.0) for everything else
-	// Corgi does this although I'm not sure if it's actually needed for anything.
-	// TODO: Find out
 	while (component < 4) {
 		attribute[component] = (component == 3) ? f24::fromFloat32(1.0) : f24::fromFloat32(0.0);
 		component++;
@@ -327,13 +350,13 @@ void GPU::drawArrays() {

 	// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
 	// Based on the SH_ATTRIBUTES_PERMUTATION registers.
-	// Ie it might attribute #0 to v2, #1 to v7, etc
+	// Ie it might map attribute #0 to v2, #1 to v7, etc
 	for (int j = 0; j < totalAttribCount; j++) {
 		const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
 		std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
 	}

-	if constexpr (useShaderJIT) {
+	if constexpr (mode == ShaderExecMode::JIT) {
 		shaderJIT.run(shaderUnit.vs);
 	} else {
 		shaderUnit.vs.run();
@@ -249,6 +249,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 	// If we've reached 3 verts, issue a draw call
 	// Handle rendering depending on the primitive type
 	if (immediateModeVertIndex == 3) {
+		renderer->prepareForDraw(shaderUnit, nullptr);
 		renderer->drawVertices(PICA::PrimType::TriangleList, immediateModeVertices);

 		switch (primType) {
@@ -300,7 +301,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 	}

 	case VertexBoolUniform: {
-		shaderUnit.vs.boolUniform = value & 0xffff;
+		shaderUnit.vs.uploadBoolUniform(value & 0xffff);
 		break;
 	}
src/core/PICA/shader_decompiler.cpp

@@ -1,5 +1,10 @@
 #include "PICA/shader_decompiler.hpp"

+#include <fmt/format.h>
+
+#include <array>
+#include <cassert>
+
 #include "config.hpp"

 using namespace PICA;
@@ -13,11 +18,45 @@ void ControlFlow::analyze(const PICAShader& shader, u32 entrypoint) {
 	analysisFailed = false;

 	const Function* function = addFunction(shader, entrypoint, PICAShader::maxInstructionCount);
-	if (function == nullptr) {
+	if (function == nullptr || function->exitMode != ExitMode::AlwaysEnd) {
 		analysisFailed = true;
 	}
 }

+// Helpers for merging parallel/series exit methods, from Citra
+// Merges exit method of two parallel branches.
+static ExitMode exitParallel(ExitMode a, ExitMode b) {
+	if (a == ExitMode::Unknown) {
+		return b;
+	} else if (b == ExitMode::Unknown) {
+		return a;
+	} else if (a == b) {
+		return a;
+	}
+	return ExitMode::Conditional;
+}
+
+// Cascades exit method of two blocks of code.
+static ExitMode exitSeries(ExitMode a, ExitMode b) {
+	assert(a != ExitMode::AlwaysEnd);
+
+	if (a == ExitMode::Unknown) {
+		return ExitMode::Unknown;
+	}
+
+	if (a == ExitMode::AlwaysReturn) {
+		return b;
+	}
+
+	if (b == ExitMode::Unknown || b == ExitMode::AlwaysEnd) {
+		return ExitMode::AlwaysEnd;
+	}
+
+	return ExitMode::Conditional;
+}
+
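A few concrete evaluations of the two helpers above, following mechanically from their code:

	// Worked examples of the exit-mode merge rules
	assert(exitParallel(ExitMode::AlwaysEnd, ExitMode::AlwaysReturn) == ExitMode::Conditional);  // one branch ends, the other returns
	assert(exitParallel(ExitMode::Unknown, ExitMode::AlwaysEnd) == ExitMode::AlwaysEnd);         // Unknown defers to the analyzed branch
	assert(exitSeries(ExitMode::AlwaysReturn, ExitMode::AlwaysEnd) == ExitMode::AlwaysEnd);      // fall through, then the next block always ends
	assert(exitSeries(ExitMode::Conditional, ExitMode::AlwaysEnd) == ExitMode::AlwaysEnd);       // whichever way the first block goes, execution ends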
 ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 end, Function::Labels& labels) {
 	// Initialize exit mode to unknown by default, in order to detect things like unending loops
 	auto [it, inserted] = exitMap.emplace(AddressRange(start, end), ExitMode::Unknown);
@@ -32,25 +71,132 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 		const u32 opcode = instruction >> 26;

 		switch (opcode) {
-			case ShaderOpcodes::JMPC: Helpers::panic("Unimplemented control flow operation (JMPC)");
-			case ShaderOpcodes::JMPU: Helpers::panic("Unimplemented control flow operation (JMPU)");
-			case ShaderOpcodes::IFU: Helpers::panic("Unimplemented control flow operation (IFU)");
-			case ShaderOpcodes::IFC: Helpers::panic("Unimplemented control flow operation (IFC)");
-			case ShaderOpcodes::CALL: Helpers::panic("Unimplemented control flow operation (CALL)");
-			case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)");
-			case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)");
-			case ShaderOpcodes::LOOP: Helpers::panic("Unimplemented control flow operation (LOOP)");
-			case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second;
+			case ShaderOpcodes::JMPC:
+			case ShaderOpcodes::JMPU: {
+				const u32 dest = getBits<10, 12>(instruction);
+				// Register this jump address to our outLabels set
+				labels.insert(dest);
+
+				// This opens up 2 parallel paths of execution
+				auto branchTakenExit = analyzeFunction(shader, dest, end, labels);
+				auto branchNotTakenExit = analyzeFunction(shader, pc + 1, end, labels);
+				it->second = exitParallel(branchTakenExit, branchNotTakenExit);
+				return it->second;
+			}
+
+			case ShaderOpcodes::IFU:
+			case ShaderOpcodes::IFC: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+
+				const Function* branchTakenFunc = addFunction(shader, pc + 1, dest);
+				// Check if analysis of the branch taken func failed and return unknown if it did
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				// Next analyze the not taken func
+				ExitMode branchNotTakenExitMode = ExitMode::AlwaysReturn;
+				if (num != 0) {
+					const Function* branchNotTakenFunc = addFunction(shader, dest, dest + num);
+					// Check if analysis failed and return unknown if it did
+					if (analysisFailed) {
+						it->second = ExitMode::Unknown;
+						return it->second;
+					}
+
+					branchNotTakenExitMode = branchNotTakenFunc->exitMode;
+				}
+
+				auto parallel = exitParallel(branchTakenFunc->exitMode, branchNotTakenExitMode);
+				// Both branches of the if/else end, so there's nothing after the call
+				if (parallel == ExitMode::AlwaysEnd) {
+					it->second = parallel;
+					return it->second;
+				} else {
+					ExitMode afterConditional = analyzeFunction(shader, dest + num, end, labels);
+					ExitMode conditionalExitMode = exitSeries(parallel, afterConditional);
+					it->second = conditionalExitMode;
+					return it->second;
+				}
+				break;
+			}
+
+			case ShaderOpcodes::CALL: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* calledFunction = addFunction(shader, dest, dest + num);
+
+				// Check if analysis of the branch taken func failed and return unknown if it did
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				if (calledFunction->exitMode == ExitMode::AlwaysEnd) {
+					it->second = ExitMode::AlwaysEnd;
+					return it->second;
+				}
+
+				// Exit mode of the remainder of this function, after we return from the callee
+				const ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels);
+				const ExitMode exitMode = exitSeries(calledFunction->exitMode, postCallExitMode);
+
+				it->second = exitMode;
+				return exitMode;
+			}
+
+			case ShaderOpcodes::CALLC:
+			case ShaderOpcodes::CALLU: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* calledFunction = addFunction(shader, dest, dest + num);
+
+				// Check if analysis of the branch taken func failed and return unknown if it did
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				// Exit mode of the remainder of this function, after we return from the callee
+				const ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels);
+				const ExitMode exitMode = exitSeries(exitParallel(calledFunction->exitMode, ExitMode::AlwaysReturn), postCallExitMode);
+
+				it->second = exitMode;
+				return exitMode;
+			}
+
+			case ShaderOpcodes::LOOP: {
+				u32 dest = getBits<10, 12>(instruction);
+				const Function* loopFunction = addFunction(shader, pc + 1, dest + 1);
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				if (loopFunction->exitMode == ExitMode::AlwaysEnd) {
+					it->second = ExitMode::AlwaysEnd;
+					return it->second;
+				}
+
+				const ExitMode afterLoop = analyzeFunction(shader, dest + 1, end, labels);
+				const ExitMode exitMode = exitSeries(loopFunction->exitMode, afterLoop);
+				it->second = exitMode;
+				return it->second;
+			}
+
+			case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second;
+			default: break;
 		}
 	}

 	// A function without control flow instructions will always reach its "return point" and return
-	return ExitMode::AlwaysReturn;
+	it->second = ExitMode::AlwaysReturn;
+	return it->second;
 }

-void ShaderDecompiler::compileRange(const AddressRange& range) {
+std::pair<u32, bool> ShaderDecompiler::compileRange(const AddressRange& range) {
 	u32 pc = range.start;
 	const u32 end = range.end >= range.start ? range.end : PICAShader::maxInstructionCount;
 	bool finished = false;
@@ -58,6 +204,8 @@ void ShaderDecompiler::compileRange(const AddressRange& range) {
 	while (pc < end && !finished) {
 		compileInstruction(pc, finished);
 	}

+	return std::make_pair(pc, finished);
 }

 const Function* ShaderDecompiler::findFunction(const AddressRange& range) {
@ -71,20 +219,43 @@ const Function* ShaderDecompiler::findFunction(const AddressRange& range) {
|
|||
}
|
||||
|
||||
void ShaderDecompiler::writeAttributes() {
|
||||
// Annoyingly, GLES does not support having an array as an input attribute, so declare each attribute separately for now
|
||||
decompiledShader += R"(
|
||||
layout(location = 0) in vec4 inputs[8];
|
||||
layout(location = 0) in vec4 attr0;
|
||||
layout(location = 1) in vec4 attr1;
|
||||
layout(location = 2) in vec4 attr2;
|
||||
layout(location = 3) in vec4 attr3;
|
||||
layout(location = 4) in vec4 attr4;
|
||||
layout(location = 5) in vec4 attr5;
|
||||
layout(location = 6) in vec4 attr6;
|
||||
layout(location = 7) in vec4 attr7;
|
||||
layout(location = 8) in vec4 attr8;
|
||||
layout(location = 9) in vec4 attr9;
|
||||
layout(location = 10) in vec4 attr10;
|
||||
layout(location = 11) in vec4 attr11;
|
||||
layout(location = 12) in vec4 attr12;
|
||||
layout(location = 13) in vec4 attr13;
|
||||
layout(location = 14) in vec4 attr14;
|
||||
layout(location = 15) in vec4 attr15;
|
||||
|
||||
layout(std140) uniform PICAShaderUniforms {
|
||||
vec4 uniform_float[96];
|
||||
uvec4 uniform_int;
|
||||
uint uniform_bool;
|
||||
};
|
||||
|
||||
vec4 temp_registers[16];
|
||||
vec4 dummy_vec = vec4(0.0);
|
||||
layout(std140) uniform PICAShaderUniforms {
|
||||
vec4 uniform_f[96];
|
||||
uvec4 uniform_i;
|
||||
uint uniform_bool;
|
||||
};
|
||||
|
||||
vec4 temp[16];
|
||||
vec4 out_regs[16];
|
||||
vec4 dummy_vec = vec4(0.0);
|
||||
ivec3 addr_reg = ivec3(0);
|
||||
bvec2 cmp_reg = bvec2(false);
|
||||
|
||||
vec4 uniform_indexed(int source, int offset) {
|
||||
int clipped_offs = (offset >= -128 && offset <= 127) ? offset : 0;
|
||||
uint index = uint(clipped_offs + source) & 127u;
|
||||
return (index < 96u) ? uniform_f[index] : vec4(1.0);
|
||||
}
|
||||
)";
|
||||
|
||||
decompiledShader += "\n";
|
||||
}
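The uniform_indexed helper above reproduces the PICA's relative-addressing rules in GLSL: the signed offset is only honoured inside the encodable [-128, 127] range, the final index wraps to 7 bits, and reads past the 96 float uniforms yield vec4(1.0) instead of an out-of-bounds access. A scalar C++ mirror of the same rule, purely for illustration (the function below is not part of the codebase):

#include <array>
#include <cstdint>

// Illustrative C++ twin of the GLSL uniform_indexed() helper above.
std::array<float, 4> uniformIndexed(const std::array<std::array<float, 4>, 96>& uniforms, int source, int offset) {
    const int clipped = (offset >= -128 && offset <= 127) ? offset : 0;  // out-of-range offsets behave like 0
    const std::uint32_t index = std::uint32_t(clipped + source) & 127u;  // wrap the sum to 7 bits
    return (index < 96u) ? uniforms[index] : std::array<float, 4>{1.f, 1.f, 1.f, 1.f};  // OOB reads return 1.0
}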

std::string ShaderDecompiler::decompile() {

@@ -94,11 +265,14 @@ std::string ShaderDecompiler::decompile() {
        return "";
    }

    decompiledShader = "";
    compilationError = false;
    decompiledShader.clear();
    // Reserve some memory for the shader string to avoid memory allocations
    decompiledShader.reserve(256 * 1024);

    switch (api) {
        case API::GL: decompiledShader += "#version 410 core\n"; break;
        case API::GLES: decompiledShader += "#version 300 es\n"; break;
        case API::GLES: decompiledShader += "#version 300 es\nprecision mediump float;\nprecision mediump int;\n"; break;
        default: break;
    }

@@ -109,7 +283,7 @@ std::string ShaderDecompiler::decompile() {
    decompiledShader += R"(
vec4 safe_mul(vec4 a, vec4 b) {
    vec4 res = a * b;
    return mix(res, mix(mix(vec4(0.0), res, isnan(rhs)), product, isnan(lhs)), isnan(res));
    return mix(res, mix(mix(vec4(0.0), res, isnan(b)), res, isnan(a)), isnan(res));
}
)";
}
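Why safe_mul exists: the PICA defines 0 * inf = 0, whereas IEEE-754 multiplication (and therefore the host GPU) produces NaN; the accurateShaderMul option lets content that depends on the PICA behaviour opt into the slower, faithful emulation. The nested mix() calls encode exactly that rule; a scalar C++ analogue, shown only to unpack the logic:

#include <cmath>

// Scalar analogue of the GLSL safe_mul() above: only a genuine NaN input may
// propagate NaN; a NaN produced by the multiplication itself (ie 0 * inf) becomes 0.
float safeMul(float a, float b) {
    const float res = a * b;
    if (!std::isnan(res)) return res;                      // normal case
    return (std::isnan(a) || std::isnan(b)) ? res : 0.0f;  // 0 * inf -> 0
}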

@@ -121,17 +295,61 @@ std::string ShaderDecompiler::decompile() {

    decompiledShader += "void pica_shader_main() {\n";
    AddressRange mainFunctionRange(entrypoint, PICAShader::maxInstructionCount);
    callFunction(*findFunction(mainFunctionRange));
    decompiledShader += "}\n";
    auto mainFunc = findFunction(mainFunctionRange);

    for (auto& func : controlFlow.functions) {
        if (func.outLabels.size() > 0) {
            Helpers::panic("Function with out labels");
    decompiledShader += mainFunc->getCallStatement() + ";\n}\n";

    for (const Function& func : controlFlow.functions) {
        if (func.outLabels.empty()) {
            decompiledShader += fmt::format("bool {}() {{\n", func.getIdentifier());

            auto [pc, finished] = compileRange(AddressRange(func.start, func.end));
            if (!finished) {
                decompiledShader += "return false;";
            }

            decompiledShader += "}\n";
        } else {
            auto labels = func.outLabels;
            labels.insert(func.start);

            // If a function has jumps and "labels", it has to be emulated with a switch statement,
            // using the current PC as the variable being switched on
            decompiledShader += fmt::format("bool {}() {{\n", func.getIdentifier());
            decompiledShader += fmt::format("uint pc = {}u;\n", func.start);
            decompiledShader += "while(true){\nswitch(pc){\n";

            for (u32 label : labels) {
                decompiledShader += fmt::format("case {}u: {{", label);
                // Fetch the next label whose address > label
                auto it = labels.lower_bound(label + 1);
                u32 next = (it == labels.end()) ? func.end : *it;

                auto [endPC, finished] = compileRange(AddressRange(label, next));
                if (endPC > next && !finished) {
                    labels.insert(endPC);
                    decompiledShader += fmt::format("pc = {}u; break;", endPC);
                }

                // Fall through to the next label
                decompiledShader += "}\n";
            }

            decompiledShader += "default: return false;\n";
            // Exit the switch and the loop
            decompiledShader += "} }\n";

            // Exit the function
            decompiledShader += "return false;\n";
            decompiledShader += "}\n";
        }
    }

    decompiledShader += "void " + func.getIdentifier() + "() {\n";
    compileRange(AddressRange(func.start, func.end));
    decompiledShader += "}\n";
    // We allow some leeway for "compilation errors", in addition to control flow errors, for cases where eg an unimplemented instruction
    // or an instruction that we can't emulate in GLSL is found in the instruction stream. Just like control flow errors, these return an
    // empty string and the renderer core will decide to use CPU shaders instead
    if (compilationError) [[unlikely]] {
        return "";
    }

    return decompiledShader;
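To make the switch-based lowering concrete, here is the rough shape of the GLSL emitted for a hypothetical function covering PICA addresses [16, 48) with one incoming jump target at 32; the identifier and addresses are invented (real names come from Function::getIdentifier). The bool return value reports whether the shader hit END, which is how callFunction later decides to stop.

// Illustrative shape of the emitted GLSL, as a C++ raw string:
const char* exampleEmittedFunction = R"(
bool fn_16_48() {
    uint pc = 16u;
    while (true) {
        switch (pc) {
            case 16u: { /* code for [16, 32); falls through to the next case */ }
            case 32u: { /* code for [32, 48); a jump emits "pc = <target>u; break;" */ }
            default: return false;
        }
    }
    return false;
}
)";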

@@ -139,30 +357,41 @@ std::string ShaderDecompiler::decompile() {

std::string ShaderDecompiler::getSource(u32 source, [[maybe_unused]] u32 index) const {
    if (source < 0x10) {
        return "inputs[" + std::to_string(source) + "]";
        return "attr" + std::to_string(source);
    } else if (source < 0x20) {
        return "temp_registers[" + std::to_string(source - 0x10) + "]";
        return "temp[" + std::to_string(source - 0x10) + "]";
    } else {
        const usize floatIndex = (source - 0x20) & 0x7f;

        if (floatIndex >= 96) [[unlikely]] {
            return "dummy_vec";
        if (index == 0) {
            if (floatIndex >= 96) [[unlikely]] {
                return "dummy_vec";
            }
            return "uniform_f[" + std::to_string(floatIndex) + "]";
        } else {
            static constexpr std::array<const char*, 4> offsets = {"0", "addr_reg.x", "addr_reg.y", "addr_reg.z"};
            return fmt::format("uniform_indexed({}, {})", floatIndex, offsets[index]);
        }
        return "uniform_float[" + std::to_string(floatIndex) + "]";
    }
}

std::string ShaderDecompiler::getDest(u32 dest) const {
    if (dest < 0x10) {
        return "output_registers[" + std::to_string(dest) + "]";
        return "out_regs[" + std::to_string(dest) + "]";
    } else if (dest < 0x20) {
        return "temp_registers[" + std::to_string(dest - 0x10) + "]";
        return "temp[" + std::to_string(dest - 0x10) + "]";
    } else {
        return "dummy_vec";
    }
}

std::string ShaderDecompiler::getSwizzlePattern(u32 swizzle) const {
    // If the swizzle field has this value then the swizzle pattern is .xyzw, so we don't need a shuffle
    static constexpr uint noSwizzle = 0x1B;
    if (swizzle == noSwizzle) {
        return "";
    }

    static constexpr std::array<char, 4> names = {'x', 'y', 'z', 'w'};
    std::string ret(".");

@@ -176,7 +405,6 @@ std::string ShaderDecompiler::getSwizzlePattern(u32 swizzle) const {

std::string ShaderDecompiler::getDestSwizzle(u32 destinationMask) const {
    std::string ret = ".";

    if (destinationMask & 0b1000) {
        ret += "x";
    }

@@ -208,11 +436,12 @@ void ShaderDecompiler::setDest(u32 operandDescriptor, const std::string& dest, c
        return;
    }

    decompiledShader += dest + destSwizzle + " = ";
    if (writtenLaneCount == 1) {
        decompiledShader += "float(" + value + ");\n";
    } else {
        decompiledShader += "vec" + std::to_string(writtenLaneCount) + "(" + value + ");\n";
    // Don't write the destination swizzle if all lanes are getting written to
    decompiledShader += fmt::format("{}{} = ", dest, writtenLaneCount == 4 ? "" : destSwizzle);
    if (writtenLaneCount <= 3) {
        decompiledShader += fmt::format("({}){};\n", value, destSwizzle);
    } else if (writtenLaneCount == 4) {
        decompiledShader += fmt::format("{};\n", value);
    }
}
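Putting getSource, getDest, the swizzle helpers and setDest together, here is how one hypothetical instruction would be emitted (every operand value below is made up):

// Hypothetical "MOV temp1.xz, uniform3.yyzw" (operand values invented):
//   getSource(0x23, 0)      -> "uniform_f[3]"   (0x23 - 0x20 = float uniform 3)
//   getSwizzlePattern(...)  -> ".yyzw"          (appended to the source by the caller)
//   getDest(0x11)           -> "temp[1]"        (0x11 - 0x10 = temporary register 1)
//   getDestSwizzle(0b1010)  -> ".xz"            (mask bits select x and z, two lanes)
// Since only 2 of 4 lanes are written, setDest() then emits:
//   temp[1].xz = (uniform_f[3].yyzw).xz;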

@@ -246,26 +475,101 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {

    std::string dest = getDest(destIndex);

    if (idx != 0) {
        Helpers::panic("GLSL recompiler: Indexed instruction");
    }

    if (invertSources) {
        Helpers::panic("GLSL recompiler: Inverted instruction");
    }

    switch (opcode) {
        case ShaderOpcodes::MOV: setDest(operandDescriptor, dest, src1); break;
        case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, src1 + " + " + src2); break;
        case ShaderOpcodes::MUL: setDest(operandDescriptor, dest, src1 + " * " + src2); break;
        case ShaderOpcodes::MAX: setDest(operandDescriptor, dest, "max(" + src1 + ", " + src2 + ")"); break;
        case ShaderOpcodes::MIN: setDest(operandDescriptor, dest, "min(" + src1 + ", " + src2 + ")"); break;
        case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, fmt::format("{} + {}", src1, src2)); break;
        case ShaderOpcodes::MUL:
            if (!config.accurateShaderMul) {
                setDest(operandDescriptor, dest, fmt::format("{} * {}", src1, src2));
            } else {
                setDest(operandDescriptor, dest, fmt::format("safe_mul({}, {})", src1, src2));
            }
            break;
        case ShaderOpcodes::MAX: setDest(operandDescriptor, dest, fmt::format("max({}, {})", src1, src2)); break;
        case ShaderOpcodes::MIN: setDest(operandDescriptor, dest, fmt::format("min({}, {})", src1, src2)); break;

        case ShaderOpcodes::DP3: setDest(operandDescriptor, dest, "vec4(dot(" + src1 + ".xyz, " + src2 + ".xyz))"); break;
        case ShaderOpcodes::DP4: setDest(operandDescriptor, dest, "vec4(dot(" + src1 + ", " + src2 + "))"); break;
        case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, "vec4(inversesqrt(" + src1 + ".x))"); break;
        case ShaderOpcodes::DP3:
            if (!config.accurateShaderMul) {
                setDest(operandDescriptor, dest, fmt::format("vec4(dot({}.xyz, {}.xyz))", src1, src2));
            } else {
                // A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec3(1.0)
                setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul({}, {}).xyz, vec3(1.0)))", src1, src2));
            }
            break;
        case ShaderOpcodes::DP4:
            if (!config.accurateShaderMul) {
                setDest(operandDescriptor, dest, fmt::format("vec4(dot({}, {}))", src1, src2));
            } else {
                // A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec4(1.0)
                setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul({}, {}), vec4(1.0)))", src1, src2));
            }
            break;
        case ShaderOpcodes::FLR: setDest(operandDescriptor, dest, fmt::format("floor({})", src1)); break;
        case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, fmt::format("vec4(inversesqrt({}.x))", src1)); break;
        case ShaderOpcodes::RCP: setDest(operandDescriptor, dest, fmt::format("vec4(1.0 / {}.x)", src1)); break;
        case ShaderOpcodes::LG2: setDest(operandDescriptor, dest, fmt::format("vec4(log2({}.x))", src1)); break;
        case ShaderOpcodes::EX2: setDest(operandDescriptor, dest, fmt::format("vec4(exp2({}.x))", src1)); break;

        default: Helpers::panic("GLSL recompiler: Unknown common opcode: %X", opcode); break;
        case ShaderOpcodes::SLT:
        case ShaderOpcodes::SLTI: setDest(operandDescriptor, dest, fmt::format("vec4(lessThan({}, {}))", src1, src2)); break;

        case ShaderOpcodes::SGE:
        case ShaderOpcodes::SGEI: setDest(operandDescriptor, dest, fmt::format("vec4(greaterThanEqual({}, {}))", src1, src2)); break;

        case ShaderOpcodes::DPH:
        case ShaderOpcodes::DPHI:
            if (!config.accurateShaderMul) {
                setDest(operandDescriptor, dest, fmt::format("vec4(dot(vec4({}.xyz, 1.0), {}))", src1, src2));
            } else {
                // A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec4(1.0)
                setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul(vec4({}.xyz, 1.0), {}), vec4(1.0)))", src1, src2));
            }
            break;

        case ShaderOpcodes::CMP1:
        case ShaderOpcodes::CMP2: {
            static constexpr std::array<const char*, 8> operators = {
                // The last 2 operators always return true and are handled specially
                "==", "!=", "<", "<=", ">", ">=", "", "",
            };

            const u32 cmpY = getBits<21, 3>(instruction);
            const u32 cmpX = getBits<24, 3>(instruction);

            // Compare x first
            if (cmpX >= 6) {
                decompiledShader += "cmp_reg.x = true;\n";
            } else {
                decompiledShader += fmt::format("cmp_reg.x = {}.x {} {}.x;\n", src1, operators[cmpX], src2);
            }

            // Then compare Y
            if (cmpY >= 6) {
                decompiledShader += "cmp_reg.y = true;\n";
            } else {
                decompiledShader += fmt::format("cmp_reg.y = {}.y {} {}.y;\n", src1, operators[cmpY], src2);
            }
            break;
        }

        case ShaderOpcodes::MOVA: {
            const bool writeX = getBit<3>(operandDescriptor); // Should we write the x component of the address register?
            const bool writeY = getBit<2>(operandDescriptor);

            if (writeX && writeY) {
                decompiledShader += fmt::format("addr_reg.xy = ivec2({}.xy);\n", src1);
            } else if (writeX) {
                decompiledShader += fmt::format("addr_reg.x = int({}.x);\n", src1);
            } else if (writeY) {
                decompiledShader += fmt::format("addr_reg.y = int({}.y);\n", src1);
            }
            break;
        }

        default:
            Helpers::warn("GLSL recompiler: Unknown common opcode: %02X. Falling back to CPU shaders", opcode);
            compilationError = true;
            break;
    }
} else if (opcode >= 0x30 && opcode <= 0x3F) { // MAD and MADI
    const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x1f];

@@ -299,23 +603,156 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
    src3 += getSwizzlePattern(swizzle3);

    std::string dest = getDest(destIndex);

    if (idx != 0) {
        Helpers::panic("GLSL recompiler: Indexed instruction");
    if (!config.accurateShaderMul) {
        setDest(operandDescriptor, dest, fmt::format("{} * {} + {}", src1, src2, src3));
    } else {
        setDest(operandDescriptor, dest, fmt::format("safe_mul({}, {}) + {}", src1, src2, src3));
    }

    setDest(operandDescriptor, dest, src1 + " * " + src2 + " + " + src3);
} else {
    switch (opcode) {
        case ShaderOpcodes::END: finished = true; return;
        default: Helpers::panic("GLSL recompiler: Unknown opcode: %X", opcode); break;
        case ShaderOpcodes::JMPC: {
            const u32 dest = getBits<10, 12>(instruction);
            const u32 condOp = getBits<22, 2>(instruction);
            const uint refY = getBit<24>(instruction);
            const uint refX = getBit<25>(instruction);
            const char* condition = getCondition(condOp, refX, refY);

            decompiledShader += fmt::format("if ({}) {{ pc = {}u; break; }}\n", condition, dest);
            break;
        }

        case ShaderOpcodes::JMPU: {
            const u32 dest = getBits<10, 12>(instruction);
            const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check
            const u32 mask = 1u << bit;
            const u32 test = (instruction & 1) ^ 1; // If the LSB is 0 we jump if bit = 1, otherwise 0

            decompiledShader += fmt::format("if ((uniform_bool & {}u) {} 0u) {{ pc = {}u; break; }}\n", mask, (test != 0) ? "!=" : "==", dest);
            break;
        }

        case ShaderOpcodes::IFU:
        case ShaderOpcodes::IFC: {
            const u32 num = instruction & 0xff;
            const u32 dest = getBits<10, 12>(instruction);
            const Function* conditionalFunc = findFunction(AddressRange(pc + 1, dest));

            if (opcode == ShaderOpcodes::IFC) {
                const u32 condOp = getBits<22, 2>(instruction);
                const uint refY = getBit<24>(instruction);
                const uint refX = getBit<25>(instruction);
                const char* condition = getCondition(condOp, refX, refY);

                decompiledShader += fmt::format("if ({}) {{", condition);
            } else {
                const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check
                const u32 mask = 1u << bit;

                decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{", mask);
            }

            callFunction(*conditionalFunc);
            decompiledShader += "}\n";

            pc = dest;
            if (num > 0) {
                const Function* elseFunc = findFunction(AddressRange(dest, dest + num));
                pc = dest + num;

                decompiledShader += "else { ";
                callFunction(*elseFunc);
                decompiledShader += "}\n";

                if (conditionalFunc->exitMode == ExitMode::AlwaysEnd && elseFunc->exitMode == ExitMode::AlwaysEnd) {
                    finished = true;
                    return;
                }
            }

            return;
        }

        case ShaderOpcodes::CALL:
        case ShaderOpcodes::CALLC:
        case ShaderOpcodes::CALLU: {
            const u32 num = instruction & 0xff;
            const u32 dest = getBits<10, 12>(instruction);
            const Function* calledFunc = findFunction(AddressRange(dest, dest + num));

            // Handle conditions for CALLC/CALLU
            if (opcode == ShaderOpcodes::CALLC) {
                const u32 condOp = getBits<22, 2>(instruction);
                const uint refY = getBit<24>(instruction);
                const uint refX = getBit<25>(instruction);
                const char* condition = getCondition(condOp, refX, refY);

                decompiledShader += fmt::format("if ({}) {{", condition);
            } else if (opcode == ShaderOpcodes::CALLU) {
                const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check
                const u32 mask = 1u << bit;

                decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{", mask);
            }

            callFunction(*calledFunc);

            // Close brackets for CALLC/CALLU
            if (opcode != ShaderOpcodes::CALL) {
                decompiledShader += "}";
            }

            if (opcode == ShaderOpcodes::CALL && calledFunc->exitMode == ExitMode::AlwaysEnd) {
                finished = true;
                return;
            }
            break;
        }

        case ShaderOpcodes::LOOP: {
            const u32 dest = getBits<10, 12>(instruction);
            const u32 uniformIndex = getBits<22, 2>(instruction);

            // loop counter = uniform.y
            decompiledShader += fmt::format("addr_reg.z = int((uniform_i[{}] >> 8u) & 0xFFu);\n", uniformIndex);
            decompiledShader += fmt::format(
                "for (uint loopCtr{} = 0u; loopCtr{} <= (uniform_i[{}] & 0xFFu); loopCtr{}++, addr_reg.z += int((uniform_i[{}] >> "
                "16u) & 0xFFu)) {{\n",
                pc, pc, uniformIndex, pc, uniformIndex
            );

            AddressRange range(pc + 1, dest + 1);
            const Function* func = findFunction(range);
            callFunction(*func);
            decompiledShader += "}\n";

            // Jump to the end of the loop. We don't want to compile the code inside the loop again.
            // This will be incremented by 1 due to the pc++ at the end of this loop.
            pc = dest;

            if (func->exitMode == ExitMode::AlwaysEnd) {
                finished = true;
                return;
            }
            break;
        }

        case ShaderOpcodes::END:
            decompiledShader += "return true;\n";
            finished = true;
            return;

        case ShaderOpcodes::NOP: break;

        default:
            Helpers::warn("GLSL recompiler: Unknown opcode: %02X. Falling back to CPU shaders", opcode);
            compilationError = true;
            break;
    }
}

pc++;
}
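As a concrete illustration of the LOOP lowering above: for a LOOP instruction at pc 42 that reads integer uniform 1 and whose body was split into its own function, the decompiler emits GLSL shaped like this (the body's function name and numbers are invented):

// Illustrative emission. The loop count, initial aL and aL increment live in the low three bytes of the integer uniform:
const char* exampleLoop = R"(
addr_reg.z = int((uniform_i[1] >> 8u) & 0xFFu);                        // initial aL = byte 1 (uniform.y)
for (uint loopCtr42 = 0u; loopCtr42 <= (uniform_i[1] & 0xFFu);         // byte 0 (uniform.x) + 1 iterations
     loopCtr42++, addr_reg.z += int((uniform_i[1] >> 16u) & 0xFFu)) {  // aL += byte 2 (uniform.z) per iteration
    fn_43_100();                                                       // the compiled loop body
}
)";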

bool ShaderDecompiler::usesCommonEncoding(u32 instruction) const {
    const u32 opcode = instruction >> 26;
    switch (opcode) {

@@ -339,16 +776,57 @@ bool ShaderDecompiler::usesCommonEncoding(u32 instruction) const {
        case ShaderOpcodes::SLT:
        case ShaderOpcodes::SLTI:
        case ShaderOpcodes::SGE:
        case ShaderOpcodes::SGEI: return true;
        case ShaderOpcodes::SGEI:
        case ShaderOpcodes::LITP: return true;

        default: return false;
    }
}

void ShaderDecompiler::callFunction(const Function& function) { decompiledShader += function.getCallStatement() + ";\n"; }
void ShaderDecompiler::callFunction(const Function& function) {
    switch (function.exitMode) {
        // This function always ends, so call it and return true to signal that we're gonna be ending the shader
        case ExitMode::AlwaysEnd: decompiledShader += function.getCallStatement() + ";\nreturn true;\n"; break;
        // This function will potentially end. Call it, see if it returns that it ended, and return that we're ending if it did
        case ExitMode::Conditional: decompiledShader += fmt::format("if ({}) {{ return true; }}\n", function.getCallStatement()); break;
        // This function will not end. Just call it like a normal function.
        default: decompiledShader += function.getCallStatement() + ";\n"; break;
    }
}

std::string ShaderGen::decompileShader(PICAShader& shader, EmulatorConfig& config, u32 entrypoint, API api, Language language) {
    ShaderDecompiler decompiler(shader, config, entrypoint, api, language);

    return decompiler.decompile();
}

const char* ShaderDecompiler::getCondition(u32 cond, u32 refX, u32 refY) {
    static constexpr std::array<const char*, 16> conditions = {
        // ref(Y, X) = (0, 0)
        "!all(cmp_reg)",
        "all(not(cmp_reg))",
        "!cmp_reg.x",
        "!cmp_reg.y",

        // ref(Y, X) = (0, 1)
        "cmp_reg.x || !cmp_reg.y",
        "cmp_reg.x && !cmp_reg.y",
        "cmp_reg.x",
        "!cmp_reg.y",

        // ref(Y, X) = (1, 0)
        "!cmp_reg.x || cmp_reg.y",
        "!cmp_reg.x && cmp_reg.y",
        "!cmp_reg.x",
        "cmp_reg.y",

        // ref(Y, X) = (1, 1)
        "any(cmp_reg)",
        "all(cmp_reg)",
        "cmp_reg.x",
        "cmp_reg.y",
    };
    const u32 key = (cond & 0b11) | (refX << 2) | (refY << 3);

    return conditions[key];
}
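A quick worked example of the key computation above (the instruction fields are invented): condOp 1 with refX = 1, refY = 0 asks for "x matches its reference AND y matches its reference", i.e. x true and y false.

const u32 cond = 1, refX = 1, refY = 0;                     // invented encoding fields
const u32 key = (cond & 0b11) | (refX << 2) | (refY << 3);  // = 0b0101 = 5
// conditions[5] == "cmp_reg.x && !cmp_reg.y"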

@@ -1,3 +1,7 @@
#include <fmt/format.h>

#include <utility>

#include "PICA/pica_frag_config.hpp"
#include "PICA/regs.hpp"
#include "PICA/shader_gen.hpp"

@@ -702,6 +706,113 @@ void FragmentGenerator::compileFog(std::string& shader, const PICA::FragmentConf
    shader += "combinerOutput.rgb = mix(fog_color, combinerOutput.rgb, fog_factor);";
}

std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader) {
    // First, calculate output register -> Fixed function fragment semantics based on the VAO config
    // This array contains the mappings for the 32 fixed function semantics (8 variables, with 4 lanes each).
    // Each entry is a pair, containing the output reg to use for this semantic (first) and which lane of that register (second)
    std::array<std::pair<int, int>, 32> outputMappings{};
    // Output registers adjusted according to VS_OUTPUT_MASK, which handles enabling and disabling output attributes
    std::array<u8, 16> vsOutputRegisters;

    {
        uint count = 0;
        u16 outputMask = vertConfig.outputMask;

        // See which registers are actually enabled and ignore the disabled ones
        for (int i = 0; i < 16; i++) {
            if (outputMask & 1) {
                vsOutputRegisters[count++] = i;
            }

            outputMask >>= 1;
        }

        // For the others, map the index to a vs output directly (TODO: What does hw actually do?)
        for (; count < 16; count++) {
            vsOutputRegisters[count] = count;
        }

        for (int i = 0; i < vertConfig.outputCount; i++) {
            const u32 config = vertConfig.outmaps[i];
            for (int j = 0; j < 4; j++) {
                const u32 mapping = (config >> (j * 8)) & 0x1F;
                outputMappings[mapping] = std::make_pair(vsOutputRegisters[i], j);
            }
        }
    }

    auto getSemanticName = [&](u32 semanticIndex) {
        auto [reg, lane] = outputMappings[semanticIndex];
        return fmt::format("out_regs[{}][{}]", reg, lane);
    };

    std::string semantics = fmt::format(
        R"(
    vec4 a_coords = vec4({}, {}, {}, {});
    vec4 a_quaternion = vec4({}, {}, {}, {});
    vec4 a_vertexColour = vec4({}, {}, {}, {});
    vec2 a_texcoord0 = vec2({}, {});
    float a_texcoord0_w = {};
    vec2 a_texcoord1 = vec2({}, {});
    vec2 a_texcoord2 = vec2({}, {});
    vec3 a_view = vec3({}, {}, {});
)",
        getSemanticName(0), getSemanticName(1), getSemanticName(2), getSemanticName(3), getSemanticName(4), getSemanticName(5), getSemanticName(6),
        getSemanticName(7), getSemanticName(8), getSemanticName(9), getSemanticName(10), getSemanticName(11), getSemanticName(12),
        getSemanticName(13), getSemanticName(16), getSemanticName(14), getSemanticName(15), getSemanticName(22), getSemanticName(23),
        getSemanticName(18), getSemanticName(19), getSemanticName(20)
    );

    if (usingUbershader) {
        Helpers::panic("Unimplemented: GetVertexShaderAccelerated for ubershader");
        return picaSource;
    } else {
        // TODO: Uniforms and don't hardcode fixed-function semantic indices...
        std::string ret = picaSource;
        if (api == API::GLES) {
            ret += "\n#define USING_GLES\n";
        }

        ret += uniformDefinition;

        ret += R"(
out vec4 v_quaternion;
out vec4 v_colour;
out vec3 v_texcoord0;
out vec2 v_texcoord1;
out vec3 v_view;
out vec2 v_texcoord2;

#ifndef USING_GLES
out float gl_ClipDistance[2];
#endif

void main() {
    pica_shader_main();
)";
        // Transfer fixed function fragment registers from vertex shader output to the fragment shader
        ret += semantics;

        ret += R"(
    gl_Position = a_coords;
    vec4 colourAbs = abs(a_vertexColour);
    v_colour = min(colourAbs, vec4(1.f));

    v_texcoord0 = vec3(a_texcoord0.x, 1.0 - a_texcoord0.y, a_texcoord0_w);
    v_texcoord1 = vec2(a_texcoord1.x, 1.0 - a_texcoord1.y);
    v_texcoord2 = vec2(a_texcoord2.x, 1.0 - a_texcoord2.y);
    v_view = a_view;
    v_quaternion = a_quaternion;

#ifndef USING_GLES
    gl_ClipDistance[0] = -a_coords.z;
    gl_ClipDistance[1] = dot(clipCoords, a_coords);
#endif
})";
        return ret;
    }
}
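A worked example of the semantic mapping above (register and config values invented): suppose enabled VS output register 2 has outmaps entry 0x03020100. Each byte, masked with 0x1F, names the fixed-function semantic carried by that lane, so register 2 carries position.xyzw:

// outmaps[2] = 0x03020100 -> lanes 0..3 map to semantics 0..3 (position.xyzw):
for (int j = 0; j < 4; j++) {
    const u32 mapping = (0x03020100u >> (j * 8)) & 0x1F;  // yields 0, 1, 2, 3
    // outputMappings[mapping] = { register 2, lane j }
}
// getSemanticName(0) then returns "out_regs[2][0]", so the glue shader contains:
//   vec4 a_coords = vec4(out_regs[2][0], out_regs[2][1], out_regs[2][2], out_regs[2][3]);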

void FragmentGenerator::compileLogicOps(std::string& shader, const PICA::FragmentConfig& config) {
    if (api != API::GLES) [[unlikely]] {
        Helpers::warn("Shadergen: Unsupported API for compileLogicOps");

@@ -34,4 +34,5 @@ void PICAShader::reset() {

    codeHashDirty = true;
    opdescHashDirty = true;
    uniformsDirty = true;
}

@@ -73,10 +73,7 @@ void GLStateManager::resetVAO() {
}

void GLStateManager::resetBuffers() {
    boundVBO = 0;
    boundUBO = 0;

    glBindBuffer(GL_ARRAY_BUFFER, 0);
    glBindBuffer(GL_UNIFORM_BUFFER, 0);
}
@@ -2,13 +2,15 @@

#include <stb_image_write.h>

#include <bit>
#include <cmrc/cmrc.hpp>

#include "config.hpp"
#include "PICA/float_types.hpp"
#include "PICA/pica_frag_uniforms.hpp"
#include "PICA/gpu.hpp"
#include "PICA/pica_frag_uniforms.hpp"
#include "PICA/regs.hpp"
#include "PICA/shader_decompiler.hpp"
#include "config.hpp"
#include "math_util.hpp"

CMRC_DECLARE(RendererGL);

@@ -24,7 +26,7 @@ void RendererGL::reset() {
    colourBufferCache.reset();
    textureCache.reset();

    clearShaderCache();
    shaderCache.clear();

    // Init the colour/depth buffer settings to some random defaults on reset
    colourBufferLoc = 0;

@@ -77,40 +79,56 @@ void RendererGL::initGraphicsContextInternal() {
    gl.useProgram(displayProgram);
    glUniform1i(OpenGL::uniformLocation(displayProgram, "u_texture"), 0); // Init sampler object

    // Create stream buffers for vertex, index and uniform buffers
    static constexpr usize hwIndexBufferSize = 2_MB;
    static constexpr usize hwVertexBufferSize = 16_MB;

    hwIndexBuffer = StreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, hwIndexBufferSize);
    hwVertexBuffer = StreamBuffer::Create(GL_ARRAY_BUFFER, hwVertexBufferSize);

    // Allocate memory for the shadergen fragment uniform UBO
    glGenBuffers(1, &shadergenFragmentUBO);
    gl.bindUBO(shadergenFragmentUBO);
    glBufferData(GL_UNIFORM_BUFFER, sizeof(PICA::FragmentUniforms), nullptr, GL_DYNAMIC_DRAW);

    vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW);
    gl.bindVBO(vbo);
    vao.create();
    gl.bindVAO(vao);
    // Allocate memory for the accelerated vertex shader uniform UBO
    glGenBuffers(1, &hwShaderUniformUBO);
    gl.bindUBO(hwShaderUniformUBO);
    glBufferData(GL_UNIFORM_BUFFER, PICAShader::totalUniformSize(), nullptr, GL_DYNAMIC_DRAW);

    vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize * 2, GL_STREAM_DRAW);
    vbo.bind();
    // Initialize the VAO used when not using hw shaders
    defaultVAO.create();
    gl.bindVAO(defaultVAO);

    // Position (x, y, z, w) attributes
    vao.setAttributeFloat<float>(0, 4, sizeof(Vertex), offsetof(Vertex, s.positions));
    vao.enableAttribute(0);
    defaultVAO.setAttributeFloat<float>(0, 4, sizeof(Vertex), offsetof(Vertex, s.positions));
    defaultVAO.enableAttribute(0);
    // Quaternion attribute
    vao.setAttributeFloat<float>(1, 4, sizeof(Vertex), offsetof(Vertex, s.quaternion));
    vao.enableAttribute(1);
    defaultVAO.setAttributeFloat<float>(1, 4, sizeof(Vertex), offsetof(Vertex, s.quaternion));
    defaultVAO.enableAttribute(1);
    // Colour attribute
    vao.setAttributeFloat<float>(2, 4, sizeof(Vertex), offsetof(Vertex, s.colour));
    vao.enableAttribute(2);
    defaultVAO.setAttributeFloat<float>(2, 4, sizeof(Vertex), offsetof(Vertex, s.colour));
    defaultVAO.enableAttribute(2);
    // UV 0 attribute
    vao.setAttributeFloat<float>(3, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord0));
    vao.enableAttribute(3);
    defaultVAO.setAttributeFloat<float>(3, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord0));
    defaultVAO.enableAttribute(3);
    // UV 1 attribute
    vao.setAttributeFloat<float>(4, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord1));
    vao.enableAttribute(4);
    defaultVAO.setAttributeFloat<float>(4, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord1));
    defaultVAO.enableAttribute(4);
    // UV 0 W-component attribute
    vao.setAttributeFloat<float>(5, 1, sizeof(Vertex), offsetof(Vertex, s.texcoord0_w));
    vao.enableAttribute(5);
    defaultVAO.setAttributeFloat<float>(5, 1, sizeof(Vertex), offsetof(Vertex, s.texcoord0_w));
    defaultVAO.enableAttribute(5);
    // View
    vao.setAttributeFloat<float>(6, 3, sizeof(Vertex), offsetof(Vertex, s.view));
    vao.enableAttribute(6);
    defaultVAO.setAttributeFloat<float>(6, 3, sizeof(Vertex), offsetof(Vertex, s.view));
    defaultVAO.enableAttribute(6);
    // UV 2 attribute
    vao.setAttributeFloat<float>(7, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord2));
    vao.enableAttribute(7);
    defaultVAO.setAttributeFloat<float>(7, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord2));
    defaultVAO.enableAttribute(7);

    // Initialize the VAO used for hw shaders
    hwShaderVAO.create();

    dummyVBO.create();
    dummyVAO.create();

@@ -165,6 +183,12 @@ void RendererGL::initGraphicsContextInternal() {
    OpenGL::clearColor();
    OpenGL::setViewport(oldViewport[0], oldViewport[1], oldViewport[2], oldViewport[3]);

    // Initialize fixed attributes
    for (int i = 0; i < fixedAttrValues.size(); i++) {
        fixedAttrValues[i] = {0.f, 0.f, 0.f, 0.f};
        glVertexAttrib4f(i, 0.0, 0.0, 0.0, 0.0);
    }

    reset();

    // Populate our driver info structure
@@ -418,29 +442,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
    OpenGL::Triangle,
};

bool usingUbershader = enableUbershader;
if (usingUbershader) {
    const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0;
    const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1;

    // Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using more than N lights via shadergen
    // This way we generate fewer shaders overall than with full shadergen, but don't tank performance
    if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) {
        usingUbershader = false;
    }
}

if (usingUbershader) {
    gl.useProgram(triangleProgram);
} else {
    OpenGL::Program& program = getSpecializedShader();
    gl.useProgram(program);
}

const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
gl.disableScissor();
gl.bindVBO(vbo);
gl.bindVAO(vao);

// If we're using accelerated shaders, the hw VAO, VBO and EBO objects will have already been bound in prepareForDraw
if (!usingAcceleratedShader) {
    vbo.bind();
    gl.bindVAO(defaultVAO);
}

gl.enableClipPlane(0); // Clipping plane 0 is always enabled
if (regs[PICA::InternalRegs::ClipEnable] & 1) {

@@ -458,38 +467,9 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
    const int depthFunc = getBits<4, 3>(depthControl);
    const int colourMask = getBits<8, 4>(depthControl);
    gl.setColourMask(colourMask & 1, colourMask & 2, colourMask & 4, colourMask & 8);

    static constexpr std::array<GLenum, 8> depthModes = {GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL};

    // Update ubershader uniforms
    if (usingUbershader) {
        const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
        const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
        const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1;

        if (oldDepthScale != depthScale) {
            oldDepthScale = depthScale;
            glUniform1f(ubershaderData.depthScaleLoc, depthScale);
        }

        if (oldDepthOffset != depthOffset) {
            oldDepthOffset = depthOffset;
            glUniform1f(ubershaderData.depthOffsetLoc, depthOffset);
        }

        if (oldDepthmapEnable != depthMapEnable) {
            oldDepthmapEnable = depthMapEnable;
            glUniform1i(ubershaderData.depthmapEnableLoc, depthMapEnable);
        }

        // Upload the PICA registers as a single uniform. The shader needs access to the rasterizer registers (for depth, starting from index 0x48),
        // the texturing registers, and the fragment lighting registers, so we upload them all in one go to avoid multiple slow uniform updates
        glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]);
        setupUbershaderTexEnv();
    }

    bindTexturesToSlots();

    if (gpu.fogLUTDirty) {
        updateFogLUT();
    }

@@ -532,8 +512,22 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v

    setupStencilTest(stencilEnable);

    vbo.bufferVertsSub(vertices);
    OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
    if (!usingAcceleratedShader) {
        vbo.bufferVertsSub(vertices);
        OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
    } else {
        if (performIndexedRender) {
            // When doing indexed rendering, use glDrawRangeElementsBaseVertex to issue the indexed draw
            hwIndexBuffer->Bind();
            glDrawRangeElementsBaseVertex(
                primitiveTopology, minimumIndex, maximumIndex, GLsizei(vertices.size()), usingShortIndices ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE,
                hwIndexBufferOffset, -GLint(minimumIndex)
            );
        } else {
            // When doing non-indexed rendering, just use glDrawArrays
            OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
        }
    }
}
void RendererGL::display() {

@@ -840,7 +834,8 @@ std::optional<ColourBuffer> RendererGL::getColourBuffer(u32 addr, PICA::ColorFmt
}

OpenGL::Program& RendererGL::getSpecializedShader() {
    constexpr uint uboBlockBinding = 2;
    constexpr uint vsUBOBlockBinding = 1;
    constexpr uint fsUBOBlockBinding = 2;

    PICA::FragmentConfig fsConfig(regs);
    // If we're not on GLES, ignore the logic op configuration and don't generate redundant shaders for it, since we use hw logic ops

@@ -848,30 +843,44 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
    fsConfig.outConfig.logicOpMode = PICA::LogicOpMode(0);
#endif

    CachedProgram& programEntry = shaderCache[fsConfig];
    OpenGL::Shader& fragShader = shaderCache.fragmentShaderCache[fsConfig];
    if (!fragShader.exists()) {
        std::string fs = fragShaderGen.generate(fsConfig);
        fragShader.create({fs.c_str(), fs.size()}, OpenGL::Fragment);
    }

    // Get the handle of the current vertex shader
    OpenGL::Shader& vertexShader = usingAcceleratedShader ? *generatedVertexShader : defaultShadergenVs;
    // And form the key for looking up a shader program
    const u64 programKey = (u64(vertexShader.handle()) << 32) | u64(fragShader.handle());

    CachedProgram& programEntry = shaderCache.programCache[programKey];
    OpenGL::Program& program = programEntry.program;

    if (!program.exists()) {
        std::string fs = fragShaderGen.generate(fsConfig, &driverInfo);

        OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment);
        program.create({defaultShadergenVs, fragShader});
        program.create({vertexShader, fragShader});
        gl.useProgram(program);

        fragShader.free();

        // Init sampler objects. Texture 0 goes in texture unit 0, texture 1 in TU 1, texture 2 in TU 2, and the light maps go in TU 3
        glUniform1i(OpenGL::uniformLocation(program, "u_tex0"), 0);
        glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1);
        glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2);
        glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3);

        // Set up the binding for our UBO. Sadly we can't specify it in the shader like normal people,
        // Set up the bindings for our UBOs. Sadly we can't specify them in the shader like normal people,
        // as that's an OpenGL 4.2 feature that macOS doesn't support...
        uint uboIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms");
        glUniformBlockBinding(program.handle(), uboIndex, uboBlockBinding);
        uint fsUBOIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms");
        glUniformBlockBinding(program.handle(), fsUBOIndex, fsUBOBlockBinding);

        if (usingAcceleratedShader) {
            uint vertexUBOIndex = glGetUniformBlockIndex(program.handle(), "PICAShaderUniforms");
            glUniformBlockBinding(program.handle(), vertexUBOIndex, vsUBOBlockBinding);
        }
    }
    glBindBufferBase(GL_UNIFORM_BUFFER, fsUBOBlockBinding, shadergenFragmentUBO);
    if (usingAcceleratedShader) {
        glBindBufferBase(GL_UNIFORM_BUFFER, vsUBOBlockBinding, hwShaderUniformUBO);
    }
    glBindBufferBase(GL_UNIFORM_BUFFER, uboBlockBinding, shadergenFragmentUBO);

    // Upload uniform data to our shader's UBO
    PICA::FragmentUniforms uniforms;

@@ -961,6 +970,101 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
    return program;
}
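Note the cache structure this hunk introduces: fragment shaders are cached per FragmentConfig, vertex shaders per VertConfig, and linked programs per (vertex shader, fragment shader) pair, with both 32-bit GL names packed into a single u64 key:

// One linked program per (vertex shader, fragment shader) combination:
const u64 programKey = (u64(vertexShader.handle()) << 32) | u64(fragShader.handle());
// This relies on GL object names being unique while the shaders are alive; the cached
// shaders here appear to live until the whole cache is cleared, so the key stays stable.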

bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) {
    // First we figure out if we will be using an ubershader
    bool usingUbershader = emulatorConfig->useUbershaders;
    if (usingUbershader) {
        const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0;
        const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1;

        // Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using more than N lights via shadergen
        // This way we generate fewer shaders overall than with full shadergen, but don't tank performance
        if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) {
            usingUbershader = false;
        }
    }

    // Then we figure out if we will use hw accelerated shaders, and try to fetch our shader
    // TODO: Ubershader support for accelerated shaders
    usingAcceleratedShader = emulatorConfig->accelerateShaders && !usingUbershader && accel != nullptr && accel->canBeAccelerated;

    if (usingAcceleratedShader) {
        PICA::VertConfig vertexConfig(shaderUnit.vs, regs, usingUbershader);

        std::optional<OpenGL::Shader>& shader = shaderCache.vertexShaderCache[vertexConfig];
        // If the optional is empty, we have never tried to recompile this shader before. Try to recompile it and see if it works.
        if (!shader.has_value()) {
            // Initialize the shader to a "null" shader (handle == 0)
            shader = OpenGL::Shader();

            std::string picaShaderSource = PICA::ShaderGen::decompileShader(
                shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint,
                Helpers::isAndroid() ? PICA::ShaderGen::API::GLES : PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL
            );

            // An empty source means a compilation error; if the source is not empty, we convert the recompiled PICA code into a valid shader
            // and upload it to the GPU
            if (!picaShaderSource.empty()) {
                std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, vertexConfig, usingUbershader);
                shader->create({vertexShaderSource}, OpenGL::Vertex);
            }
        }

        // Shader generation did not work out, so set usingAcceleratedShader to false
        if (!shader->exists()) {
            usingAcceleratedShader = false;
        } else {
            generatedVertexShader = &(*shader);
            gl.bindUBO(hwShaderUniformUBO);

            if (shaderUnit.vs.uniformsDirty) {
                shaderUnit.vs.uniformsDirty = false;
                glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer());
            }

            performIndexedRender = accel->indexed;
            minimumIndex = GLsizei(accel->minimumIndex);
            maximumIndex = GLsizei(accel->maximumIndex);

            // Upload vertex data and index buffer data to our GPU
            accelerateVertexUpload(shaderUnit, accel);
        }
    }

    if (!usingUbershader) {
        OpenGL::Program& program = getSpecializedShader();
        gl.useProgram(program);
    } else { // Bind the ubershader & load the ubershader uniforms
        gl.useProgram(triangleProgram);

        const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
        const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
        const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1;

        if (oldDepthScale != depthScale) {
            oldDepthScale = depthScale;
            glUniform1f(ubershaderData.depthScaleLoc, depthScale);
        }

        if (oldDepthOffset != depthOffset) {
            oldDepthOffset = depthOffset;
            glUniform1f(ubershaderData.depthOffsetLoc, depthOffset);
        }

        if (oldDepthmapEnable != depthMapEnable) {
            oldDepthmapEnable = depthMapEnable;
            glUniform1i(ubershaderData.depthmapEnableLoc, depthMapEnable);
        }

        // Upload the PICA registers as a single uniform. The shader needs access to the rasterizer registers (for depth, starting from index 0x48),
        // the texturing registers, and the fragment lighting registers, so we upload them all in one go to avoid multiple slow uniform updates
        glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]);
        setupUbershaderTexEnv();
    }

    return usingAcceleratedShader;
}
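One detail worth spelling out in prepareForDraw above: the vertex shader cache deliberately uses std::optional<OpenGL::Shader> as a tri-state, so a PICA shader that failed to translate is not retried on every draw:

// Cache states for shaderCache.vertexShaderCache[vertexConfig]:
//   !shader.has_value()        -> never attempted; run the decompiler once
//   shader->exists() == false  -> attempted, but decompilation/compilation failed; keep using CPU shaders
//   shader->exists() == true   -> accelerated shader ready to bind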

void RendererGL::screenshot(const std::string& name) {
    constexpr uint width = 400;
    constexpr uint height = 2 * 240;

@@ -974,7 +1078,7 @@ void RendererGL::screenshot(const std::string& name) {

    // Flip the image vertically
    for (int y = 0; y < height; y++) {
        memcpy(&flippedPixels[y * width * 4], &pixels[(height - y - 1) * width * 4], width * 4);
        std::memcpy(&flippedPixels[y * width * 4], &pixels[(height - y - 1) * width * 4], width * 4);
        // Swap R and B channels
        for (int x = 0; x < width; x++) {
            std::swap(flippedPixels[y * width * 4 + x * 4 + 0], flippedPixels[y * width * 4 + x * 4 + 2]);

@@ -986,21 +1090,12 @@ void RendererGL::screenshot(const std::string& name) {
    stbi_write_png(name.c_str(), width, height, 4, flippedPixels.data(), 0);
}

void RendererGL::clearShaderCache() {
    for (auto& shader : shaderCache) {
        CachedProgram& cachedProgram = shader.second;
        cachedProgram.program.free();
    }

    shaderCache.clear();
}

void RendererGL::deinitGraphicsContext() {
    // Invalidate all surface caches since they'll no longer be valid
    textureCache.reset();
    depthBufferCache.reset();
    colourBufferCache.reset();
    clearShaderCache();
    shaderCache.clear();

    // All other GL objects should be invalidated automatically and be recreated by the next call to initGraphicsContext
    // TODO: Make it so that depth and colour buffers get written back to 3DS memory

@@ -1049,3 +1144,92 @@ void RendererGL::initUbershader(OpenGL::Program& program) {
    glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2);
    glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3);
}

void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) {
    u32 buffer = 0; // Vertex buffer index for non-fixed attributes
    u32 attrCount = 0;

    const u32 totalAttribCount = accel->totalAttribCount;

    static constexpr GLenum attributeFormats[4] = {
        GL_BYTE,          // 0: Signed byte
        GL_UNSIGNED_BYTE, // 1: Unsigned byte
        GL_SHORT,         // 2: Short
        GL_FLOAT,         // 3: Float
    };

    const u32 vertexCount = accel->maximumIndex - accel->minimumIndex + 1;

    // Update index buffer if necessary
    if (accel->indexed) {
        usingShortIndices = accel->useShortIndices;
        const usize indexBufferSize = regs[PICA::InternalRegs::VertexCountReg] * (usingShortIndices ? sizeof(u16) : sizeof(u8));

        hwIndexBuffer->Bind();
        auto indexBufferRes = hwIndexBuffer->Map(4, indexBufferSize);
        hwIndexBufferOffset = reinterpret_cast<void*>(usize(indexBufferRes.buffer_offset));

        std::memcpy(indexBufferRes.pointer, accel->indexBuffer, indexBufferSize);
        hwIndexBuffer->Unmap(indexBufferSize);
    }

    hwVertexBuffer->Bind();
    auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize);
    u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer);
    const u32 vertexBufferOffset = vertexBufferRes.buffer_offset;

    gl.bindVAO(hwShaderVAO);

    // Enable or disable vertex attributes as needed
    const u32 currentAttributeMask = accel->enabledAttributeMask;
    // Use bitwise xor to calculate which attributes changed
    u32 attributeMaskDiff = currentAttributeMask ^ previousAttributeMask;

    while (attributeMaskDiff != 0) {
        // Get the index of the next differing attribute and clear it from the diff mask
        const u32 index = 31 - std::countl_zero<u32>(attributeMaskDiff);
        const u32 mask = 1u << index;
        attributeMaskDiff ^= mask;

        if ((currentAttributeMask & mask) != 0) {
            // Attribute was disabled and is now enabled
            hwShaderVAO.enableAttribute(index);
        } else {
            // Attribute was enabled and is now disabled
            hwShaderVAO.disableAttribute(index);
        }
    }

    previousAttributeMask = currentAttributeMask;

    // Upload the data for each (enabled) attribute loader into our vertex buffer
    for (int i = 0; i < accel->totalLoaderCount; i++) {
        auto& loader = accel->loaders[i];

        std::memcpy(vertexData, loader.data, loader.size);
        vertexData += loader.size;
    }

    hwVertexBuffer->Unmap(accel->vertexDataSize);

    // Iterate over the 16 PICA input registers and configure how they should be fetched.
    for (int i = 0; i < 16; i++) {
        const auto& attrib = accel->attributeInfo[i];
        const u32 attributeMask = 1u << i;

        if (accel->fixedAttributes & attributeMask) {
            auto& attrValue = fixedAttrValues[i];
            // This is a fixed attribute, so set its fixed value, but only if it actually needs to be updated
            if (attrValue[0] != attrib.fixedValue[0] || attrValue[1] != attrib.fixedValue[1] || attrValue[2] != attrib.fixedValue[2] ||
                attrValue[3] != attrib.fixedValue[3]) {
                std::memcpy(attrValue.data(), attrib.fixedValue.data(), sizeof(attrib.fixedValue));
                glVertexAttrib4f(i, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]);
            }
        } else if (accel->enabledAttributeMask & attributeMask) {
            glVertexAttribPointer(
                i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride,
                reinterpret_cast<GLvoid*>(vertexBufferOffset + attrib.offset)
            );
        }
    }
}

@@ -163,8 +163,9 @@ static int fetchVariableRange(std::string key, int min, int max) {

static void configInit() {
    static const retro_variable values[] = {
        {"panda3ds_use_shader_jit", EmulatorConfig::shaderJitDefault ? "Enable shader JIT; enabled|disabled"
                                                                     : "Enable shader JIT; disabled|enabled"},
        {"panda3ds_use_shader_jit", EmulatorConfig::shaderJitDefault ? "Enable shader JIT; enabled|disabled" : "Enable shader JIT; disabled|enabled"},
        {"panda3ds_accelerate_shaders",
         EmulatorConfig::accelerateShadersDefault ? "Run 3DS shaders on the GPU; enabled|disabled" : "Run 3DS shaders on the GPU; disabled|enabled"},
        {"panda3ds_accurate_shader_mul", "Enable accurate shader multiplication; disabled|enabled"},
        {"panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault ? "Use ubershaders (No stutter, maybe slower); enabled|disabled"
                                                                      : "Use ubershaders (No stutter, maybe slower); disabled|enabled"},

@@ -197,6 +198,8 @@ static void configUpdate() {
    config.sdWriteProtected = fetchVariableBool("panda3ds_write_protect_virtual_sd", false);
    config.accurateShaderMul = fetchVariableBool("panda3ds_accurate_shader_mul", false);
    config.useUbershaders = fetchVariableBool("panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault);
    config.accelerateShaders = fetchVariableBool("panda3ds_accelerate_shaders", EmulatorConfig::accelerateShadersDefault);

    config.forceShadergenForLights = fetchVariableBool("panda3ds_ubershader_lighting_override", true);
    config.lightShadergenThreshold = fetchVariableRange("panda3ds_ubershader_lighting_override_threshold", 1, 8);
    config.discordRpcEnabled = false;
|
288
third_party/duckstation/gl/stream_buffer.cpp
vendored
Normal file
288
third_party/duckstation/gl/stream_buffer.cpp
vendored
Normal file
|
@ -0,0 +1,288 @@
|
|||
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
|
||||
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
|
||||
|
||||
#include "gl/stream_buffer.h"
|
||||
|
||||
#include <array>
|
||||
#include <cstdio>
|
||||
|
||||
#include "align.hpp"
|
||||
|
||||
OpenGLStreamBuffer::OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : m_target(target), m_buffer_id(buffer_id), m_size(size) {}
|
||||
OpenGLStreamBuffer::~OpenGLStreamBuffer() { glDeleteBuffers(1, &m_buffer_id); }
|
||||
|
||||
void OpenGLStreamBuffer::Bind() { glBindBuffer(m_target, m_buffer_id); }
|
||||
void OpenGLStreamBuffer::Unbind() { glBindBuffer(m_target, 0); }
|
||||
|
||||
void OpenGLStreamBuffer::SetDebugName(std::string_view name) {
|
||||
#ifdef GPU_DEBUG_INFO
|
||||
if (glObjectLabel) {
|
||||
glObjectLabel(GL_BUFFER, GetGLBufferId(), static_cast<GLsizei>(name.length()), static_cast<const GLchar*>(name.data()));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace {
|
||||
// Uses glBufferSubData() to update. Preferred for drivers which don't support {ARB,EXT}_buffer_storage.
|
||||
class BufferSubDataStreamBuffer final : public OpenGLStreamBuffer {
|
||||
public:
|
||||
~BufferSubDataStreamBuffer() override { Common::alignedFree(m_cpu_buffer); }
|
||||
|
||||
MappingResult Map(u32 alignment, u32 min_size) override { return MappingResult{static_cast<void*>(m_cpu_buffer), 0, 0, m_size / alignment}; }
|
||||
|
||||
u32 Unmap(u32 used_size) override {
|
||||
if (used_size == 0) return 0;
|
||||
|
||||
glBindBuffer(m_target, m_buffer_id);
|
||||
glBufferSubData(m_target, 0, used_size, m_cpu_buffer);
|
||||
return 0;
|
||||
}
|
||||
|
||||
u32 GetChunkSize() const override { return m_size; }
|
||||
|
||||
static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size) {
|
||||
glGetError();
|
||||
|
||||
GLuint buffer_id;
|
||||
glGenBuffers(1, &buffer_id);
|
||||
glBindBuffer(target, buffer_id);
|
||||
glBufferData(target, size, nullptr, GL_STREAM_DRAW);
|
||||
|
||||
GLenum err = glGetError();
|
||||
if (err != GL_NO_ERROR) {
|
||||
glBindBuffer(target, 0);
|
||||
glDeleteBuffers(1, &buffer_id);
|
||||
return {};
|
||||
}
|
||||
|
||||
return std::unique_ptr<OpenGLStreamBuffer>(new BufferSubDataStreamBuffer(target, buffer_id, size));
|
||||
}
|
||||
|
||||
private:
|
||||
BufferSubDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : OpenGLStreamBuffer(target, buffer_id, size) {
|
||||
m_cpu_buffer = static_cast<u8*>(Common::alignedMalloc(size, 32));
|
||||
if (!m_cpu_buffer) Panic("Failed to allocate CPU storage for GL buffer");
|
||||
}
|
||||
|
||||
u8* m_cpu_buffer;
|
||||
};
|
||||
|
||||
	// Uses BufferData() to orphan the buffer after every update. Used on Mali where BufferSubData forces a sync.
	class BufferDataStreamBuffer final : public OpenGLStreamBuffer {
	  public:
		~BufferDataStreamBuffer() override { Common::alignedFree(m_cpu_buffer); }

		MappingResult Map(u32 alignment, u32 min_size) override { return MappingResult{static_cast<void*>(m_cpu_buffer), 0, 0, m_size / alignment}; }

		u32 Unmap(u32 used_size) override {
			if (used_size == 0) return 0;

			glBindBuffer(m_target, m_buffer_id);
			glBufferData(m_target, used_size, m_cpu_buffer, GL_STREAM_DRAW);
			return 0;
		}

		u32 GetChunkSize() const override { return m_size; }

		static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size) {
			glGetError();

			GLuint buffer_id;
			glGenBuffers(1, &buffer_id);
			glBindBuffer(target, buffer_id);
			glBufferData(target, size, nullptr, GL_STREAM_DRAW);

			GLenum err = glGetError();
			if (err != GL_NO_ERROR) {
				glBindBuffer(target, 0);
				glDeleteBuffers(1, &buffer_id);
				return {};
			}

			return std::unique_ptr<OpenGLStreamBuffer>(new BufferDataStreamBuffer(target, buffer_id, size));
		}

	  private:
		BufferDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : OpenGLStreamBuffer(target, buffer_id, size) {
			m_cpu_buffer = static_cast<u8*>(Common::alignedMalloc(size, 32));
			if (!m_cpu_buffer) Panic("Failed to allocate CPU storage for GL buffer");
		}

		u8* m_cpu_buffer;
	};
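	// Note on the technique above: handing glBufferData() a fresh data store while earlier draws
	// still reference the old one "orphans" the old store, so the driver can return new memory
	// instead of stalling until the GPU finishes. That is why the full-reupload path tends to
	// behave better than glBufferSubData() on tile-based drivers such as Mali.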
	// Base class for implementations which require syncing.
	class SyncingStreamBuffer : public OpenGLStreamBuffer {
	  public:
		enum : u32 { NUM_SYNC_POINTS = 16 };

		virtual ~SyncingStreamBuffer() override {
			for (u32 i = m_available_block_index; i <= m_used_block_index; i++) {
				glDeleteSync(m_sync_objects[i]);
			}
		}

	  protected:
		SyncingStreamBuffer(GLenum target, GLuint buffer_id, u32 size)
			: OpenGLStreamBuffer(target, buffer_id, size), m_bytes_per_block((size + NUM_SYNC_POINTS - 1) / NUM_SYNC_POINTS) {}

		ALWAYS_INLINE u32 GetSyncIndexForOffset(u32 offset) { return offset / m_bytes_per_block; }

		ALWAYS_INLINE void AddSyncsForOffset(u32 offset) {
			const u32 end = GetSyncIndexForOffset(offset);
			for (; m_used_block_index < end; m_used_block_index++) {
				if (m_sync_objects[m_used_block_index]) {
					Helpers::warn("GL stream buffer: Fence slot we're trying to insert is already in use");
				}

				m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
			}
		}

		ALWAYS_INLINE void WaitForSync(GLsync& sync) {
			glClientWaitSync(sync, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
			glDeleteSync(sync);
			sync = nullptr;
		}

		ALWAYS_INLINE void EnsureSyncsWaitedForOffset(u32 offset) {
			const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);
			for (; m_available_block_index < end; m_available_block_index++) {
				if (!m_sync_objects[m_available_block_index]) [[unlikely]] {
					Helpers::warn("GL stream buffer: Fence slot we're trying to wait on is not in use");
				}

				WaitForSync(m_sync_objects[m_available_block_index]);
			}
		}

		void AllocateSpace(u32 size) {
			// add sync objects for writes since the last allocation
			AddSyncsForOffset(m_position);

			// wait for sync objects for the space we want to use
			EnsureSyncsWaitedForOffset(m_position + size);

			// wrap-around?
			if ((m_position + size) > m_size) {
				// current position ... buffer end
				AddSyncsForOffset(m_size);

				// rewind, and try again
				m_position = 0;

				// wait for the sync at the start of the buffer
				WaitForSync(m_sync_objects[0]);
				m_available_block_index = 1;

				// and however much more we need to satisfy the allocation
				EnsureSyncsWaitedForOffset(size);
				m_used_block_index = 0;
			}
		}

		u32 GetChunkSize() const override { return m_size / NUM_SYNC_POINTS; }

		u32 m_position = 0;
		u32 m_used_block_index = 0;
		u32 m_available_block_index = NUM_SYNC_POINTS;
		u32 m_bytes_per_block;
		std::array<GLsync, NUM_SYNC_POINTS> m_sync_objects{};
	};
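	// A worked example of the block arithmetic above (the 16 MiB figure is illustrative, not a
	// size the emulator necessarily uses): with size = 16 MiB and NUM_SYNC_POINTS = 16,
	// m_bytes_per_block = 1 MiB. After writing up to m_position = 2.5 MiB,
	// AddSyncsForOffset(m_position) inserts fences for blocks 0 and 1 (block 2 is still being
	// written), and a later allocation reaching into block 2 makes EnsureSyncsWaitedForOffset()
	// wait on block 2's fence before the CPU reuses that range.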
	class BufferStorageStreamBuffer : public SyncingStreamBuffer {
	  public:
		~BufferStorageStreamBuffer() override {
			glBindBuffer(m_target, m_buffer_id);
			glUnmapBuffer(m_target);
			glBindBuffer(m_target, 0);
		}

		MappingResult Map(u32 alignment, u32 min_size) override {
			if (m_position > 0) m_position = Common::alignUp(m_position, alignment);

			AllocateSpace(min_size);
			if ((m_position + min_size) > (m_available_block_index * m_bytes_per_block)) [[unlikely]] {
				Helpers::panic("GL stream buffer: Ran out of space in Map");
			}

			const u32 free_space_in_block = ((m_available_block_index * m_bytes_per_block) - m_position);
			return MappingResult{static_cast<void*>(m_mapped_ptr + m_position), m_position, m_position / alignment, free_space_in_block / alignment};
		}

		u32 Unmap(u32 used_size) override {
			if ((m_position + used_size) > m_size) [[unlikely]] {
				Helpers::panic("GL stream buffer: Invalid size passed to Unmap");
			}

			if (!m_coherent) {
				if (GLAD_GL_VERSION_4_5 || GLAD_GL_ARB_direct_state_access) {
					glFlushMappedNamedBufferRange(m_buffer_id, m_position, used_size);
				} else {
					Bind();
					glFlushMappedBufferRange(m_target, m_position, used_size);
				}
			}

			const u32 prev_position = m_position;
			m_position += used_size;
			return prev_position;
		}

		static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size, bool coherent = true) {
			glGetError();

			GLuint buffer_id;
			glGenBuffers(1, &buffer_id);
			glBindBuffer(target, buffer_id);

			const u32 flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
			const u32 map_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT);
			if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage)
				glBufferStorage(target, size, nullptr, flags);
			else if (GLAD_GL_EXT_buffer_storage)
				glBufferStorageEXT(target, size, nullptr, flags);

			GLenum err = glGetError();
			if (err != GL_NO_ERROR) {
				glBindBuffer(target, 0);
				glDeleteBuffers(1, &buffer_id);
				return {};
			}

			u8* mapped_ptr = static_cast<u8*>(glMapBufferRange(target, 0, size, map_flags));
			AssertMsg(mapped_ptr, "Failed to map persistent buffer");

			return std::unique_ptr<OpenGLStreamBuffer>(new BufferStorageStreamBuffer(target, buffer_id, size, mapped_ptr, coherent));
		}

	  private:
		BufferStorageStreamBuffer(GLenum target, GLuint buffer_id, u32 size, u8* mapped_ptr, bool coherent)
			: SyncingStreamBuffer(target, buffer_id, size), m_mapped_ptr(mapped_ptr), m_coherent(coherent) {}

		u8* m_mapped_ptr;
		bool m_coherent;
	};
} // namespace

std::unique_ptr<OpenGLStreamBuffer> OpenGLStreamBuffer::Create(GLenum target, u32 size) {
	std::unique_ptr<OpenGLStreamBuffer> buf;
	if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage) {
		buf = BufferStorageStreamBuffer::Create(target, size);
		if (buf) return buf;
	}

	// BufferSubData is slower on all drivers except NVIDIA...
#if 0
	const char* vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
	if (std::strcmp(vendor, "ARM") == 0 || std::strcmp(vendor, "Qualcomm") == 0) {
		// Mali and Adreno drivers can't do sub-buffer tracking...
		return BufferDataStreamBuffer::Create(target, size);
	}

	return BufferSubDataStreamBuffer::Create(target, size);
#else
	return BufferDataStreamBuffer::Create(target, size);
#endif
}
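// A minimal usage sketch for the factory above (hypothetical caller code, not part of this
// commit; `Vertex`, `vertices`, and `vertexCount` are stand-ins):
//
//     auto vbo = OpenGLStreamBuffer::Create(GL_ARRAY_BUFFER, 16 * 1024 * 1024);
//     vbo->Bind();
//     auto map = vbo->Map(sizeof(Vertex), vertexCount * sizeof(Vertex));
//     std::memcpy(map.pointer, vertices, vertexCount * sizeof(Vertex));
//     const u32 bufferOffset = vbo->Unmap(vertexCount * sizeof(Vertex));
//
// `bufferOffset` is where this chunk starts in the buffer (e.g. for attribute pointer setup),
// and `map.index_aligned` gives the same position pre-divided by the stride for base-vertex draws.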
53
third_party/duckstation/gl/stream_buffer.h
vendored
Normal file
@ -0,0 +1,53 @@
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)

#pragma once

#include <glad/gl.h>
// Comment to avoid clang-format reordering the glad header

#include <memory>
#include <string_view>
#include <tuple>
#include <vector>

#include "duckstation_compat.h"
#include "helpers.hpp"

class OpenGLStreamBuffer {
  public:
	virtual ~OpenGLStreamBuffer();

	ALWAYS_INLINE GLuint GetGLBufferId() const { return m_buffer_id; }
	ALWAYS_INLINE GLenum GetGLTarget() const { return m_target; }
	ALWAYS_INLINE u32 GetSize() const { return m_size; }

	void Bind();
	void Unbind();

	void SetDebugName(std::string_view name);

	struct MappingResult {
		void* pointer;
		u32 buffer_offset;
		u32 index_aligned;  // offset / alignment, suitable for base vertex
		u32 space_aligned;  // remaining space / alignment
	};

	virtual MappingResult Map(u32 alignment, u32 min_size) = 0;

	/// Returns the position in the buffer *before* the start of used_size.
	virtual u32 Unmap(u32 used_size) = 0;

	/// Returns the minimum granularity of blocks which sync objects will be created around.
	virtual u32 GetChunkSize() const = 0;

	static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size);

  protected:
	OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size);

	GLenum m_target;
	GLuint m_buffer_id;
	u32 m_size;
};
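// Design note: `index_aligned` and `space_aligned` come back pre-divided by the alignment passed
// to Map(), so when the alignment is the vertex stride a renderer can feed `index_aligned`
// straight to a first-vertex/base-vertex draw parameter instead of re-deriving it from
// `buffer_offset` (see the usage sketch after the factory in stream_buffer.cpp).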
1
third_party/fmt
vendored
Submodule
@ -0,0 +1 @@
Subproject commit f8581bcecf317e8753887b68187c9ef1ba0524f4