Merge branch 'master' into metal2

This commit is contained in:
SamoZ256 2024-10-31 13:45:58 +01:00 committed by GitHub
commit 02f8250aff
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
69 changed files with 2906 additions and 319 deletions

View file

@ -8,7 +8,7 @@ on:
jobs:
x64:
runs-on: ubuntu-latest
runs-on: ubuntu-24.04
strategy:
matrix:
@ -73,7 +73,7 @@ jobs:
./src/pandroid/app/build/outputs/apk/${{ env.BUILD_TYPE }}/app-${{ env.BUILD_TYPE }}.apk
arm64:
runs-on: ubuntu-latest
runs-on: ubuntu-24.04
strategy:
matrix:

View file

@ -16,7 +16,7 @@ jobs:
# well on Windows or Mac. You can convert this to a matrix build if you need
# cross-platform coverage.
# See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
runs-on: ubuntu-latest
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4

View file

@ -98,7 +98,7 @@ jobs:
${{github.workspace}}/docs/libretro/panda3ds_libretro.info
Linux:
runs-on: ubuntu-latest
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
@ -107,7 +107,7 @@ jobs:
- name: Install misc packages
run: |
sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev libfuse2 libwayland-dev
sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev libfuse2 libwayland-dev
- name: Install newer Clang
run: |
@ -151,7 +151,7 @@ jobs:
${{github.workspace}}/docs/libretro/panda3ds_libretro.info
Android-x64:
runs-on: ubuntu-latest
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
@ -160,7 +160,7 @@ jobs:
- name: Install misc packages
run: |
sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev libfuse2 libwayland-dev
sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev libfuse2 libwayland-dev
- name: Setup Vulkan SDK
uses: humbletim/setup-vulkan-sdk@v1.2.0

View file

@ -16,7 +16,7 @@ jobs:
# well on Windows or Mac. You can convert this to a matrix build if you need
# cross-platform coverage.
# See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
runs-on: ubuntu-latest
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
@ -24,7 +24,7 @@ jobs:
run: git submodule update --init --recursive
- name: Install misc packages
run: sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev libfuse2
run: sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev libfuse2
- name: Install newer Clang
run: |

View file

@ -16,7 +16,7 @@ jobs:
# well on Windows or Mac. You can convert this to a matrix build if you need
# cross-platform coverage.
# See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
runs-on: ubuntu-latest
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
@ -24,7 +24,7 @@ jobs:
run: git submodule update --init --recursive
- name: Install misc packages
run: sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev
run: sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev
- name: Install newer Clang
run: |

View file

@ -96,7 +96,7 @@ jobs:
path: 'Alber.zip'
Linux:
runs-on: ubuntu-latest
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
@ -105,8 +105,7 @@ jobs:
- name: Install misc packages
run: |
sudo apt-get update && sudo apt install libx11-dev libgl1-mesa-glx mesa-common-dev libfuse2 libwayland-dev libgl1-mesa-dev
sudo add-apt-repository -y ppa:savoury1/qt-6-2
sudo apt-get update && sudo apt install libx11-dev libgl1 libglx-mesa0 mesa-common-dev libfuse2 libwayland-dev libgl1-mesa-dev
sudo apt update
sudo apt install qt6-base-dev qt6-base-private-dev

3
.gitmodules vendored
View file

@ -76,6 +76,9 @@
[submodule "third_party/metal-cpp"]
path = third_party/metal-cpp
url = https://github.com/Panda3DS-emu/metal-cpp
[submodule "third_party/fmt"]
path = third_party/fmt
url = https://github.com/fmtlib/fmt
[submodule "third_party/fdk-aac"]
path = third_party/fdk-aac
url = https://github.com/Panda3DS-emu/fdk-aac/

View file

@ -26,7 +26,7 @@ if(APPLE)
endif()
if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format-nonliteral -Wno-format-security")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format-nonliteral -Wno-format-security -Wno-invalid-offsetof")
endif()
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
@ -55,6 +55,7 @@ option(ENABLE_GIT_VERSIONING "Enables querying git for the emulator version" ON)
option(BUILD_HYDRA_CORE "Build a Hydra core" OFF)
option(BUILD_LIBRETRO_CORE "Build a Libretro core" OFF)
option(ENABLE_RENDERDOC_API "Build with support for Renderdoc's capture API for graphics debugging" ON)
option(DISABLE_SSE4 "Build with SSE4 instructions disabled, may reduce performance" OFF)
set(OPENGL_PROFILE ${DEFAULT_OPENGL_PROFILE} CACHE STRING "OpenGL profile to use if OpenGL is enabled. Valid values are 'OpenGL' and 'OpenGLES'.")
set_property(CACHE OPENGL_PROFILE PROPERTY STRINGS OpenGL OpenGLES)
@ -147,11 +148,13 @@ if (NOT ANDROID)
target_link_libraries(AlberCore PUBLIC SDL2-static)
endif()
add_subdirectory(third_party/fmt)
add_subdirectory(third_party/toml11)
include_directories(${SDL2_INCLUDE_DIR})
include_directories(third_party/toml11)
include_directories(third_party/glm)
include_directories(third_party/renderdoc)
include_directories(third_party/duckstation)
add_subdirectory(third_party/cmrc)
@ -210,6 +213,13 @@ else()
set(HOST_ARM64 FALSE)
endif()
# Enable SSE4.1 on x64 hosts unless it has been explicitly disabled through the DISABLE_SSE4 option.
# Annoyingly, we can't easily do this if we're using real MSVC cause there's no SSE4.1 flag, only AVX.
# The flag is therefore only added for non-MSVC toolchains and for Clang, which understands -msse4.1.
# The parentheses spell out the grouping CMake already applies: AND and OR share a single precedence
# level and are evaluated left to right, so without them the condition is easy to misread as
# "NOT MSVC OR (Clang AND NOT DISABLE_SSE4 AND HOST_X64)".
if((NOT MSVC OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") AND NOT DISABLE_SSE4 AND HOST_X64)
	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1")
	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1")
endif()
if(ENABLE_RENDERDOC_API)
find_package(RenderDoc 1.6.0 MODULE REQUIRED)
add_compile_definitions(PANDA3DS_ENABLE_RENDERDOC)
@ -258,13 +268,13 @@ set(SERVICE_SOURCE_FILES src/core/services/service_manager.cpp src/core/services
src/core/services/act.cpp src/core/services/nfc.cpp src/core/services/dlp_srvr.cpp
src/core/services/ir_user.cpp src/core/services/http.cpp src/core/services/soc.cpp
src/core/services/ssl.cpp src/core/services/news_u.cpp src/core/services/amiibo_device.cpp
src/core/services/csnd.cpp src/core/services/nwm_uds.cpp
src/core/services/csnd.cpp src/core/services/nwm_uds.cpp src/core/services/fonts.cpp
)
set(PICA_SOURCE_FILES src/core/PICA/gpu.cpp src/core/PICA/regs.cpp src/core/PICA/shader_unit.cpp
src/core/PICA/shader_interpreter.cpp src/core/PICA/dynapica/shader_rec.cpp
src/core/PICA/dynapica/shader_rec_emitter_x64.cpp src/core/PICA/pica_hash.cpp
src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp src/core/PICA/shader_gen_glsl.cpp
src/core/PICA/shader_decompiler.cpp
src/core/PICA/shader_decompiler.cpp src/core/PICA/draw_acceleration.cpp
)
set(LOADER_SOURCE_FILES src/core/loader/elf.cpp src/core/loader/ncsd.cpp src/core/loader/ncch.cpp src/core/loader/3dsx.cpp src/core/loader/lz77.cpp)
@ -316,14 +326,15 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
include/audio/miniaudio_device.hpp include/ring_buffer.hpp include/bitfield.hpp include/audio/dsp_shared_mem.hpp
include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp
include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
include/sdl_sensors.hpp include/renderdoc.hpp include/audio/aac_decoder.hpp
include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp include/renderdoc.hpp
include/align.hpp include/audio/aac_decoder.hpp include/PICA/pica_simd.hpp include/services/fonts.hpp
)
cmrc_add_resource_library(
resources_console_fonts
NAMESPACE ConsoleFonts
WHENCE "src/core/services/fonts/"
"src/core/services/fonts/CitraSharedFontUSRelocated.bin"
"src/core/services/fonts/SharedFontReplacement.bin"
)
set(THIRD_PARTY_SOURCE_FILES third_party/imgui/imgui.cpp
@ -349,7 +360,6 @@ if(ENABLE_LUAJIT AND NOT ANDROID)
endif()
if(ENABLE_QT_GUI)
include_directories(third_party/duckstation)
set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/window_info.cpp third_party/duckstation/gl/context.cpp)
if(APPLE)
@ -382,7 +392,7 @@ if(ENABLE_OPENGL)
set(RENDERER_GL_INCLUDE_FILES third_party/opengl/opengl.hpp
include/renderer_gl/renderer_gl.hpp include/renderer_gl/textures.hpp
include/renderer_gl/surfaces.hpp include/renderer_gl/surface_cache.hpp
include/renderer_gl/gl_state.hpp
include/renderer_gl/gl_state.hpp include/renderer_gl/gl_driver.hpp
)
set(RENDERER_GL_SOURCE_FILES src/core/renderer_gl/renderer_gl.cpp
@ -392,6 +402,8 @@ if(ENABLE_OPENGL)
src/host_shaders/opengl_fragment_shader.frag
)
set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/gl/stream_buffer.cpp)
set(HEADER_FILES ${HEADER_FILES} ${RENDERER_GL_INCLUDE_FILES})
source_group("Source Files\\Core\\OpenGL Renderer" FILES ${RENDERER_GL_SOURCE_FILES})
@ -555,7 +567,7 @@ set(ALL_SOURCES ${SOURCE_FILES} ${FS_SOURCE_FILES} ${CRYPTO_SOURCE_FILES} ${KERN
target_sources(AlberCore PRIVATE ${ALL_SOURCES})
target_link_libraries(AlberCore PRIVATE dynarmic cryptopp glad resources_console_fonts teakra fdk-aac)
target_link_libraries(AlberCore PUBLIC glad capstone)
target_link_libraries(AlberCore PUBLIC glad capstone fmt::fmt)
if(ENABLE_DISCORD_RPC AND NOT ANDROID)
target_compile_definitions(AlberCore PUBLIC "PANDA3DS_ENABLE_DISCORD_RPC=1")

BIN
docs/img/KirbyAndroid.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 567 KiB

View file

@ -0,0 +1,45 @@
#pragma once
#include <array>
#include "helpers.hpp"
namespace PICA {
// Plain-data description of a single draw call. GPU::getAcceleratedDrawInfo takes one of these by
// reference (see PICA/gpu.hpp).
// NOTE(review): the field meanings below that are not stated by an original comment are inferred from
// the field names only - confirm against getAcceleratedDrawInfo and the renderer that consumes this.
struct DrawAcceleration {
// Capacity of the attributeInfo array below
static constexpr u32 maxAttribCount = 16;
// Capacity of the loaders array below
static constexpr u32 maxLoaderCount = 12;
// Per-attribute description
struct AttributeInfo {
u32 offset;
u32 stride;
u8 type;
u8 componentCount;
std::array<float, 4> fixedValue; // For fixed attributes
};
struct Loader {
// Data to upload for this loader
u8* data;
usize size;
};
// NOTE(review): presumably points at the draw's index data and is only meaningful when `indexed` is set - confirm
u8* indexBuffer;
// Minimum and maximum index in the index buffer for a draw call
u16 minimumIndex, maximumIndex;
// NOTE(review): presumably the number of valid entries in attributeInfo / loaders respectively - confirm
u32 totalAttribCount;
u32 totalLoaderCount;
// NOTE(review): presumably bitmasks with one bit per attribute - confirm
u32 enabledAttributeMask;
u32 fixedAttributes;
u32 vertexDataSize;
std::array<AttributeInfo, maxAttribCount> attributeInfo;
std::array<Loader, maxLoaderCount> loaders;
// NOTE(review): presumably false when the draw has to fall back to the non-accelerated path - confirm
bool canBeAccelerated;
bool indexed;
// NOTE(review): presumably selects u16 indices over u8 ones, matching the useShortIndices template
// parameter used by PICA::IndexBuffer - confirm
bool useShortIndices;
};
} // namespace PICA

View file

@ -2,7 +2,7 @@
#include "helpers.hpp"
#include "vertex_loader_rec.hpp"
// Common file for our PICA JITs (From vertex config -> CPU assembly and from PICA shader -> CPU assembly)
// Common file for our PICA JITs (From PICA shader -> CPU assembly)
namespace Dynapica {
#ifdef PANDA3DS_DYNAPICA_SUPPORTED

View file

@ -1,6 +1,7 @@
#pragma once
#include <array>
#include "PICA/draw_acceleration.hpp"
#include "PICA/dynapica/shader_rec.hpp"
#include "PICA/float_types.hpp"
#include "PICA/pica_vertex.hpp"
@ -13,6 +14,12 @@
#include "memory.hpp"
#include "renderer.hpp"
// How PICA shaders are executed. Used as a template parameter of GPU::drawArrays.
enum class ShaderExecMode {
Interpreter, // Interpret shaders on the CPU
JIT, // Recompile shaders to CPU machine code
Hardware, // Recompile shaders to host shaders and run them on the GPU
};
class GPU {
static constexpr u32 regNum = 0x300;
static constexpr u32 extRegNum = 0x1000;
@ -45,7 +52,7 @@ class GPU {
uint immediateModeVertIndex;
uint immediateModeAttrIndex; // Index of the immediate mode attribute we're uploading
template <bool indexed, bool useShaderJIT>
template <bool indexed, ShaderExecMode mode>
void drawArrays();
// Silly method of avoiding linking problems. TODO: Change to something less silly
@ -81,6 +88,7 @@ class GPU {
std::unique_ptr<Renderer> renderer;
PICA::Vertex getImmediateModeVertex();
void getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed);
public:
// 256 entries per LUT with each LUT as its own row forming a 2D image 256 * LUT_COUNT
// Encoded in PICA native format

View file

@ -17,6 +17,7 @@ namespace PICA {
// enable == off means a CompareFunction of Always
BitField<0, 3, CompareFunction> alphaTestFunction;
BitField<3, 1, u32> depthMapEnable;
BitField<4, 4, LogicOpMode> logicOpMode;
};
};
@ -214,6 +215,10 @@ namespace PICA {
(alphaTestConfig & 1) ? static_cast<PICA::CompareFunction>(alphaTestFunction) : PICA::CompareFunction::Always;
outConfig.depthMapEnable = regs[InternalRegs::DepthmapEnable] & 1;
// Shows if blending is enabled. If it is not enabled, then logic ops are enabled instead
const bool blendingEnabled = (regs[InternalRegs::ColourOperation] & (1 << 8)) != 0;
outConfig.logicOpMode = blendingEnabled ? LogicOpMode::Copy : LogicOpMode(Helpers::getBits<0, 4>(regs[InternalRegs::LogicOp]));
texConfig.texUnitConfig = regs[InternalRegs::TexUnitCfg];
texConfig.texEnvUpdateBuffer = regs[InternalRegs::TexEnvUpdateBuffer];

274
include/PICA/pica_simd.hpp Normal file
View file

@ -0,0 +1,274 @@
#pragma once
#include <algorithm>
#include <limits>
#include <utility>
#include "helpers.hpp"
#if defined(_M_AMD64) || defined(__x86_64__)
#define PICA_SIMD_X64
#include <immintrin.h>
#elif defined(_M_ARM64) || defined(__aarch64__)
#define PICA_SIMD_ARM64
#include <arm_neon.h>
#endif
// Optimized functions for analyzing PICA index buffers (Finding minimum and maximum index values inside them)
namespace PICA::IndexBuffer {
// Non-SIMD, portable algorithm
// Scalar fallback: walks the index buffer one element at a time and returns {minimum, maximum}.
// useShortIndices selects whether the buffer is read as u16 elements (true) or u8 elements (false).
// With vertexCount == 0 the seed values are returned unchanged, i.e. {0xFFFF, 0}.
template <bool useShortIndices>
std::pair<u16, u16> analyzePortable(u8* indexBuffer, u32 vertexCount) {
	// Seed the bounds so that the very first index replaces both of them
	u16 lowest = std::numeric_limits<u16>::max();
	u16 highest = 0;

	// Folds a single index into the running bounds
	const auto accumulate = [&lowest, &highest](u16 index) {
		if (index < lowest) {
			lowest = index;
		}
		if (index > highest) {
			highest = index;
		}
	};

	if constexpr (useShortIndices) {
		// 16-bit indices: reinterpret the byte buffer as an array of u16
		const u16* cursor = reinterpret_cast<const u16*>(indexBuffer);
		const u16* const end = cursor + vertexCount;
		for (; cursor != end; ++cursor) {
			accumulate(*cursor);
		}
	} else {
		// 8-bit indices: widen each byte to u16 before comparing
		const u8* cursor = indexBuffer;
		const u8* const end = cursor + vertexCount;
		for (; cursor != end; ++cursor) {
			accumulate(u16(*cursor));
		}
	}

	return {lowest, highest};
}
#ifdef PICA_SIMD_ARM64
// NEON implementation: returns {minimum, maximum} of the indices in the buffer.
// useShortIndices selects u16 indices (true) or u8 indices (false).
// Buffers holding fewer indices than one 16-byte vector are handed to analyzePortable instead.
template <bool useShortIndices>
std::pair<u16, u16> analyzeNEON(u8* indexBuffer, u32 vertexCount) {
// We process 16 bytes per iteration, which is 8 vertices if we're using u16 indices or 16 vertices if we're using u8 indices
constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16;
if (vertexCount < vertsPerLoop) {
return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
}
u16 minimumIndex, maximumIndex;
if constexpr (useShortIndices) {
// 16-bit indices
// Start with every minimum lane at 0xFFFF and every maximum lane at 0
uint16x8_t minima = vdupq_n_u16(0xffff);
uint16x8_t maxima = vdupq_n_u16(0);
while (vertexCount >= vertsPerLoop) {
const uint16x8_t data = vld1q_u16(reinterpret_cast<u16*>(indexBuffer));
minima = vminq_u16(data, minima);
maxima = vmaxq_u16(data, maxima);
indexBuffer += 16;
vertexCount -= vertsPerLoop;
}
// Do horizontal min/max operations to get the actual minimum and maximum from all the vertices we processed with SIMD
// We want to gather the actual minimum and maximum in the bottom lane of the minima/maxima vectors
// Each pairwise min/max of a vector with itself halves the number of distinct candidates, so 8 lanes need 3 folds
uint16x8_t foldedMinima1 = vpminq_u16(minima, minima);
uint16x8_t foldedMinima2 = vpminq_u16(foldedMinima1, foldedMinima1);
uint16x8_t foldedMinima3 = vpminq_u16(foldedMinima2, foldedMinima2);
uint16x8_t foldedMaxima1 = vpmaxq_u16(maxima, maxima);
uint16x8_t foldedMaxima2 = vpmaxq_u16(foldedMaxima1, foldedMaxima1);
uint16x8_t foldedMaxima3 = vpmaxq_u16(foldedMaxima2, foldedMaxima2);
minimumIndex = vgetq_lane_u16(foldedMinima3, 0);
maximumIndex = vgetq_lane_u16(foldedMaxima3, 0);
} else {
// 8-bit indices
// Start with every minimum lane at 0xFF and every maximum lane at 0
uint8x16_t minima = vdupq_n_u8(0xff);
uint8x16_t maxima = vdupq_n_u8(0);
while (vertexCount >= vertsPerLoop) {
uint8x16_t data = vld1q_u8(indexBuffer);
minima = vminq_u8(data, minima);
maxima = vmaxq_u8(data, maxima);
indexBuffer += 16;
vertexCount -= vertsPerLoop;
}
// Do a similar horizontal min/max as in the u16 case, except now we're working with uint8x16 instead of uint16x8, so we need 4 folds
uint8x16_t foldedMinima1 = vpminq_u8(minima, minima);
uint8x16_t foldedMinima2 = vpminq_u8(foldedMinima1, foldedMinima1);
uint8x16_t foldedMinima3 = vpminq_u8(foldedMinima2, foldedMinima2);
uint8x16_t foldedMinima4 = vpminq_u8(foldedMinima3, foldedMinima3);
uint8x16_t foldedMaxima1 = vpmaxq_u8(maxima, maxima);
uint8x16_t foldedMaxima2 = vpmaxq_u8(foldedMaxima1, foldedMaxima1);
uint8x16_t foldedMaxima3 = vpmaxq_u8(foldedMaxima2, foldedMaxima2);
uint8x16_t foldedMaxima4 = vpmaxq_u8(foldedMaxima3, foldedMaxima3);
minimumIndex = u16(vgetq_lane_u8(foldedMinima4, 0));
maximumIndex = u16(vgetq_lane_u8(foldedMaxima4, 0));
}
// If any indices could not be processed because the index count is not a multiple of the vector width,
// process them the naive way, folding them into the minimum and maximum computed above
while (vertexCount > 0) {
if constexpr (useShortIndices) {
u16 index = *reinterpret_cast<u16*>(indexBuffer);
minimumIndex = std::min(minimumIndex, index);
maximumIndex = std::max(maximumIndex, index);
indexBuffer += 2;
} else {
u16 index = u16(*indexBuffer++);
minimumIndex = std::min(minimumIndex, index);
maximumIndex = std::max(maximumIndex, index);
}
vertexCount -= 1;
}
return {minimumIndex, maximumIndex};
}
#endif
#if defined(PICA_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
// SSE4.1 implementation: returns {minimum, maximum} of the indices in the buffer.
// useShortIndices selects u16 indices (true) or u8 indices (false).
// Buffers holding fewer indices than one 16-byte vector are handed to analyzePortable instead.
template <bool useShortIndices>
std::pair<u16, u16> analyzeSSE4_1(u8* indexBuffer, u32 vertexCount) {
// We process 16 bytes per iteration, which is 8 vertices if we're using u16
// indices or 16 vertices if we're using u8 indices
constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16;
if (vertexCount < vertsPerLoop) {
return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
}
u16 minimumIndex, maximumIndex;
if constexpr (useShortIndices) {
// Calculate the horizontal minimum/maximum value across an SSE vector of 16-bit unsigned integers.
// Based on https://stackoverflow.com/a/22259607
// _mm_minpos_epu16 leaves the minimum in the lowest 16 bits of its result, which the u16 cast extracts
auto horizontalMin16 = [](__m128i vector) -> u16 { return u16(_mm_cvtsi128_si32(_mm_minpos_epu16(vector))); };
auto horizontalMax16 = [](__m128i vector) -> u16 {
// We have an instruction to compute horizontal minimum but not maximum, so we use it.
// To use it, we have to subtract each value from 0xFFFF (which we do with an xor), then execute a horizontal minimum
__m128i flipped = _mm_xor_si128(vector, _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu));
u16 min = u16(_mm_cvtsi128_si32(_mm_minpos_epu16(flipped)));
// Undo the flip to turn the minimum of the flipped values back into the maximum of the original ones
return u16(min ^ 0xffff);
};
// 16-bit indices
// Initialize the minima vector to all FFs (So 0xFFFF for each 16-bit lane)
// And the maxima vector to all 0s (0 for each 16-bit lane)
__m128i minima = _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu);
__m128i maxima = _mm_set_epi32(0, 0, 0, 0);
while (vertexCount >= vertsPerLoop) {
// Unaligned load, so the index buffer does not need any particular alignment here
const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(indexBuffer));
minima = _mm_min_epu16(data, minima);
maxima = _mm_max_epu16(data, maxima);
indexBuffer += 16;
vertexCount -= vertsPerLoop;
}
minimumIndex = u16(horizontalMin16(minima));
maximumIndex = u16(horizontalMax16(maxima));
} else {
// Calculate the horizontal minimum/maximum value across an SSE vector of 8-bit unsigned integers.
// Based on https://stackoverflow.com/a/22259607
// Each step combines the vector with a copy whose upper half (64 bits, then 32, 16 and finally 8)
// has been moved down, so the lowest byte ends up holding the result for all 16 lanes
auto horizontalMin8 = [](__m128i vector) -> u8 {
vector = _mm_min_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(3, 2, 3, 2)));
vector = _mm_min_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(1, 1, 1, 1)));
vector = _mm_min_epu8(vector, _mm_shufflelo_epi16(vector, _MM_SHUFFLE(1, 1, 1, 1)));
vector = _mm_min_epu8(vector, _mm_srli_epi16(vector, 8));
return u8(_mm_cvtsi128_si32(vector));
};
auto horizontalMax8 = [](__m128i vector) -> u8 {
vector = _mm_max_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(3, 2, 3, 2)));
vector = _mm_max_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(1, 1, 1, 1)));
vector = _mm_max_epu8(vector, _mm_shufflelo_epi16(vector, _MM_SHUFFLE(1, 1, 1, 1)));
vector = _mm_max_epu8(vector, _mm_srli_epi16(vector, 8));
return u8(_mm_cvtsi128_si32(vector));
};
// 8-bit indices
// Initialize the minima vector to all FFs (So 0xFF for each 8-bit lane)
// And the maxima vector to all 0s (0 for each 8-bit lane)
__m128i minima = _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu);
__m128i maxima = _mm_set_epi32(0, 0, 0, 0);
while (vertexCount >= vertsPerLoop) {
const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(indexBuffer));
minima = _mm_min_epu8(data, minima);
maxima = _mm_max_epu8(data, maxima);
indexBuffer += 16;
vertexCount -= vertsPerLoop;
}
minimumIndex = u16(horizontalMin8(minima));
maximumIndex = u16(horizontalMax8(maxima));
}
// If any indices could not be processed because the index count is not a
// multiple of the vector width, process them the naive way, folding them
// into the minimum and maximum computed above
while (vertexCount > 0) {
if constexpr (useShortIndices) {
u16 index = *reinterpret_cast<u16*>(indexBuffer);
minimumIndex = std::min(minimumIndex, index);
maximumIndex = std::max(maximumIndex, index);
indexBuffer += 2;
} else {
u16 index = u16(*indexBuffer++);
minimumIndex = std::min(minimumIndex, index);
maximumIndex = std::max(maximumIndex, index);
}
vertexCount -= 1;
}
return {minimumIndex, maximumIndex};
}
#endif
// Analyzes a PICA index buffer to get the minimum and maximum indices in the
// buffer, and returns them in a pair in the form [min, max]. Takes a template
// parameter to decide whether the indices in the buffer are u8 or u16.
// The implementation is picked at compile time: NEON on ARM64, SSE4.1 on x64 when
// the compiler advertises SSE4.1 or AVX, and the portable scalar loop otherwise.
template <bool useShortIndices>
std::pair<u16, u16> analyze(u8* indexBuffer, u32 vertexCount) {
#if defined(PICA_SIMD_ARM64)
return analyzeNEON<useShortIndices>(indexBuffer, vertexCount);
#elif defined(PICA_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
// Annoyingly, MSVC refuses to define __SSE4_1__ even when we're building with AVX,
// which is why __AVX__ is accepted as well in the check above
return analyzeSSE4_1<useShortIndices>(indexBuffer, vertexCount);
#else
return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
#endif
}
// In some really unfortunate scenarios (eg Android Studio emulator), we don't have access to glDrawRangeElementsBaseVertex
// So we need to subtract the base vertex index from every index in the index buffer ourselves
// This is not really common, so we do it without SIMD for the moment, just to be able to run on Android Studio
// Rewrites the index buffer in place, subtracting baseIndex from each of its indexCount entries.
// useShortIndices selects whether the buffer holds u16 entries (true) or u8 entries (false).
template <bool useShortIndices>
void subtractBaseIndex(u8* indexBuffer, u32 indexCount, u16 baseIndex) {
	if constexpr (!useShortIndices) {
		// 8-bit indices: the base is narrowed to u8 once, before it is subtracted from every entry
		const u8 narrowedBase = u8(baseIndex);
		u8* entry = indexBuffer;
		for (u32 remaining = indexCount; remaining != 0; --remaining, ++entry) {
			*entry -= narrowedBase;
		}
	} else {
		// 16-bit indices: view the byte buffer as an array of u16 entries
		u16* entry = reinterpret_cast<u16*>(indexBuffer);
		for (u32 remaining = indexCount; remaining != 0; --remaining, ++entry) {
			*entry -= baseIndex;
		}
	}
}
} // namespace PICA::IndexBuffer

View file

@ -0,0 +1,57 @@
#pragma once
#include <array>
#include <cassert>
#include <cstring>
#include <type_traits>
#include <unordered_map>
#include "PICA/pica_hash.hpp"
#include "PICA/regs.hpp"
#include "PICA/shader.hpp"
#include "bitfield.hpp"
#include "helpers.hpp"
namespace PICA {
// Configuration struct used as a hash map key: it is compared bytewise by operator== below and hashed
// bytewise by the std::hash specialization that accompanies it.
// NOTE(review): presumably each distinct value identifies one hardware-accelerated vertex shader variant - confirm
// Because every byte of the object takes part in hashing and comparison, the struct must not contain
// compiler-inserted padding with indeterminate contents.
struct VertConfig {
PICAHash::HashType shaderHash;
PICAHash::HashType opdescHash;
u32 entrypoint;
// PICA registers for configuring shader output->fragment semantic mapping
// Value-initialized to zero; the constructor only fills the first outputCount entries
std::array<u32, 7> outmaps{};
u16 outputMask;
u8 outputCount;
bool usingUbershader;
// Pad to 56 bytes so that the compiler won't insert unnecessary padding, which in turn will affect our unordered_map lookup
// As the padding will get hashed and memcmp'd...
u32 pad{};
// Equality operator required by std::unordered_map (along with a hash function)
// Compares the raw bytes of both objects
bool operator==(const VertConfig& config) const {
return std::memcmp(this, &config, sizeof(VertConfig)) == 0;
}
// Builds the config from the shader's code hash, operand descriptor hash and entrypoint,
// plus the shader output registers read from regs
VertConfig(PICAShader& shader, const std::array<u32, 0x300>& regs, bool usingUbershader) : usingUbershader(usingUbershader) {
shaderHash = shader.getCodeHash();
opdescHash = shader.getOpdescHash();
entrypoint = shader.entrypoint;
// Only the low 3 bits of the register are kept, so outputCount is at most 7, the size of outmaps
outputCount = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
// The u32 register value is implicitly narrowed to u16 here
outputMask = regs[PICA::InternalRegs::VertexShaderOutputMask];
for (int i = 0; i < outputCount; i++) {
// Mask out unused bits
outmaps[i] = regs[PICA::InternalRegs::ShaderOutmap0 + i] & 0x1F1F1F1F;
}
}
};
} // namespace PICA
static_assert(sizeof(PICA::VertConfig) == 56);
// Override std::hash for our vertex config class
// std::hash specialization for PICA::VertConfig: hashes the raw bytes of the whole object
template <>
struct std::hash<PICA::VertConfig> {
	std::size_t operator()(const PICA::VertConfig& config) const noexcept {
		const char* rawBytes = reinterpret_cast<const char*>(&config);
		return PICAHash::computeHash(rawBytes, sizeof(PICA::VertConfig));
	}
};

View file

@ -396,6 +396,25 @@ namespace PICA {
GreaterOrEqual = 7,
};
// Logic operation applied when blending is disabled. The 16 values match the 4-bit field that
// pica_frag_config.hpp reads from bits 0-3 of the LogicOp register; when blending is enabled it uses Copy instead.
enum class LogicOpMode : u32 {
Clear = 0,
And = 1,
ReverseAnd = 2,
Copy = 3,
Set = 4,
InvertedCopy = 5,
Nop = 6,
Invert = 7,
Nand = 8,
Or = 9,
Nor = 10,
Xor = 11,
Equiv = 12,
InvertedAnd = 13,
ReverseOr = 14,
InvertedOr = 15,
};
enum class FogMode : u32 {
Disabled = 0,
Fog = 5,

View file

@ -107,6 +107,11 @@ class PICAShader {
alignas(16) std::array<vec4f, 16> inputs; // Attributes passed to the shader
alignas(16) std::array<vec4f, 16> outputs;
alignas(16) vec4f dummy = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()}); // Dummy register used by the JIT
// We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
// We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
using Hash = PICAHash::HashType;
protected:
std::array<u32, 128> operandDescriptors;
@ -125,14 +130,13 @@ class PICAShader {
std::array<CallInfo, 4> callInfo;
ShaderType type;
// We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
// We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
using Hash = PICAHash::HashType;
Hash lastCodeHash = 0; // Last hash computed for the shader code (Used for the JIT caching mechanism)
Hash lastOpdescHash = 0; // Last hash computed for the operand descriptors (Also used for the JIT)
public:
bool uniformsDirty = false;
protected:
bool codeHashDirty = false;
bool opdescHashDirty = false;
@ -284,6 +288,7 @@ class PICAShader {
uniform[2] = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16));
uniform[3] = f24::fromRaw(floatUniformBuffer[0] >> 8);
}
uniformsDirty = true;
}
}
@ -295,6 +300,12 @@ class PICAShader {
u[1] = getBits<8, 8>(word);
u[2] = getBits<16, 8>(word);
u[3] = getBits<24, 8>(word);
uniformsDirty = true;
}
void uploadBoolUniform(u32 value) {
boolUniform = value;
uniformsDirty = true;
}
void run();
@ -302,6 +313,10 @@ class PICAShader {
Hash getCodeHash();
Hash getOpdescHash();
// Returns how big the PICA uniforms are combined. Used for hw accelerated shaders where we upload the uniforms to our GPU.
static constexpr usize totalUniformSize() { return sizeof(floatUniforms) + sizeof(intUniforms) + sizeof(boolUniform); }
void* getUniformPointer() { return static_cast<void*>(&floatUniforms); }
};
static_assert(

View file

@ -1,8 +1,11 @@
#pragma once
#include <fmt/format.h>
#include <map>
#include <set>
#include <string>
#include <tuple>
#include <map>
#include <utility>
#include <vector>
#include "PICA/shader.hpp"
@ -41,9 +44,12 @@ namespace PICA::ShaderGen {
explicit Function(u32 start, u32 end) : start(start), end(end) {}
bool operator<(const Function& other) const { return AddressRange(start, end) < AddressRange(other.start, other.end); }
std::string getIdentifier() const { return "func_" + std::to_string(start) + "_to_" + std::to_string(end); }
std::string getForwardDecl() const { return "void " + getIdentifier() + "();\n"; }
std::string getCallStatement() const { return getIdentifier() + "()"; }
std::string getIdentifier() const { return fmt::format("fn_{}_{}", start, end); }
// To handle weird control flow, we have to return from each function a bool that indicates whether or not the shader reached an end
// instruction and should thus terminate. This is necessary for games like Rayman and Gravity Falls, which have "END" instructions called
// from within functions deep in the callstack
std::string getForwardDecl() const { return fmt::format("bool fn_{}_{}();\n", start, end); }
std::string getCallStatement() const { return fmt::format("fn_{}_{}()", start, end); }
};
std::set<Function> functions{};
@ -93,9 +99,11 @@ namespace PICA::ShaderGen {
API api;
Language language;
bool compilationError = false;
void compileInstruction(u32& pc, bool& finished);
void compileRange(const AddressRange& range);
// Compile range "range" and returns the end PC or if we're "finished" with the program (called an END instruction)
std::pair<u32, bool> compileRange(const AddressRange& range);
void callFunction(const Function& function);
const Function* findFunction(const AddressRange& range);
@ -105,6 +113,7 @@ namespace PICA::ShaderGen {
std::string getDest(u32 dest) const;
std::string getSwizzlePattern(u32 swizzle) const;
std::string getDestSwizzle(u32 destinationMask) const;
const char* getCondition(u32 cond, u32 refX, u32 refY);
void setDest(u32 operandDescriptor, const std::string& dest, const std::string& value);
// Returns if the instruction uses the typical register encodings most instructions use

View file

@ -3,6 +3,7 @@
#include "PICA/gpu.hpp"
#include "PICA/pica_frag_config.hpp"
#include "PICA/pica_vert_config.hpp"
#include "PICA/regs.hpp"
#include "PICA/shader_gen_types.hpp"
#include "helpers.hpp"
@ -25,11 +26,14 @@ namespace PICA::ShaderGen {
bool isSamplerEnabled(u32 environmentID, u32 lutID);
void compileFog(std::string& shader, const PICA::FragmentConfig& config);
void compileLogicOps(std::string& shader, const PICA::FragmentConfig& config);
public:
FragmentGenerator(API api, Language language) : api(api), language(language) {}
std::string generate(const PICA::FragmentConfig& config);
std::string generate(const PICA::FragmentConfig& config, void* driverInfo = nullptr);
std::string getDefaultVertexShader();
// For when PICA shader acceleration is enabled. Turn the PICA shader source into a proper vertex shader
std::string getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader);
void setTarget(API api, Language language) {
this->api = api;

View file

@ -2,10 +2,9 @@
#include "PICA/shader.hpp"
class ShaderUnit {
public:
PICAShader vs; // Vertex shader
PICAShader gs; // Geometry shader
public:
PICAShader vs; // Vertex shader
PICAShader gs; // Geometry shader
ShaderUnit() : vs(ShaderType::Vertex), gs(ShaderType::Geometry) {}
void reset();

99
include/align.hpp Normal file
View file

@ -0,0 +1,99 @@
// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#pragma once
#include <cstdlib>
#include "helpers.hpp"
#ifdef _WIN32
#include <malloc.h>
#endif
namespace Common {
// Returns true when value is an exact multiple of alignment. Works for any non-zero alignment.
template <typename T>
constexpr bool isAligned(T value, unsigned int alignment) {
	const T divisor = static_cast<T>(alignment);
	return (value % divisor) == 0;
}
// Rounds value up to the next multiple of alignment. Works for any non-zero alignment.
template <typename T>
constexpr T alignUp(T value, unsigned int alignment) {
	const T step = static_cast<T>(alignment);
	const T bias = static_cast<T>(alignment - 1);
	// Adding (alignment - 1) first makes the truncating division round up instead of down
	return (value + bias) / step * step;
}
// Rounds `value` down to the nearest multiple of `alignment`.
// Works for any non-zero alignment, not just powers of two.
template <typename T>
constexpr T alignDown(T value, unsigned int alignment) {
	const T step = static_cast<T>(alignment);
	return value / step * step;
}
// Returns true when none of the bits below `alignment` are set in `value`.
// Only meaningful when `alignment` is a power of two.
template <typename T>
constexpr bool isAlignedPow2(T value, unsigned int alignment) {
	const T lowBits = static_cast<T>(alignment - 1);
	return (value & lowBits) == 0;
}
// Rounds `value` up to the next multiple of `alignment` using bit masking.
// Only meaningful when `alignment` is a power of two.
template <typename T>
constexpr T alignUpPow2(T value, unsigned int alignment) {
	const T lowBits = static_cast<T>(alignment - 1);
	const T keepMask = static_cast<T>(~lowBits);
	return (value + lowBits) & keepMask;
}
// Rounds `value` down to a multiple of `alignment` by clearing the low bits.
// Only meaningful when `alignment` is a power of two.
template <typename T>
constexpr T alignDownPow2(T value, unsigned int alignment) {
	const T lowBits = static_cast<T>(alignment - 1);
	const T keepMask = static_cast<T>(~lowBits);
	return value & keepMask;
}
// Returns true when `value` has at most one bit set.
// Note that this also reports true for 0, since 0 & (0 - 1) is 0.
template <typename T>
constexpr bool isPow2(T value) {
	const auto withoutLowestSetBit = value & (value - 1);
	return withoutLowestSetBit == 0;
}
// Returns the largest power of two that is less than or equal to `value`, or 0 when `value` is 0.
// Intended for unsigned integer types.
template <typename T>
constexpr T previousPow2(T value) {
	if (value == static_cast<T>(0)) return 0;

	// Smear the highest set bit into every lower bit position
	value |= (value >> 1);
	value |= (value >> 2);
	value |= (value >> 4);
	// sizeof yields bytes, not bits: the wider shifts are needed for types of at least 2, 4 and 8 bytes respectively
	if constexpr (sizeof(T) >= 2) value |= (value >> 8);
	if constexpr (sizeof(T) >= 4) value |= (value >> 16);
	if constexpr (sizeof(T) >= 8) value |= (value >> 32);

	// value is now of the form 0b0..011..1, so subtracting the lower half leaves only the top bit
	return value - (value >> 1);
}
// Returns the smallest power of two that is greater than or equal to `value`, or 0 when `value` is 0.
// Intended for unsigned integer types.
template <typename T>
constexpr T nextPow2(T value) {
	// https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
	if (value == static_cast<T>(0)) return 0;

	// Subtract one first so that exact powers of two map to themselves
	value--;
	// Smear the highest set bit into every lower bit position
	value |= (value >> 1);
	value |= (value >> 2);
	value |= (value >> 4);
	// sizeof yields bytes, not bits: the wider shifts are needed for types of at least 2, 4 and 8 bytes respectively
	if constexpr (sizeof(T) >= 2) value |= (value >> 8);
	if constexpr (sizeof(T) >= 4) value |= (value >> 16);
	if constexpr (sizeof(T) >= 8) value |= (value >> 32);
	value++;

	return value;
}
// Allocates `size` bytes aligned to `alignment` and returns the pointer, or nullptr if the allocation failed.
// The returned pointer must be released with alignedFree.
ALWAYS_INLINE static void* alignedMalloc(size_t size, size_t alignment) {
#ifdef _WIN32
	return _aligned_malloc(size, alignment);
#else
#ifdef __APPLE__
	// Unaligned sizes are slow on macOS, so round the size up to a multiple of the alignment when we can do so with a mask
	if (isPow2(alignment)) {
		const size_t lowBits = alignment - 1;
		size = (size + lowBits) & ~lowBits;
	}
#endif

	void* block = nullptr;
	if (posix_memalign(&block, alignment, size) != 0) {
		return nullptr;
	}

	return block;
#endif
}
// Releases memory previously obtained from alignedMalloc.
ALWAYS_INLINE static void alignedFree(void* ptr) {
	// This check must mirror the one in alignedMalloc: memory from _aligned_malloc may only be released with _aligned_free,
	// including on Windows toolchains that do not define _MSC_VER
#ifdef _WIN32
	_aligned_free(ptr);
#else
	free(ptr);
#endif
}
} // namespace Common

View file

@ -324,8 +324,8 @@ namespace Audio::HLE {
BitField<15, 1, u32> outputBufferCountDirty;
BitField<16, 1, u32> masterVolumeDirty;
BitField<24, 1, u32> auxReturnVolume0Dirty;
BitField<25, 1, u32> auxReturnVolume1Dirty;
BitField<24, 1, u32> auxVolume0Dirty;
BitField<25, 1, u32> auxVolume1Dirty;
BitField<26, 1, u32> outputFormatDirty;
BitField<27, 1, u32> clippingModeDirty;
BitField<28, 1, u32> headphonesConnectedDirty;
@ -337,7 +337,7 @@ namespace Audio::HLE {
/// The DSP has three intermediate audio mixers. This controls the volume level (0.0-1.0) for
/// each at the final mixer.
float_le masterVolume;
std::array<float_le, 2> auxReturnVolume;
std::array<float_le, 2> auxVolumes;
u16_le outputBufferCount;
u16 pad1[2];
@ -422,7 +422,7 @@ namespace Audio::HLE {
struct DspStatus {
u16_le unknown;
u16_le dropped_frames;
u16_le droppedFrames;
u16 pad0[0xE];
};
ASSERT_DSP_STRUCT(DspStatus, 32);

View file

@ -95,8 +95,7 @@ namespace Audio {
DSPSource() { reset(); }
};
class HLE_DSP : public DSPCore {
// The audio frame types are public in case we want to use them for unit tests
class DSPMixer {
public:
template <typename T, usize channelCount = 1>
using Sample = std::array<T, channelCount>;
@ -113,6 +112,43 @@ namespace Audio {
template <typename T>
using QuadFrame = Frame<T, 4>;
private:
using ChannelFormat = HLE::DspConfiguration::OutputFormat;
// The audio from each DSP voice is converted to quadraphonic and then fed into 3 intermediate mixing stages
// Two of these intermediate mixers (second and third) are used for effects, including custom effects done on the CPU
static constexpr usize mixerStageCount = 3;
public:
ChannelFormat channelFormat = ChannelFormat::Stereo;
std::array<float, mixerStageCount> volumes;
std::array<bool, 2> enableAuxStages;
void reset() {
channelFormat = ChannelFormat::Stereo;
volumes.fill(0.0);
enableAuxStages.fill(false);
}
};
class HLE_DSP : public DSPCore {
// The audio frame types are public in case we want to use them for unit tests
public:
template <typename T, usize channelCount = 1>
using Sample = DSPMixer::Sample<T, channelCount>;
template <typename T, usize channelCount>
using Frame = DSPMixer::Frame<T, channelCount>;
template <typename T>
using MonoFrame = DSPMixer::MonoFrame<T>;
template <typename T>
using StereoFrame = DSPMixer::StereoFrame<T>;
template <typename T>
using QuadFrame = DSPMixer::QuadFrame<T>;
using Source = Audio::DSPSource;
using SampleBuffer = Source::SampleBuffer;
@ -131,6 +167,7 @@ namespace Audio {
std::array<Source, Audio::HLE::sourceCount> sources; // DSP voices
Audio::HLE::DspMemory dspRam;
Audio::DSPMixer mixer;
std::unique_ptr<Audio::AAC::Decoder> aacDecoder;
void resetAudioPipe();
@ -175,10 +212,13 @@ namespace Audio {
void handleAACRequest(const AAC::Message& request);
void updateSourceConfig(Source& source, HLE::SourceConfiguration::Configuration& config, s16_le* adpcmCoefficients);
void updateMixerConfig(HLE::SharedMemory& sharedMem);
void generateFrame(StereoFrame<s16>& frame);
void generateFrame(DSPSource& source);
void outputFrame();
// Perform the final mix, mixing the quadraphonic samples from all voices into the output audio frame
void performMix(Audio::HLE::SharedMemory& readRegion, Audio::HLE::SharedMemory& writeRegion);
// Decode an entire buffer worth of audio
void decodeBuffer(DSPSource& source);

View file

@ -20,18 +20,20 @@ struct EmulatorConfig {
#else
static constexpr bool ubershaderDefault = true;
#endif
static constexpr bool accelerateShadersDefault = true;
bool shaderJitEnabled = shaderJitDefault;
bool discordRpcEnabled = false;
bool useUbershaders = ubershaderDefault;
bool accelerateShaders = accelerateShadersDefault;
bool accurateShaderMul = false;
bool discordRpcEnabled = false;
// Toggles whether to force shadergen when there's more than N lights active and we're using the ubershader, for better performance
bool forceShadergenForLights = true;
int lightShadergenThreshold = 1;
RendererType rendererType = RendererType::OpenGL;
Audio::DSPCore::Type dspType = Audio::DSPCore::Type::Null;
Audio::DSPCore::Type dspType = Audio::DSPCore::Type::HLE;
bool sdCardInserted = true;
bool sdWriteProtected = false;

View file

@ -298,5 +298,5 @@ private:
bool allocateMainThreadStack(u32 size);
Regions getConsoleRegion();
void copySharedFont(u8* ptr);
void copySharedFont(u8* ptr, u32 vaddr);
};

View file

@ -35,4 +35,35 @@ namespace Renderdoc {
static void setOutputDir(const std::string& path, const std::string& prefix) {}
static constexpr bool isSupported() { return false; }
} // namespace Renderdoc
#endif
#endif
namespace Renderdoc {
// RAII scope class that encloses a Renderdoc capture, as long as it's triggered by triggerCapture
// Calls Renderdoc::startCapture on construction and Renderdoc::endCapture on destruction
struct Scope {
	Scope() { Renderdoc::startCapture(); }
	~Scope() { Renderdoc::endCapture(); }

	// Neither copyable nor movable, so that every startCapture is paired with exactly one endCapture
	Scope(const Scope&) = delete;
	Scope& operator=(const Scope&) = delete;

	Scope(Scope&&) = delete;
	Scope& operator=(Scope&&) = delete;
};
// RAII scope class that encloses a Renderdoc capture. Unlike regular Scope it doesn't wait for a trigger, it will always issue the capture
// trigger on its own and take a capture
struct InstantScope {
	InstantScope() {
		// Issue the trigger ourselves before starting the capture
		Renderdoc::triggerCapture();
		Renderdoc::startCapture();
	}

	~InstantScope() { Renderdoc::endCapture(); }

	// Neither copyable nor movable, so that every startCapture is paired with exactly one endCapture
	InstantScope(const InstantScope&) = delete;
	InstantScope& operator=(const InstantScope&) = delete;

	InstantScope(InstantScope&&) = delete;
	InstantScope& operator=(InstantScope&&) = delete;
};
} // namespace Renderdoc

View file

@ -1,9 +1,10 @@
#pragma once
#include <array>
#include <optional>
#include <span>
#include <string>
#include <optional>
#include "PICA/draw_acceleration.hpp"
#include "PICA/pica_vertex.hpp"
#include "PICA/regs.hpp"
#include "helpers.hpp"
@ -22,9 +23,11 @@ enum class RendererType : s8 {
};
struct EmulatorConfig;
class GPU;
struct SDL_Window;
class GPU;
class ShaderUnit;
class Renderer {
protected:
GPU& gpu;
@ -78,7 +81,11 @@ class Renderer {
virtual std::string getUbershader() { return ""; }
virtual void setUbershader(const std::string& shader) {}
virtual void setUbershaderSetting(bool value) {}
// This function is called on every draw call before parsing vertex data.
// It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between
// ubershaders and shadergen, and so on.
// Returns whether this draw is eligible for using hardware-accelerated shaders or if shaders should run on the CPU
virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) { return false; }
// Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window
#ifdef PANDA3DS_FRONTEND_QT

View file

@ -0,0 +1,12 @@
#pragma once
// Information about our OpenGL/OpenGL ES driver that we should keep track of
// Stuff like whether specific extensions are supported, and potentially things like OpenGL context information
namespace OpenGL {
	struct Driver {
		// Per-extension framebuffer fetch support flags. Both default to unsupported.
		// NOTE(review): presumably these are filled in while initializing the GL context; the code doing so is not in this header
		bool supportsExtFbFetch = false;
		bool supportsArmFbFetch = false;

		// Returns true if at least one of the framebuffer fetch flavours above is supported
		bool supportFbFetch() const {
			if (supportsExtFbFetch) {
				return true;
			}

			return supportsArmFbFetch;
		}
	};
}  // namespace OpenGL

View file

@ -38,7 +38,6 @@ struct GLStateManager {
GLuint stencilMask;
GLuint boundVAO;
GLuint boundVBO;
GLuint currentProgram;
GLuint boundUBO;
@ -173,13 +172,6 @@ struct GLStateManager {
}
}
void bindVBO(GLuint handle) {
if (boundVBO != handle) {
boundVBO = handle;
glBindBuffer(GL_ARRAY_BUFFER, handle);
}
}
void useProgram(GLuint handle) {
if (currentProgram != handle) {
currentProgram = handle;
@ -195,7 +187,6 @@ struct GLStateManager {
}
void bindVAO(const OpenGL::VertexArray& vao) { bindVAO(vao.handle()); }
void bindVBO(const OpenGL::VertexBuffer& vbo) { bindVBO(vbo.handle()); }
void useProgram(const OpenGL::Program& program) { useProgram(program.handle()); }
void setColourMask(bool r, bool g, bool b, bool a) {

View file

@ -3,15 +3,21 @@
#include <array>
#include <cstring>
#include <functional>
#include <memory>
#include <optional>
#include <span>
#include <unordered_map>
#include <utility>
#include "PICA/float_types.hpp"
#include "PICA/pica_frag_config.hpp"
#include "PICA/pica_hash.hpp"
#include "PICA/pica_vert_config.hpp"
#include "PICA/pica_vertex.hpp"
#include "PICA/regs.hpp"
#include "PICA/shader_gen.hpp"
#include "gl/stream_buffer.h"
#include "gl_driver.hpp"
#include "gl_state.hpp"
#include "helpers.hpp"
#include "logger.hpp"
@ -28,9 +34,11 @@ class RendererGL final : public Renderer {
OpenGL::Program triangleProgram;
OpenGL::Program displayProgram;
OpenGL::VertexArray vao;
// VAO for when not using accelerated vertex shaders. Contains attribute declarations matching the PICA fixed function fragment attributes
OpenGL::VertexArray defaultVAO;
// VAO for when using accelerated vertex shaders. The PICA vertex shader inputs are passed as attributes without CPU processing.
OpenGL::VertexArray hwShaderVAO;
OpenGL::VertexBuffer vbo;
bool enableUbershader = true;
// Data
struct {
@ -53,6 +61,21 @@ class RendererGL final : public Renderer {
float oldDepthScale = -1.0;
float oldDepthOffset = 0.0;
bool oldDepthmapEnable = false;
// Set by prepareForDraw, tells us whether the current draw is using hw-accelerated shader
bool usingAcceleratedShader = false;
bool performIndexedRender = false;
bool usingShortIndices = false;
// Set by prepareForDraw, metadata for indexed renders
GLuint minimumIndex = 0;
GLuint maximumIndex = 0;
void* hwIndexBufferOffset = nullptr;
// When doing hw shaders, we cache which attributes are enabled in our VAO to avoid having to enable/disable all attributes on each draw
u32 previousAttributeMask = 0;
// Cached pointer to the current vertex shader when using HW accelerated shaders
OpenGL::Shader* generatedVertexShader = nullptr;
SurfaceCache<DepthBuffer, 16, true> depthBufferCache;
SurfaceCache<ColourBuffer, 16, true> colourBufferCache;
@ -70,18 +93,58 @@ class RendererGL final : public Renderer {
// We can compile this once and then link it with all other generated fragment shaders
OpenGL::Shader defaultShadergenVs;
GLuint shadergenFragmentUBO;
// UBO for uploading the PICA uniforms when using hw shaders
GLuint hwShaderUniformUBO;
using StreamBuffer = OpenGLStreamBuffer;
std::unique_ptr<StreamBuffer> hwVertexBuffer;
std::unique_ptr<StreamBuffer> hwIndexBuffer;
// Cache of fixed attribute values so that we don't do any duplicate updates
std::array<std::array<float, 4>, 16> fixedAttrValues;
// Cached recompiled fragment shader
struct CachedProgram {
OpenGL::Program program;
};
std::unordered_map<PICA::FragmentConfig, CachedProgram> shaderCache;
// Holds every compiled shader object and linked program so they can be looked up again and released together
struct ShaderCache {
	// Vertex shaders keyed by their config. An entry may hold no shader, in which case there is nothing to free for it
	std::unordered_map<PICA::VertConfig, std::optional<OpenGL::Shader>> vertexShaderCache;
	// Fragment shaders keyed by their config
	std::unordered_map<PICA::FragmentConfig, OpenGL::Shader> fragmentShaderCache;

	// Program cache indexed by GLuints for the vertex and fragment shader to use
	// Top 32 bits are the vertex shader GLuint, bottom 32 bits are the fs GLuint
	std::unordered_map<u64, CachedProgram> programCache;

	// Frees every cached GL object (programs first, then vertex and fragment shaders) and empties all three maps
	void clear() {
		for (auto& [shaderPair, cached] : programCache) {
			cached.program.free();
		}

		for (auto& [vertConfig, maybeShader] : vertexShaderCache) {
			if (maybeShader) {
				maybeShader->free();
			}
		}

		for (auto& [fragConfig, shader] : fragmentShaderCache) {
			shader.free();
		}

		programCache.clear();
		vertexShaderCache.clear();
		fragmentShaderCache.clear();
	}
};
ShaderCache shaderCache;
OpenGL::Framebuffer getColourFBO();
OpenGL::Texture getTexture(Texture& tex);
OpenGL::Program& getSpecializedShader();
PICA::ShaderGen::FragmentGenerator fragShaderGen;
OpenGL::Driver driverInfo;
MAKE_LOG_FUNCTION(log, rendererLogger)
void setupBlending();
@ -93,6 +156,8 @@ class RendererGL final : public Renderer {
void updateFogLUT();
void initGraphicsContextInternal();
void accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel);
public:
RendererGL(GPU& gpu, const std::array<u32, regNum>& internalRegs, const std::array<u32, extRegNum>& externalRegs)
: Renderer(gpu, internalRegs, externalRegs), fragShaderGen(PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL) {}
@ -110,15 +175,13 @@ class RendererGL final : public Renderer {
virtual bool supportsShaderReload() override { return true; }
virtual std::string getUbershader() override;
virtual void setUbershader(const std::string& shader) override;
virtual void setUbershaderSetting(bool value) override { enableUbershader = value; }
virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) override;
std::optional<ColourBuffer> getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true);
// Note: The caller is responsible for deleting the currently bound FBO before calling this
void setFBO(uint handle) { screenFramebuffer.m_handle = handle; }
void resetStateManager() { gl.reset(); }
void clearShaderCache();
void initUbershader(OpenGL::Program& program);
#ifdef PANDA3DS_FRONTEND_QT

View file

@ -2,31 +2,37 @@
#include <cmath>
#include <glm/glm.hpp>
#include <numbers>
#include "helpers.hpp"
#include "services/hid.hpp"
// Convert SDL sensor readings to 3DS format
// We use the same code for Android as well, since the values we get from Android are in the same format as SDL (m/s^2 for acceleration, rad/s for
// rotation)
namespace Sensors::SDL {
// Convert the rotation data we get from SDL sensor events to rotation data we can feed right to HID
// Returns [pitch, roll, yaw]
static glm::vec3 convertRotation(glm::vec3 rotation) {
// Convert the rotation from rad/s to deg/s and scale by the gyroscope coefficient in HID
constexpr float scale = 180.f / std::numbers::pi * HIDService::gyroscopeCoeff;
// The axes are also inverted, so invert scale before the multiplication.
return rotation * -scale;
}
// Convert the rotation data we get from SDL sensor events to rotation data we can feed right to HID
// Returns [pitch, roll, yaw]
static glm::vec3 convertRotation(glm::vec3 rotation) {
// Annoyingly, Android doesn't support the <numbers> header yet so we define pi ourselves
static constexpr double pi = 3.141592653589793;
// Convert the rotation from rad/s to deg/s and scale by the gyroscope coefficient in HID
constexpr float scale = 180.f / pi * HIDService::gyroscopeCoeff;
// The axes are also inverted, so invert scale before the multiplication.
return rotation * -scale;
}
static glm::vec3 convertAcceleration(float* data) {
// Set our cap to ~9 m/s^2. The 3DS sensors cap at -930 and +930, so values above this value will get clamped to 930
// At rest (3DS laid flat on table), hardware reads around ~0 for x and z axis, and around ~480 for y axis due to gravity.
// This code tries to mimic this approximately, with offsets based on measurements from my DualShock 4.
static constexpr float accelMax = 9.f;
static glm::vec3 convertAcceleration(float* data) {
// Set our cap to ~9 m/s^2. The 3DS sensors cap at -930 and +930, so values above this value will get clamped to 930
// At rest (3DS laid flat on table), hardware reads around ~0 for x and z axis, and around ~480 for y axis due to gravity.
// This code tries to mimic this approximately, with offsets based on measurements from my DualShock 4.
static constexpr float accelMax = 9.f;
// We define standard gravity(g) ourself instead of using the SDL one in order for the code to work on Android too.
static constexpr float standardGravity = 9.80665f;
s16 x = std::clamp<s16>(s16(data[0] / accelMax * 930.f), -930, +930);
s16 y = std::clamp<s16>(s16(data[1] / (SDL_STANDARD_GRAVITY * accelMax) * 930.f - 350.f), -930, +930);
s16 z = std::clamp<s16>(s16((data[2] - 2.1f) / accelMax * 930.f), -930, +930);
s16 x = std::clamp<s16>(s16(data[0] / accelMax * 930.f), -930, +930);
s16 y = std::clamp<s16>(s16(data[1] / (standardGravity * accelMax) * 930.f - 350.f), -930, +930);
s16 z = std::clamp<s16>(s16((data[2] - 2.1f) / accelMax * 930.f), -930, +930);
return glm::vec3(x, y, z);
}
return glm::vec3(x, y, z);
}
} // namespace Sensors::SDL

View file

@ -0,0 +1,84 @@
// Copyright 2016 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
// Adapted from https://github.com/PabloMK7/citra/blob/master/src/core/hle/service/apt/bcfnt/bcfnt.h
#pragma once
#include <memory>
#include "helpers.hpp"
#include "swap.hpp"
// Layout structs for the sections of a BCFNT font, plus the relocation entry point for the shared font.
// NOTE(review): field order and widths presumably mirror the binary font format (adapted from Citra's bcfnt.h, see the
// header comment) - do not reorder or resize fields without confirming against that reference.
namespace HLE::Fonts {
// Top-level font header: magic, endianness marker, header/file sizes and the number of blocks that follow
struct CFNT {
u8 magic[4];
u16_le endianness;
u16_le headerSize;
u32_le version;
u32_le fileSize;
u32_le numBlocks;
};
// Common prefix shared by every section below: a 4-byte magic followed by the section size
struct SectionHeader {
u8 magic[4];
u32_le sectionSize;
};
// Font information section. Holds general font metrics along with the offsets of the TGLP, CWDH and CMAP sections
// NOTE(review): these offsets are presumably among the "internal addresses" that relocateSharedFont rewrites - confirm in the implementation
struct FINF {
u8 magic[4];
u32_le sectionSize;
u8 fontType;
u8 lineFeed;
u16_le alterCharIndex;
u8 default_width[3];
u8 encoding;
u32_le tglpOffset;
u32_le cwdhOffset;
u32_le cmapOffset;
u8 height;
u8 width;
u8 ascent;
u8 reserved;
};
// Glyph sheet section: cell metrics, sheet count/format/dimensions and the offset of the sheet data
struct TGLP {
u8 magic[4];
u32_le sectionSize;
u8 cellWidth;
u8 cellHeight;
u8 baselinePosition;
u8 maxCharacterWidth;
u32_le sheetSize;
u16_le numSheets;
u16_le sheetImageFormat;
u16_le numColumns;
u16_le numRows;
u16_le sheetWidth;
u16_le sheetHeight;
u32_le sheetDataOffset;
};
// Character map section covering the code range [codeBegin, codeEnd]. nextCmapOffset links to a further CMAP section
// NOTE(review): presumably a value of 0 in nextCmapOffset terminates the chain - confirm in the implementation
struct CMAP {
u8 magic[4];
u32_le sectionSize;
u16_le codeBegin;
u16_le codeEnd;
u16_le mappingMethod;
u16_le reserved;
u32_le nextCmapOffset;
};
// Character width section covering the glyph index range [startIndex, endIndex]. nextCwdhOffset links to a further CWDH section
// NOTE(review): presumably a value of 0 in nextCwdhOffset terminates the chain - confirm in the implementation
struct CWDH {
u8 magic[4];
u32_le sectionSize;
u16_le startIndex;
u16_le endIndex;
u32_le nextCwdhOffset;
};
// Relocates the internal addresses of the BCFNT Shared Font to the new base. The current base will
// be auto-detected based on the file headers.
// sharedFont: pointer to the start of the shared font data, which is modified in place
// newAddress: the base address the font's internal addresses should be relative to after relocation
void relocateSharedFont(u8* sharedFont, u32 newAddress);
} // namespace HLE::Fonts

View file

@ -1,5 +1,5 @@
# Panda3DS
[![Windows Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Windows_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Windows_Build.yml) [![MacOS Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml) [![Linux Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml) [![AUR Package](https://img.shields.io/aur/version/panda3ds-git)](https://aur.archlinux.org/packages/panda3ds-git)
[![Windows Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Windows_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Windows_Build.yml) [![MacOS Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml) [![Android Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Android_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Android_Build.yml) [![Linux Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml) [![AUR Package](https://img.shields.io/aur/version/panda3ds-git)](https://aur.archlinux.org/packages/panda3ds-git)
Panda3DS is an HLE, red-panda-themed Nintendo 3DS emulator written in C++ which started out as a fun project out of curiosity, but evolved into something that can sort of play games!
@ -10,7 +10,7 @@ Join our Discord server by pressing on the banner below, or find us on other pla
[![Discord Banner 2](https://discord.com/api/guilds/1118695732958994532/widget.png?style=banner2)](https://discord.gg/ZYbugsEmsw)
![screenshot1](docs/img/KirbyRobobot.png) ![screenshot2](docs/img/OoT_Title.png) ![screenshot3](docs/img/pokegang.png)
![screenshot1](docs/img/KirbyRobobot.png) ![screenshot2](docs/img/OoT_Title.png) ![screenshot3](docs/img/pokegang.png) ![screenshot4](docs/img/KirbyAndroid.png)
# Download
You can download stable builds from the Releases tab, or you can download the latest build from the tables below. Additionally, Panda3DS comes in 2 flavours on PC: A minimal SDL frontend, which does not have a GUI, and an experimental Qt 6 frontend with a proper user interface.
@ -22,16 +22,16 @@ SDL builds (No GUI):
|MacOS build|[![MacOS Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/MacOS_Build.yml)|[MacOS App Bundle](https://nightly.link/wheremyfoodat/Panda3DS/workflows/MacOS_Build/master/MacOS%20Alber%20App%20Bundle.zip)|
|Linux build|[![Linux Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml/badge.svg?branch=master)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Linux_Build.yml)|[Linux AppImage](https://nightly.link/wheremyfoodat/Panda3DS/workflows/Linux_AppImage_Build/master/Linux%20executable.zip)|
Qt builds:
Qt and Android builds:
|Platform|Status|Download|
|--------|------------|--------|
|Windows build|[![Qt Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml/badge.svg)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml)|[Windows Executable](https://nightly.link/wheremyfoodat/Panda3DS/workflows/Qt_Build/master/Windows%20executable.zip)|
|MacOS build|[![Qt Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml/badge.svg)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml)|[MacOS App Bundle](https://nightly.link/wheremyfoodat/Panda3DS/workflows/Qt_Build/master/MacOS%20Alber%20App%20Bundle.zip)|
|Linux build|[![Qt Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml/badge.svg)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Qt_Build.yml)|[Linux AppImage](https://nightly.link/wheremyfoodat/Panda3DS/workflows/Qt_Build/master/Linux%20executable.zip)|
|Android build (arm64)|[![Android Build](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Android_Build.yml/badge.svg)](https://github.com/wheremyfoodat/Panda3DS/actions/workflows/Android_Build.yml)|[Android APK](https://nightly.link/wheremyfoodat/Panda3DS/workflows/Android_Build/master/Android%20APKs%20(arm64).zip)|
# Compatibility
Panda3DS is still in the early stages of development. Many games boot, many don't. Lots of games have at least some hilariously broken graphics, audio is not supported, and some QoL features (including a GUI) are missing. However, even more things are implemented, such as most of the 3DS core required to play games, and various neat features, such as Lua scripting, discord bot support, support for some system apps, cheats, controller support, WIP amiibo support and many more! The emulator is constantly evolving, so make sure to take a peek every now and then!
Panda3DS is still in the early stages of development. Many games boot, many don't. Lots of games have at least some hilariously broken graphics, audio is WIP, and some QoL features are missing. However, even more things are implemented, such as most of the 3DS core required to play games, and various neat features, such as Lua scripting, discord bot support, support for some system apps, cheats, controller support, WIP amiibo support and many more! The emulator is constantly evolving, so make sure to take a peek every now and then!
For documenting game compatibility, make sure to visit the [games list repository](https://github.com/Panda3DS-emu/Panda3DS-Games-List). For miscellaneous issues or more technical issues, feel free to use this repo's issues tab.
# Why?
@ -116,7 +116,7 @@ Panda3DS also supports controller input using the SDL2 GameController API.
- [MelonDS](https://github.com/melonDS-emu/melonDS): "DS emulator, sorta" - Arisotura
- [Kaizen](https://github.com/SimoneN64/Kaizen): Experimental work-in-progress low-level N64 emulator
- [ChonkyStation](https://github.com/liuk7071/ChonkyStation): Work-in-progress PlayStation emulator
- [shadPS4](https://github.com/georgemoralis/shadPS4): Work-in-progress PS4 emulator by the founder of PCSX, PCSX2 and more
- [shadPS4](https://github.com/shadps4-emu/shadPS4): Work-in-progress PS4 emulator by the founder of PCSX, PCSX2 and more
- [Hydra](https://github.com/hydra-emu/hydra): Cross-platform GameBoy, NES, N64 and Chip-8 emulator
# Support

View file

@ -67,6 +67,7 @@ void EmulatorConfig::load() {
vsyncEnabled = toml::find_or<toml::boolean>(gpu, "EnableVSync", true);
useUbershaders = toml::find_or<toml::boolean>(gpu, "UseUbershaders", ubershaderDefault);
accurateShaderMul = toml::find_or<toml::boolean>(gpu, "AccurateShaderMultiplication", false);
accelerateShaders = toml::find_or<toml::boolean>(gpu, "AccelerateShaders", accelerateShadersDefault);
forceShadergenForLights = toml::find_or<toml::boolean>(gpu, "ForceShadergenForLighting", true);
lightShadergenThreshold = toml::find_or<toml::integer>(gpu, "ShadergenLightThreshold", 1);
@ -79,7 +80,7 @@ void EmulatorConfig::load() {
if (audioResult.is_ok()) {
auto audio = audioResult.unwrap();
auto dspCoreName = toml::find_or<std::string>(audio, "DSPEmulation", "Null");
auto dspCoreName = toml::find_or<std::string>(audio, "DSPEmulation", "HLE");
dspType = Audio::DSPCore::typeFromString(dspCoreName);
audioEnabled = toml::find_or<toml::boolean>(audio, "EnableAudio", false);
}
@ -141,6 +142,7 @@ void EmulatorConfig::save() {
data["GPU"]["UseUbershaders"] = useUbershaders;
data["GPU"]["ForceShadergenForLighting"] = forceShadergenForLights;
data["GPU"]["ShadergenLightThreshold"] = lightShadergenThreshold;
data["GPU"]["AccelerateShaders"] = accelerateShaders;
data["GPU"]["EnableRenderdoc"] = enableRenderdoc;
data["Audio"]["DSPEmulation"] = std::string(Audio::DSPCore::typeToString(dspType));

View file

@ -0,0 +1,137 @@
#include "PICA/draw_acceleration.hpp"
#include <bit>
#include <tuple>
#include "PICA/gpu.hpp"
#include "PICA/pica_simd.hpp"
#include "PICA/regs.hpp"
// Fills `accel` with everything needed to perform the current draw with vertex fetch and vertex shading done on the host GPU:
// - The range of vertex indices used by the draw (and the index buffer pointer/format for indexed draws)
// - One entry per non-empty attribute loader, with a pointer to its data and the number of bytes to upload
// - The layout (offset/stride/type/component count) of every enabled vertex shader input register
// - The values of fixed attributes whose input register is not already fed by a loader
// accel: Output structure, overwritten by this function
// indexed: Whether the draw reads its vertices through an index buffer
void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
	accel.indexed = indexed;
	accel.totalAttribCount = totalAttribCount;
	accel.enabledAttributeMask = 0;

	const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16;
	const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];  // Total # of vertices to transfer

	if (indexed) {
		u32 indexBufferConfig = regs[PICA::InternalRegs::IndexBufferConfig];
		u32 indexBufferPointer = vertexBase + (indexBufferConfig & 0xfffffff);

		u8* indexBuffer = getPointerPhys<u8>(indexBufferPointer);

		// Check whether the index buffer uses u16 indices or u8
		accel.useShortIndices = Helpers::getBit<31>(indexBufferConfig);  // Indicates whether vert indices are 16-bit or 8-bit

		// Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them
		if (accel.useShortIndices) {
			std::tie(accel.minimumIndex, accel.maximumIndex) = PICA::IndexBuffer::analyze<true>(indexBuffer, vertexCount);
		} else {
			std::tie(accel.minimumIndex, accel.maximumIndex) = PICA::IndexBuffer::analyze<false>(indexBuffer, vertexCount);
		}

		accel.indexBuffer = indexBuffer;
	} else {
		// Non-indexed draws read a contiguous range of vertices starting at the vertex offset register
		accel.indexBuffer = nullptr;
		accel.minimumIndex = regs[PICA::InternalRegs::VertexOffsetReg];
		accel.maximumIndex = accel.minimumIndex + vertexCount - 1;
	}

	const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32);
	const u64 inputAttrCfg = getVertexShaderInputConfig();

	// Byte offset of the current loader's data, counting the data of all loaders before it
	u32 loaderOffset = 0;
	accel.vertexDataSize = 0;
	accel.totalLoaderCount = 0;

	for (int i = 0; i < PICA::DrawAcceleration::maxLoaderCount; i++) {
		auto& loaderData = attributeInfo[i];  // Get information for this attribute loader

		// This loader is empty, skip it
		if (loaderData.componentCount == 0 || loaderData.size == 0) {
			continue;
		}

		auto& loader = accel.loaders[accel.totalLoaderCount++];

		// The size of the loader in bytes is equal to the bytes supplied for 1 vertex, multiplied by the number of vertices we'll be uploading
		// Which is equal to maximumIndex - minimumIndex + 1
		const u32 bytes = loaderData.size * (accel.maximumIndex - accel.minimumIndex + 1);
		loader.size = bytes;

		// Add it to the total vertex data size, aligned to 4 bytes.
		accel.vertexDataSize += (bytes + 3) & ~3;

		// Get a pointer to the data where this loader's data is stored
		const u32 loaderAddress = vertexBase + loaderData.offset + (accel.minimumIndex * loaderData.size);
		loader.data = getPointerPhys<u8>(loaderAddress);

		u64 attrCfg = loaderData.getConfigFull();  // Get config1 | (config2 << 32)
		u32 attributeOffset = 0;

		for (int component = 0; component < loaderData.componentCount; component++) {
			uint attributeIndex = (attrCfg >> (component * 4)) & 0xf;  // Get index of attribute in vertexCfg

			// Vertex attributes used as padding
			// 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively
			if (attributeIndex >= 12) [[unlikely]] {
				// Align attribute address up to a 4 byte boundary
				attributeOffset = (attributeOffset + 3) & -4;
				attributeOffset += (attributeIndex - 11) << 2;
				continue;
			}

			const u32 attribInfo = (vertexCfg >> (attributeIndex * 4)) & 0xf;
			const u32 attribType = attribInfo & 0x3;  // Type of attribute (sbyte/ubyte/short/float)
			const u32 size = (attribInfo >> 2) + 1;   // Total number of components

			// Size of each component based on the attribute type
			static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
			const u32 inputReg = (inputAttrCfg >> (attributeIndex * 4)) & 0xf;
			// Mark the attribute as enabled
			accel.enabledAttributeMask |= 1 << inputReg;

			auto& attr = accel.attributeInfo[inputReg];
			attr.componentCount = size;
			attr.offset = attributeOffset + loaderOffset;
			attr.stride = loaderData.size;
			attr.type = attribType;
			attributeOffset += size * sizePerComponent[attribType];
		}

		loaderOffset += loader.size;
	}

	u32 fixedAttributes = fixedAttribMask;
	accel.fixedAttributes = 0;

	// Fetch values for all fixed attributes using CTZ on the fixed attribute mask to find the attributes that are actually fixed
	while (fixedAttributes != 0) {
		// Get index of next fixed attribute and turn it off
		const u32 index = std::countr_zero<u32>(fixedAttributes);
		const u32 mask = 1u << index;
		fixedAttributes ^= mask;

		// PICA register this fixed attribute is meant to go to
		const u32 inputReg = (inputAttrCfg >> (index * 4)) & 0xf;
		const u32 inputRegMask = 1u << inputReg;

		// If this input reg is already used for a non-fixed attribute then it will not be replaced by a fixed attribute
		if ((accel.enabledAttributeMask & inputRegMask) == 0) {
			vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[index];
			auto& attr = accel.attributeInfo[inputReg];

			accel.fixedAttributes |= inputRegMask;

			for (int i = 0; i < 4; i++) {
				attr.fixedValue[i] = fixedAttr[i].toFloat32();
			}
		}
	}

	accel.canBeAccelerated = true;
}

View file

@ -126,37 +126,62 @@ void GPU::reset() {
externalRegs[Framebuffer1Config] = static_cast<u32>(PICA::ColorFmt::RGB8);
externalRegs[Framebuffer1Select] = 0;
renderer->setUbershaderSetting(config.useUbershaders);
renderer->reset();
}
// Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
// And whether we are going to use the shader JIT (second template parameter)
void GPU::drawArrays(bool indexed) {
const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
if (indexed) {
if (shaderJITEnabled)
drawArrays<true, true>();
else
drawArrays<true, false>();
} else {
if (shaderJITEnabled)
drawArrays<false, true>();
else
drawArrays<false, false>();
}
}
static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
template <bool indexed, bool useShaderJIT>
void GPU::drawArrays() {
if constexpr (useShaderJIT) {
shaderJIT.prepare(shaderUnit.vs);
// Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
// And whether we are going to use the shader JIT (second template parameter)
void GPU::drawArrays(bool indexed) {
PICA::DrawAcceleration accel;
if (config.accelerateShaders) {
// If we are potentially going to use hw shaders, gather necessary to do vertex fetch, index buffering, etc on the GPU
// This includes parsing which vertices to upload, getting pointers to the index buffer data & vertex data, and so on
getAcceleratedDrawInfo(accel, indexed);
}
setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]);
const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel);
if (hwShaders) {
// Hardware shaders have their own accelerated code path for draws, so they skip everything here
const PICA::PrimType primType = static_cast<PICA::PrimType>(Helpers::getBits<8, 2>(regs[PICA::InternalRegs::PrimitiveConfig]));
// Total # of vertices to render
const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];
// Note: In the hardware shader path the vertices span shouldn't actually be used as the renderer will perform its own attribute fetching
renderer->drawVertices(primType, std::span(vertices).first(vertexCount));
} else {
const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
if (indexed) {
if (shaderJITEnabled) {
drawArrays<true, ShaderExecMode::JIT>();
} else {
drawArrays<true, ShaderExecMode::Interpreter>();
}
} else {
if (shaderJITEnabled) {
drawArrays<false, ShaderExecMode::JIT>();
} else {
drawArrays<false, ShaderExecMode::Interpreter>();
}
}
}
}
template <bool indexed, ShaderExecMode mode>
void GPU::drawArrays() {
if constexpr (mode == ShaderExecMode::JIT) {
shaderJIT.prepare(shaderUnit.vs);
} else if constexpr (mode == ShaderExecMode::Hardware) {
// Hardware shaders have their own accelerated code path for draws, so they're not meant to take this path
Helpers::panic("GPU::DrawArrays: Hardware shaders shouldn't take this path!");
}
// We can have up to 16 attributes, each one consisting of 4 floats
constexpr u32 maxAttrSizeInFloats = 16 * 4;
// Base address for vertex attributes
// The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible
@ -321,8 +346,6 @@ void GPU::drawArrays() {
}
// Fill the remaining attribute lanes with default parameters (1.0 for alpha/w, 0.0) for everything else
// Corgi does this although I'm not sure if it's actually needed for anything.
// TODO: Find out
while (component < 4) {
attribute[component] = (component == 3) ? f24::fromFloat32(1.0) : f24::fromFloat32(0.0);
component++;
@ -336,13 +359,13 @@ void GPU::drawArrays() {
// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
// Based on the SH_ATTRIBUTES_PERMUTATION registers.
// Ie it might attribute #0 to v2, #1 to v7, etc
// Ie it might map attribute #0 to v2, #1 to v7, etc
for (int j = 0; j < totalAttribCount; j++) {
const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
}
if constexpr (useShaderJIT) {
if constexpr (mode == ShaderExecMode::JIT) {
shaderJIT.run(shaderUnit.vs);
} else {
shaderUnit.vs.run();

View file

@ -249,6 +249,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
// If we've reached 3 verts, issue a draw call
// Handle rendering depending on the primitive type
if (immediateModeVertIndex == 3) {
renderer->prepareForDraw(shaderUnit, nullptr);
renderer->drawVertices(PICA::PrimType::TriangleList, immediateModeVertices);
switch (primType) {
@ -300,7 +301,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
}
case VertexBoolUniform: {
shaderUnit.vs.boolUniform = value & 0xffff;
shaderUnit.vs.uploadBoolUniform(value & 0xffff);
break;
}

View file

@ -1,5 +1,10 @@
#include "PICA/shader_decompiler.hpp"
#include <fmt/format.h>
#include <array>
#include <cassert>
#include "config.hpp"
using namespace PICA;
@ -13,11 +18,45 @@ void ControlFlow::analyze(const PICAShader& shader, u32 entrypoint) {
analysisFailed = false;
const Function* function = addFunction(shader, entrypoint, PICAShader::maxInstructionCount);
if (function == nullptr) {
if (function == nullptr || function->exitMode != ExitMode::AlwaysEnd) {
analysisFailed = true;
}
}
// Helpers for merging parallel/series exit methods from Citra
// Combines the exit modes of two branches that execute in parallel (eg the two sides of a conditional).
// An Unknown mode defers to the other branch, two identical modes are kept as-is, and two differing
// known modes merge into Conditional.
static ExitMode exitParallel(ExitMode a, ExitMode b) {
	const bool firstKnown = a != ExitMode::Unknown;
	const bool secondKnown = b != ExitMode::Unknown;

	if (firstKnown && secondKnown) {
		return (a == b) ? a : ExitMode::Conditional;
	}

	// At least one side is unknown: take whichever side is known, or stay Unknown if neither is
	return firstKnown ? a : b;
}
// Cascades the exit modes of two blocks of code that run one after the other, where "a" runs first and "b" second.
// - If the first block's mode is Unknown, the result is Unknown.
// - If the first block always returns, control always reaches the second block, so the result is b.
// - Otherwise the first block may end the shader; the result is AlwaysEnd if the second block is
//   Unknown or AlwaysEnd, and Conditional in every other case.
// Callers are expected to handle a == AlwaysEnd themselves before evaluating the second block.
static ExitMode exitSeries(ExitMode a, ExitMode b) {
	assert(a != ExitMode::AlwaysEnd);

	// The assert above compiles to nothing when NDEBUG is defined. Without this explicit check, an always-ending
	// first block followed by an AlwaysReturn/Conditional block would fall through and be reported as Conditional,
	// even though nothing after a block that always ends the shader can ever execute.
	if (a == ExitMode::AlwaysEnd) {
		return ExitMode::AlwaysEnd;
	}

	if (a == ExitMode::Unknown) {
		return ExitMode::Unknown;
	}

	if (a == ExitMode::AlwaysReturn) {
		return b;
	}

	if (b == ExitMode::Unknown || b == ExitMode::AlwaysEnd) {
		return ExitMode::AlwaysEnd;
	}

	return ExitMode::Conditional;
}
ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 end, Function::Labels& labels) {
// Initialize exit mode to unknown by default, in order to detect things like unending loops
auto [it, inserted] = exitMap.emplace(AddressRange(start, end), ExitMode::Unknown);
@ -32,25 +71,132 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
const u32 opcode = instruction >> 26;
switch (opcode) {
case ShaderOpcodes::JMPC: Helpers::panic("Unimplemented control flow operation (JMPC)");
case ShaderOpcodes::JMPU: Helpers::panic("Unimplemented control flow operation (JMPU)");
case ShaderOpcodes::IFU: Helpers::panic("Unimplemented control flow operation (IFU)");
case ShaderOpcodes::IFC: Helpers::panic("Unimplemented control flow operation (IFC)");
case ShaderOpcodes::CALL: Helpers::panic("Unimplemented control flow operation (CALL)");
case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)");
case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)");
case ShaderOpcodes::LOOP: Helpers::panic("Unimplemented control flow operation (LOOP)");
case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second;
case ShaderOpcodes::JMPC:
case ShaderOpcodes::JMPU: {
const u32 dest = getBits<10, 12>(instruction);
// Register this jump address to our outLabels set
labels.insert(dest);
// This opens up 2 parallel paths of execution
auto branchTakenExit = analyzeFunction(shader, dest, end, labels);
auto branchNotTakenExit = analyzeFunction(shader, pc + 1, end, labels);
it->second = exitParallel(branchTakenExit, branchNotTakenExit);
return it->second;
}
case ShaderOpcodes::IFU:
case ShaderOpcodes::IFC: {
const u32 num = instruction & 0xff;
const u32 dest = getBits<10, 12>(instruction);
const Function* branchTakenFunc = addFunction(shader, pc + 1, dest);
// Check if analysis of the branch taken func failed and return unknown if it did
if (analysisFailed) {
it->second = ExitMode::Unknown;
return it->second;
}
// Next analyze the not taken func
ExitMode branchNotTakenExitMode = ExitMode::AlwaysReturn;
if (num != 0) {
const Function* branchNotTakenFunc = addFunction(shader, dest, dest + num);
// Check if analysis failed and return unknown if it did
if (analysisFailed) {
it->second = ExitMode::Unknown;
return it->second;
}
branchNotTakenExitMode = branchNotTakenFunc->exitMode;
}
auto parallel = exitParallel(branchTakenFunc->exitMode, branchNotTakenExitMode);
// Both branches of the if/else end, so there's nothing after the call
if (parallel == ExitMode::AlwaysEnd) {
it->second = parallel;
return it->second;
} else {
ExitMode afterConditional = analyzeFunction(shader, dest + num, end, labels);
ExitMode conditionalExitMode = exitSeries(parallel, afterConditional);
it->second = conditionalExitMode;
return it->second;
}
break;
}
case ShaderOpcodes::CALL: {
const u32 num = instruction & 0xff;
const u32 dest = getBits<10, 12>(instruction);
const Function* calledFunction = addFunction(shader, dest, dest + num);
// Check if analysis of the called function failed and return unknown if it did
if (analysisFailed) {
it->second = ExitMode::Unknown;
return it->second;
}
if (calledFunction->exitMode == ExitMode::AlwaysEnd) {
it->second = ExitMode::AlwaysEnd;
return it->second;
}
// Exit mode of the remainder of this function, after we return from the callee
const ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels);
const ExitMode exitMode = exitSeries(calledFunction->exitMode, postCallExitMode);
it->second = exitMode;
return exitMode;
}
case ShaderOpcodes::CALLC:
case ShaderOpcodes::CALLU: {
const u32 num = instruction & 0xff;
const u32 dest = getBits<10, 12>(instruction);
const Function* calledFunction = addFunction(shader, dest, dest + num);
// Check if analysis of the called function failed and return unknown if it did
if (analysisFailed) {
it->second = ExitMode::Unknown;
return it->second;
}
// Exit mode of the remainder of this function, after we return from the callee
const ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels);
const ExitMode exitMode = exitSeries(exitParallel(calledFunction->exitMode, ExitMode::AlwaysReturn), postCallExitMode);
it->second = exitMode;
return exitMode;
}
case ShaderOpcodes::LOOP: {
u32 dest = getBits<10, 12>(instruction);
const Function* loopFunction = addFunction(shader, pc + 1, dest + 1);
if (analysisFailed) {
it->second = ExitMode::Unknown;
return it->second;
}
if (loopFunction->exitMode == ExitMode::AlwaysEnd) {
it->second = ExitMode::AlwaysEnd;
return it->second;
}
const ExitMode afterLoop = analyzeFunction(shader, dest + 1, end, labels);
const ExitMode exitMode = exitSeries(loopFunction->exitMode, afterLoop);
it->second = exitMode;
return it->second;
}
case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second;
default: break;
}
}
// A function without control flow instructions will always reach its "return point" and return
return ExitMode::AlwaysReturn;
it->second = ExitMode::AlwaysReturn;
return it->second;
}
void ShaderDecompiler::compileRange(const AddressRange& range) {
std::pair<u32, bool> ShaderDecompiler::compileRange(const AddressRange& range) {
u32 pc = range.start;
const u32 end = range.end >= range.start ? range.end : PICAShader::maxInstructionCount;
bool finished = false;
@ -58,6 +204,8 @@ void ShaderDecompiler::compileRange(const AddressRange& range) {
while (pc < end && !finished) {
compileInstruction(pc, finished);
}
return std::make_pair(pc, finished);
}
const Function* ShaderDecompiler::findFunction(const AddressRange& range) {
@ -71,20 +219,43 @@ const Function* ShaderDecompiler::findFunction(const AddressRange& range) {
}
void ShaderDecompiler::writeAttributes() {
// Annoyingly, GLES does not support having an array as an input attribute, so declare each attribute separately for now
decompiledShader += R"(
layout(location = 0) in vec4 inputs[8];
layout(location = 0) in vec4 attr0;
layout(location = 1) in vec4 attr1;
layout(location = 2) in vec4 attr2;
layout(location = 3) in vec4 attr3;
layout(location = 4) in vec4 attr4;
layout(location = 5) in vec4 attr5;
layout(location = 6) in vec4 attr6;
layout(location = 7) in vec4 attr7;
layout(location = 8) in vec4 attr8;
layout(location = 9) in vec4 attr9;
layout(location = 10) in vec4 attr10;
layout(location = 11) in vec4 attr11;
layout(location = 12) in vec4 attr12;
layout(location = 13) in vec4 attr13;
layout(location = 14) in vec4 attr14;
layout(location = 15) in vec4 attr15;
layout(std140) uniform PICAShaderUniforms {
vec4 uniform_float[96];
uvec4 uniform_int;
uint uniform_bool;
};
vec4 temp_registers[16];
vec4 dummy_vec = vec4(0.0);
layout(std140) uniform PICAShaderUniforms {
vec4 uniform_f[96];
uvec4 uniform_i;
uint uniform_bool;
};
vec4 temp[16];
vec4 out_regs[16];
vec4 dummy_vec = vec4(0.0);
ivec3 addr_reg = ivec3(0);
bvec2 cmp_reg = bvec2(false);
vec4 uniform_indexed(int source, int offset) {
int clipped_offs = (offset >= -128 && offset <= 127) ? offset : 0;
uint index = uint(clipped_offs + source) & 127u;
return (index < 96u) ? uniform_f[index] : vec4(1.0);
}
)";
decompiledShader += "\n";
}
std::string ShaderDecompiler::decompile() {
@ -94,11 +265,14 @@ std::string ShaderDecompiler::decompile() {
return "";
}
decompiledShader = "";
compilationError = false;
decompiledShader.clear();
// Reserve some memory for the shader string to avoid memory allocations
decompiledShader.reserve(256 * 1024);
switch (api) {
case API::GL: decompiledShader += "#version 410 core\n"; break;
case API::GLES: decompiledShader += "#version 300 es\n"; break;
case API::GLES: decompiledShader += "#version 300 es\nprecision mediump float;\nprecision mediump int;\n"; break;
default: break;
}
@ -109,7 +283,7 @@ std::string ShaderDecompiler::decompile() {
decompiledShader += R"(
vec4 safe_mul(vec4 a, vec4 b) {
vec4 res = a * b;
return mix(res, mix(mix(vec4(0.0), res, isnan(rhs)), product, isnan(lhs)), isnan(res));
return mix(res, mix(mix(vec4(0.0), res, isnan(b)), res, isnan(a)), isnan(res));
}
)";
}
@ -121,17 +295,61 @@ std::string ShaderDecompiler::decompile() {
decompiledShader += "void pica_shader_main() {\n";
AddressRange mainFunctionRange(entrypoint, PICAShader::maxInstructionCount);
callFunction(*findFunction(mainFunctionRange));
decompiledShader += "}\n";
auto mainFunc = findFunction(mainFunctionRange);
for (auto& func : controlFlow.functions) {
if (func.outLabels.size() > 0) {
Helpers::panic("Function with out labels");
decompiledShader += mainFunc->getCallStatement() + ";\n}\n";
for (const Function& func : controlFlow.functions) {
if (func.outLabels.empty()) {
decompiledShader += fmt::format("bool {}() {{\n", func.getIdentifier());
auto [pc, finished] = compileRange(AddressRange(func.start, func.end));
if (!finished) {
decompiledShader += "return false;";
}
decompiledShader += "}\n";
} else {
auto labels = func.outLabels;
labels.insert(func.start);
// If a function has jumps and "labels", this needs to be emulated using a switch-case, with the variable being switched on being the
// current PC
decompiledShader += fmt::format("bool {}() {{\n", func.getIdentifier());
decompiledShader += fmt::format("uint pc = {}u;\n", func.start);
decompiledShader += "while(true){\nswitch(pc){\n";
for (u32 label : labels) {
decompiledShader += fmt::format("case {}u: {{", label);
// Fetch the next label whose address > label
auto it = labels.lower_bound(label + 1);
u32 next = (it == labels.end()) ? func.end : *it;
auto [endPC, finished] = compileRange(AddressRange(label, next));
if (endPC > next && !finished) {
labels.insert(endPC);
decompiledShader += fmt::format("pc = {}u; break;", endPC);
}
// Fallthrough to next label
decompiledShader += "}\n";
}
decompiledShader += "default: return false;\n";
// Exit the switch and loop
decompiledShader += "} }\n";
// Exit the function
decompiledShader += "return false;\n";
decompiledShader += "}\n";
}
}
decompiledShader += "void " + func.getIdentifier() + "() {\n";
compileRange(AddressRange(func.start, func.end));
decompiledShader += "}\n";
// We allow some leeway for "compilation errors" in addition to control flow errors, in cases where eg an unimplemented instruction
// or an instruction that we can't emulate in GLSL is found in the instruction stream. Just like control flow errors, these return an empty string
// and the renderer core will decide to use CPU shaders instead
if (compilationError) [[unlikely]] {
return "";
}
return decompiledShader;
@ -139,30 +357,41 @@ std::string ShaderDecompiler::decompile() {
std::string ShaderDecompiler::getSource(u32 source, [[maybe_unused]] u32 index) const {
if (source < 0x10) {
return "inputs[" + std::to_string(source) + "]";
return "attr" + std::to_string(source);
} else if (source < 0x20) {
return "temp_registers[" + std::to_string(source - 0x10) + "]";
return "temp[" + std::to_string(source - 0x10) + "]";
} else {
const usize floatIndex = (source - 0x20) & 0x7f;
if (floatIndex >= 96) [[unlikely]] {
return "dummy_vec";
if (index == 0) {
if (floatIndex >= 96) [[unlikely]] {
return "dummy_vec";
}
return "uniform_f[" + std::to_string(floatIndex) + "]";
} else {
static constexpr std::array<const char*, 4> offsets = {"0", "addr_reg.x", "addr_reg.y", "addr_reg.z"};
return fmt::format("uniform_indexed({}, {})", floatIndex, offsets[index]);
}
return "uniform_float[" + std::to_string(floatIndex) + "]";
}
}
std::string ShaderDecompiler::getDest(u32 dest) const {
if (dest < 0x10) {
return "output_registers[" + std::to_string(dest) + "]";
return "out_regs[" + std::to_string(dest) + "]";
} else if (dest < 0x20) {
return "temp_registers[" + std::to_string(dest - 0x10) + "]";
return "temp[" + std::to_string(dest - 0x10) + "]";
} else {
return "dummy_vec";
}
}
std::string ShaderDecompiler::getSwizzlePattern(u32 swizzle) const {
// If the swizzle field is this value then the swizzle pattern is .xyzw so we don't need a shuffle
static constexpr uint noSwizzle = 0x1B;
if (swizzle == noSwizzle) {
return "";
}
static constexpr std::array<char, 4> names = {'x', 'y', 'z', 'w'};
std::string ret(". ");
@ -176,7 +405,6 @@ std::string ShaderDecompiler::getSwizzlePattern(u32 swizzle) const {
std::string ShaderDecompiler::getDestSwizzle(u32 destinationMask) const {
std::string ret = ".";
if (destinationMask & 0b1000) {
ret += "x";
}
@ -208,11 +436,12 @@ void ShaderDecompiler::setDest(u32 operandDescriptor, const std::string& dest, c
return;
}
decompiledShader += dest + destSwizzle + " = ";
if (writtenLaneCount == 1) {
decompiledShader += "float(" + value + ");\n";
} else {
decompiledShader += "vec" + std::to_string(writtenLaneCount) + "(" + value + ");\n";
// Don't write destination swizzle if all lanes are getting written to
decompiledShader += fmt::format("{}{} = ", dest, writtenLaneCount == 4 ? "" : destSwizzle);
if (writtenLaneCount <= 3) {
decompiledShader += fmt::format("({}){};\n", value, destSwizzle);
} else if (writtenLaneCount == 4) {
decompiledShader += fmt::format("{};\n", value);
}
}
@ -246,26 +475,101 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
std::string dest = getDest(destIndex);
if (idx != 0) {
Helpers::panic("GLSL recompiler: Indexed instruction");
}
if (invertSources) {
Helpers::panic("GLSL recompiler: Inverted instruction");
}
switch (opcode) {
case ShaderOpcodes::MOV: setDest(operandDescriptor, dest, src1); break;
case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, src1 + " + " + src2); break;
case ShaderOpcodes::MUL: setDest(operandDescriptor, dest, src1 + " * " + src2); break;
case ShaderOpcodes::MAX: setDest(operandDescriptor, dest, "max(" + src1 + ", " + src2 + ")"); break;
case ShaderOpcodes::MIN: setDest(operandDescriptor, dest, "min(" + src1 + ", " + src2 + ")"); break;
case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, fmt::format("{} + {}", src1, src2)); break;
case ShaderOpcodes::MUL:
if (!config.accurateShaderMul) {
setDest(operandDescriptor, dest, fmt::format("{} * {}", src1, src2));
} else {
setDest(operandDescriptor, dest, fmt::format("safe_mul({}, {})", src1, src2));
}
break;
case ShaderOpcodes::MAX: setDest(operandDescriptor, dest, fmt::format("max({}, {})", src1, src2)); break;
case ShaderOpcodes::MIN: setDest(operandDescriptor, dest, fmt::format("min({}, {})", src1, src2)); break;
case ShaderOpcodes::DP3: setDest(operandDescriptor, dest, "vec4(dot(" + src1 + ".xyz, " + src2 + ".xyz))"); break;
case ShaderOpcodes::DP4: setDest(operandDescriptor, dest, "vec4(dot(" + src1 + ", " + src2 + "))"); break;
case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, "vec4(inversesqrt(" + src1 + ".x))"); break;
case ShaderOpcodes::DP3:
if (!config.accurateShaderMul) {
setDest(operandDescriptor, dest, fmt::format("vec4(dot({}.xyz, {}.xyz))", src1, src2));
} else {
// A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec3(1.0)
setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul({}, {}).xyz, vec3(1.0)))", src1, src2));
}
break;
case ShaderOpcodes::DP4:
if (!config.accurateShaderMul) {
setDest(operandDescriptor, dest, fmt::format("vec4(dot({}, {}))", src1, src2));
} else {
// A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec4(1.0)
setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul({}, {}), vec4(1.0)))", src1, src2));
}
break;
case ShaderOpcodes::FLR: setDest(operandDescriptor, dest, fmt::format("floor({})", src1)); break;
case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, fmt::format("vec4(inversesqrt({}.x))", src1)); break;
case ShaderOpcodes::RCP: setDest(operandDescriptor, dest, fmt::format("vec4(1.0 / {}.x)", src1)); break;
case ShaderOpcodes::LG2: setDest(operandDescriptor, dest, fmt::format("vec4(log2({}.x))", src1)); break;
case ShaderOpcodes::EX2: setDest(operandDescriptor, dest, fmt::format("vec4(exp2({}.x))", src1)); break;
default: Helpers::panic("GLSL recompiler: Unknown common opcode: %X", opcode); break;
case ShaderOpcodes::SLT:
case ShaderOpcodes::SLTI: setDest(operandDescriptor, dest, fmt::format("vec4(lessThan({}, {}))", src1, src2)); break;
case ShaderOpcodes::SGE:
case ShaderOpcodes::SGEI: setDest(operandDescriptor, dest, fmt::format("vec4(greaterThanEqual({}, {}))", src1, src2)); break;
case ShaderOpcodes::DPH:
case ShaderOpcodes::DPHI:
if (!config.accurateShaderMul) {
setDest(operandDescriptor, dest, fmt::format("vec4(dot(vec4({}.xyz, 1.0), {}))", src1, src2));
} else {
// A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec4(1.0)
setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul(vec4({}.xyz, 1.0), {}), vec4(1.0)))", src1, src2));
}
break;
case ShaderOpcodes::CMP1:
case ShaderOpcodes::CMP2: {
static constexpr std::array<const char*, 8> operators = {
// The last 2 operators always return true and are handled specially
"==", "!=", "<", "<=", ">", ">=", "", "",
};
const u32 cmpY = getBits<21, 3>(instruction);
const u32 cmpX = getBits<24, 3>(instruction);
// Compare x first
if (cmpX >= 6) {
decompiledShader += "cmp_reg.x = true;\n";
} else {
decompiledShader += fmt::format("cmp_reg.x = {}.x {} {}.x;\n", src1, operators[cmpX], src2);
}
// Then compare Y
if (cmpY >= 6) {
decompiledShader += "cmp_reg.y = true;\n";
} else {
decompiledShader += fmt::format("cmp_reg.y = {}.y {} {}.y;\n", src1, operators[cmpY], src2);
}
break;
}
case ShaderOpcodes::MOVA: {
const bool writeX = getBit<3>(operandDescriptor); // Should we write the x component of the address register?
const bool writeY = getBit<2>(operandDescriptor);
if (writeX && writeY) {
decompiledShader += fmt::format("addr_reg.xy = ivec2({}.xy);\n", src1);
} else if (writeX) {
decompiledShader += fmt::format("addr_reg.x = int({}.x);\n", src1);
} else if (writeY) {
decompiledShader += fmt::format("addr_reg.y = int({}.y);\n", src1);
}
break;
}
default:
Helpers::warn("GLSL recompiler: Unknown common opcode: %02X. Falling back to CPU shaders", opcode);
compilationError = true;
break;
}
} else if (opcode >= 0x30 && opcode <= 0x3F) { // MAD and MADI
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x1f];
@ -299,23 +603,156 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
src3 += getSwizzlePattern(swizzle3);
std::string dest = getDest(destIndex);
if (idx != 0) {
Helpers::panic("GLSL recompiler: Indexed instruction");
if (!config.accurateShaderMul) {
setDest(operandDescriptor, dest, fmt::format("{} * {} + {}", src1, src2, src3));
} else {
setDest(operandDescriptor, dest, fmt::format("safe_mul({}, {}) + {}", src1, src2, src3));
}
setDest(operandDescriptor, dest, src1 + " * " + src2 + " + " + src3);
} else {
switch (opcode) {
case ShaderOpcodes::END: finished = true; return;
default: Helpers::panic("GLSL recompiler: Unknown opcode: %X", opcode); break;
case ShaderOpcodes::JMPC: {
const u32 dest = getBits<10, 12>(instruction);
const u32 condOp = getBits<22, 2>(instruction);
const uint refY = getBit<24>(instruction);
const uint refX = getBit<25>(instruction);
const char* condition = getCondition(condOp, refX, refY);
decompiledShader += fmt::format("if ({}) {{ pc = {}u; break; }}\n", condition, dest);
break;
}
case ShaderOpcodes::JMPU: {
const u32 dest = getBits<10, 12>(instruction);
const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check
const u32 mask = 1u << bit;
const u32 test = (instruction & 1) ^ 1; // If the LSB is 0 we jump if bit = 1, otherwise 0
decompiledShader += fmt::format("if ((uniform_bool & {}u) {} 0u) {{ pc = {}u; break; }}\n", mask, (test != 0) ? "!=" : "==", dest);
break;
}
case ShaderOpcodes::IFU:
case ShaderOpcodes::IFC: {
const u32 num = instruction & 0xff;
const u32 dest = getBits<10, 12>(instruction);
const Function* conditionalFunc = findFunction(AddressRange(pc + 1, dest));
if (opcode == ShaderOpcodes::IFC) {
const u32 condOp = getBits<22, 2>(instruction);
const uint refY = getBit<24>(instruction);
const uint refX = getBit<25>(instruction);
const char* condition = getCondition(condOp, refX, refY);
decompiledShader += fmt::format("if ({}) {{", condition);
} else {
const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check
const u32 mask = 1u << bit;
decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{", mask);
}
callFunction(*conditionalFunc);
decompiledShader += "}\n";
pc = dest;
if (num > 0) {
const Function* elseFunc = findFunction(AddressRange(dest, dest + num));
pc = dest + num;
decompiledShader += "else { ";
callFunction(*elseFunc);
decompiledShader += "}\n";
if (conditionalFunc->exitMode == ExitMode::AlwaysEnd && elseFunc->exitMode == ExitMode::AlwaysEnd) {
finished = true;
return;
}
}
return;
}
case ShaderOpcodes::CALL:
case ShaderOpcodes::CALLC:
case ShaderOpcodes::CALLU: {
const u32 num = instruction & 0xff;
const u32 dest = getBits<10, 12>(instruction);
const Function* calledFunc = findFunction(AddressRange(dest, dest + num));
// Handle conditions for CALLC/CALLU
if (opcode == ShaderOpcodes::CALLC) {
const u32 condOp = getBits<22, 2>(instruction);
const uint refY = getBit<24>(instruction);
const uint refX = getBit<25>(instruction);
const char* condition = getCondition(condOp, refX, refY);
decompiledShader += fmt::format("if ({}) {{", condition);
} else if (opcode == ShaderOpcodes::CALLU) {
const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check
const u32 mask = 1u << bit;
decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{", mask);
}
callFunction(*calledFunc);
// Close brackets for CALLC/CALLU
if (opcode != ShaderOpcodes::CALL) {
decompiledShader += "}";
}
if (opcode == ShaderOpcodes::CALL && calledFunc->exitMode == ExitMode::AlwaysEnd) {
finished = true;
return;
}
break;
}
case ShaderOpcodes::LOOP: {
const u32 dest = getBits<10, 12>(instruction);
const u32 uniformIndex = getBits<22, 2>(instruction);
// loop counter = uniform.y
decompiledShader += fmt::format("addr_reg.z = int((uniform_i[{}] >> 8u) & 0xFFu);\n", uniformIndex);
decompiledShader += fmt::format(
"for (uint loopCtr{} = 0u; loopCtr{} <= (uniform_i[{}] & 0xFFu); loopCtr{}++, addr_reg.z += int((uniform_i[{}] >> "
"16u) & 0xFFu)) {{\n",
pc, pc, uniformIndex, pc, uniformIndex
);
AddressRange range(pc + 1, dest + 1);
const Function* func = findFunction(range);
callFunction(*func);
decompiledShader += "}\n";
// Jump to the end of the loop. We don't want to compile the code inside the loop again.
// This will be incremented by 1 due to the pc++ at the end of this loop.
pc = dest;
if (func->exitMode == ExitMode::AlwaysEnd) {
finished = true;
return;
}
break;
}
case ShaderOpcodes::END:
decompiledShader += "return true;\n";
finished = true;
return;
case ShaderOpcodes::NOP: break;
default:
Helpers::warn("GLSL recompiler: Unknown opcode: %02X. Falling back to CPU shaders", opcode);
compilationError = true;
break;
}
}
pc++;
}
bool ShaderDecompiler::usesCommonEncoding(u32 instruction) const {
const u32 opcode = instruction >> 26;
switch (opcode) {
@ -339,16 +776,57 @@ bool ShaderDecompiler::usesCommonEncoding(u32 instruction) const {
case ShaderOpcodes::SLT:
case ShaderOpcodes::SLTI:
case ShaderOpcodes::SGE:
case ShaderOpcodes::SGEI: return true;
case ShaderOpcodes::SGEI:
case ShaderOpcodes::LITP: return true;
default: return false;
}
}
void ShaderDecompiler::callFunction(const Function& function) { decompiledShader += function.getCallStatement() + ";\n"; }
// Appends a call to the given decompiled function to the shader being generated.
// How the call is emitted depends on the function's exit mode, so that an "end of shader" can propagate to the caller
void ShaderDecompiler::callFunction(const Function& function) {
	const auto call = function.getCallStatement();

	if (function.exitMode == ExitMode::AlwaysEnd) {
		// The callee always ends the shader: call it, then return true to signal that we're ending as well
		decompiledShader += call + ";\nreturn true;\n";
	} else if (function.exitMode == ExitMode::Conditional) {
		// The callee may or may not end the shader: check what it returned and only end if it did
		decompiledShader += fmt::format("if ({}) {{ return true; }}\n", call);
	} else {
		// The callee does not end the shader, so it's emitted as a plain function call
		decompiledShader += call + ";\n";
	}
}
// Convenience wrapper: builds a ShaderDecompiler from the given arguments and returns the result of its decompile() call
std::string ShaderGen::decompileShader(PICAShader& shader, EmulatorConfig& config, u32 entrypoint, API api, Language language) {
	return ShaderDecompiler(shader, config, entrypoint, api, language).decompile();
}
// Returns the GLSL boolean expression for a conditional instruction's condition.
// The expression is looked up by the condition operation (low 2 bits of cond) and the two reference bits:
// within each row below, the entries are in order of condition operation 0, 1, 2, 3,
// and the row is selected by refX (bit 0 of the row number) and refY (bit 1 of the row number)
const char* ShaderDecompiler::getCondition(u32 cond, u32 refX, u32 refY) {
	static constexpr const char* expressions[4][4] = {
		// ref(Y, X) = (0, 0)
		{"!all(cmp_reg)", "all(not(cmp_reg))", "!cmp_reg.x", "!cmp_reg.y"},
		// ref(Y, X) = (0, 1)
		{"cmp_reg.x || !cmp_reg.y", "cmp_reg.x && !cmp_reg.y", "cmp_reg.x", "!cmp_reg.y"},
		// ref(Y, X) = (1, 0)
		{"!cmp_reg.x || cmp_reg.y", "!cmp_reg.x && cmp_reg.y", "!cmp_reg.x", "cmp_reg.y"},
		// ref(Y, X) = (1, 1)
		{"any(cmp_reg)", "all(cmp_reg)", "cmp_reg.x", "cmp_reg.y"},
	};

	// Pack everything into a single key first, then split it into (row, column)
	const u32 packed = (cond & 0b11) | (refX << 2) | (refY << 3);
	const u32 row = packed >> 2;
	const u32 column = packed & 0b11;

	return expressions[row][column];
}

View file

@ -1,6 +1,14 @@
#include <fmt/format.h>
#include <utility>
#include "PICA/pica_frag_config.hpp"
#include "PICA/regs.hpp"
#include "PICA/shader_gen.hpp"
// We can include the driver headers here since they shouldn't have any actual API-specific code
#include "renderer_gl/gl_driver.hpp"
using namespace PICA;
using namespace PICA::ShaderGen;
@ -34,6 +42,8 @@ static constexpr const char* uniformDefinition = R"(
std::string FragmentGenerator::getDefaultVertexShader() {
std::string ret = "";
// Reserve some space (128KB) in the output string to avoid too many allocations later
ret.reserve(128 * 1024);
switch (api) {
case API::GL: ret += "#version 410 core"; break;
@ -94,7 +104,7 @@ std::string FragmentGenerator::getDefaultVertexShader() {
return ret;
}
std::string FragmentGenerator::generate(const FragmentConfig& config) {
std::string FragmentGenerator::generate(const FragmentConfig& config, void* driverInfo) {
std::string ret = "";
switch (api) {
@ -103,6 +113,27 @@ std::string FragmentGenerator::generate(const FragmentConfig& config) {
default: break;
}
// For GLES we need to enable & use the framebuffer fetch extension in order to emulate logic ops
bool emitLogicOps = api == API::GLES && config.outConfig.logicOpMode != PICA::LogicOpMode::Copy && driverInfo != nullptr;
if (emitLogicOps) {
auto driver = static_cast<OpenGL::Driver*>(driverInfo);
// If the driver does not support framebuffer fetch at all, don't emit logic op code
if (!driver->supportFbFetch()) {
emitLogicOps = false;
}
// Figure out which fb fetch extension we have and enable it
else {
if (driver->supportsExtFbFetch) {
ret += "\n#extension GL_EXT_shader_framebuffer_fetch : enable\n#define fb_color fragColor\n";
} else if (driver->supportsArmFbFetch) {
ret += "\n#extension GL_ARM_shader_framebuffer_fetch : enable\n#define fb_color gl_LastFragColorARM[0]\n";
}
}
}
bool unimplementedFlag = false;
if (api == API::GLES) {
ret += R"(
@ -192,10 +223,13 @@ std::string FragmentGenerator::generate(const FragmentConfig& config) {
}
compileFog(ret, config);
applyAlphaTest(ret, config);
ret += "fragColor = combinerOutput;\n}"; // End of main function
if (!emitLogicOps) {
ret += "fragColor = combinerOutput;\n}"; // End of main function
} else {
compileLogicOps(ret, config);
}
return ret;
}
@ -671,3 +705,135 @@ void FragmentGenerator::compileFog(std::string& shader, const PICA::FragmentConf
shader += "float fog_factor = clamp(value.r + value.g * delta, 0.0, 1.0);";
shader += "combinerOutput.rgb = mix(fog_color, combinerOutput.rgb, fog_factor);";
}
// Wraps a decompiled PICA vertex shader (`picaSource`) into a complete GLSL vertex shader.
// The generated main() calls pica_shader_main(), then copies the PICA output registers into the fixed-function
// semantics (position, quaternion, colour, texture coordinates, view) according to the output map in `vertConfig`.
// The ubershader path is not implemented: it panics and hands back `picaSource` unchanged.
std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader) {
	if (usingUbershader) {
		Helpers::panic("Unimplemented: GetVertexShaderAccelerated for ubershader");
		return picaSource;
	}

	// For each of the 32 fixed-function semantics (8 variables with 4 lanes each), which output register (first)
	// and which lane of that register (second) provides its value
	std::array<std::pair<int, int>, 32> semanticSources{};

	// Output registers after applying VS_OUTPUT_MASK, which enables and disables output attributes
	std::array<u8, 16> enabledOutputRegs;
	uint enabledCount = 0;
	const u16 outputMask = vertConfig.outputMask;

	// Collect the registers whose mask bit is set, in ascending order
	for (int reg = 0; reg < 16; reg++) {
		if ((outputMask >> reg) & 1) {
			enabledOutputRegs[enabledCount++] = reg;
		}
	}

	// Remaining slots map the index to a vs output directly (TODO: What does hw actually do?)
	while (enabledCount < 16) {
		enabledOutputRegs[enabledCount] = enabledCount;
		enabledCount++;
	}

	// Every outmap word packs 4 semantic indices, one per byte; byte N describes lane N of the register
	for (int i = 0; i < vertConfig.outputCount; i++) {
		const u32 outmap = vertConfig.outmaps[i];

		for (int lane = 0; lane < 4; lane++) {
			const u32 semantic = (outmap >> (lane * 8)) & 0x1F;
			semanticSources[semantic] = std::pair<int, int>(enabledOutputRegs[i], lane);
		}
	}

	// Builds the GLSL expression that reads the register lane backing a semantic
	auto sem = [&semanticSources](u32 semanticIndex) {
		const auto& [reg, lane] = semanticSources[semanticIndex];
		return fmt::format("out_regs[{}][{}]", reg, lane);
	};

	const std::string semanticAssignments = fmt::format(
		R"(
vec4 a_coords = vec4({}, {}, {}, {});
vec4 a_quaternion = vec4({}, {}, {}, {});
vec4 a_vertexColour = vec4({}, {}, {}, {});
vec2 a_texcoord0 = vec2({}, {});
float a_texcoord0_w = {};
vec2 a_texcoord1 = vec2({}, {});
vec2 a_texcoord2 = vec2({}, {});
vec3 a_view = vec3({}, {}, {});
)",
		sem(0), sem(1), sem(2), sem(3), sem(4), sem(5), sem(6), sem(7), sem(8), sem(9), sem(10), sem(11), sem(12), sem(13), sem(16), sem(14),
		sem(15), sem(22), sem(23), sem(18), sem(19), sem(20)
	);

	// TODO: Uniforms and don't hardcode fixed-function semantic indices...
	std::string shaderSource = picaSource;
	if (api == API::GLES) {
		shaderSource.append("\n#define USING_GLES\n");
	}

	shaderSource.append(uniformDefinition);
	shaderSource.append(R"(
out vec4 v_quaternion;
out vec4 v_colour;
out vec3 v_texcoord0;
out vec2 v_texcoord1;
out vec3 v_view;
out vec2 v_texcoord2;
#ifndef USING_GLES
out float gl_ClipDistance[2];
#endif
void main() {
pica_shader_main();
)");

	// Transfer fixed function fragment registers from vertex shader output to the fragment shader
	shaderSource.append(semanticAssignments);
	shaderSource.append(R"(
gl_Position = a_coords;
vec4 colourAbs = abs(a_vertexColour);
v_colour = min(colourAbs, vec4(1.f));
v_texcoord0 = vec3(a_texcoord0.x, 1.0 - a_texcoord0.y, a_texcoord0_w);
v_texcoord1 = vec2(a_texcoord1.x, 1.0 - a_texcoord1.y);
v_texcoord2 = vec2(a_texcoord2.x, 1.0 - a_texcoord2.y);
v_view = a_view;
v_quaternion = a_quaternion;
#ifndef USING_GLES
gl_ClipDistance[0] = -a_coords.z;
gl_ClipDistance[1] = dot(clipCoords, a_coords);
#endif
})");

	return shaderSource;
}
// Emits the final `fragColor = ...;` assignment and the closing brace of the fragment shader's main(),
// choosing the written value from the configured logic op mode.
// Only meaningful for GLES; for any other API a warning is logged and the plain combiner output is written.
// Modes that reference `fb_color` rely on that macro having been defined earlier in the shader source.
void FragmentGenerator::compileLogicOps(std::string& shader, const PICA::FragmentConfig& config) {
	if (api != API::GLES) [[unlikely]] {
		Helpers::warn("Shadergen: Unsupported API for compileLogicOps");
		shader += "fragColor = combinerOutput;\n}";  // End of main function
		return;
	}

	// GLSL expression that ends up in fragColor
	const char* outputExpression = nullptr;

	switch (config.outConfig.logicOpMode) {
		case PICA::LogicOpMode::Copy: outputExpression = "combinerOutput"; break;
		case PICA::LogicOpMode::Nop: outputExpression = "fb_color"; break;
		case PICA::LogicOpMode::Clear: outputExpression = "vec4(0.0)"; break;
		case PICA::LogicOpMode::Set: outputExpression = "vec4(1.0)"; break;
		case PICA::LogicOpMode::InvertedCopy: outputExpression = "vec4(uvec4(combinerOutput * 255.0) ^ uvec4(0xFFu)) * (1.0 / 255.0)"; break;

		default:
			// Unhandled modes behave like a plain copy
			Helpers::warn("Shadergen: Unimplemented logic op mode");
			outputExpression = "combinerOutput";
			break;
	}

	shader += "fragColor = ";
	shader += outputExpression;
	shader += ";\n}";  // End of main function
}

View file

@ -34,4 +34,5 @@ void PICAShader::reset() {
codeHashDirty = true;
opdescHashDirty = true;
uniformsDirty = true;
}

View file

@ -76,6 +76,7 @@ namespace Audio {
source.reset();
}
mixer.reset();
// Note: Reset audio pipe AFTER resetting all pipes, otherwise the new data will be yeeted
resetAudioPipe();
}
@ -250,6 +251,8 @@ namespace Audio {
source.isBufferIDDirty = false;
}
performMix(read, write);
}
void HLE_DSP::updateSourceConfig(Source& source, HLE::SourceConfiguration::Configuration& config, s16_le* adpcmCoefficients) {
@ -465,6 +468,50 @@ namespace Audio {
}
}
// Mixing stage of a DSP frame. Currently it only applies any dirty mixer settings found in `readRegion`
// and writes a stubbed DSP status into `writeRegion`; no audio is mixed yet.
void HLE_DSP::performMix(Audio::HLE::SharedMemory& readRegion, Audio::HLE::SharedMemory& writeRegion) {
	updateMixerConfig(readRegion);
	// TODO: Do the actual audio mixing

	// Stub the DSP status. It's unknown what the "unknown" field is but Citra sets it to 0, so we do too to be safe
	writeRegion.dspStatus.droppedFrames = 0;
	writeRegion.dspStatus.unknown = 0;
}
// Copies every mixer setting whose dirty flag is set from the shared-memory DSP configuration into `mixer`,
// then clears all dirty flags by zeroing `dirtyRaw`.
void HLE_DSP::updateMixerConfig(Audio::HLE::SharedMemory& sharedMem) {
	auto& dspConfig = sharedMem.dspConfiguration;

	// Nothing is flagged dirty, so there is nothing to copy
	if (dspConfig.dirtyRaw == 0) {
		return;
	}

	if (dspConfig.outputFormatDirty) {
		mixer.channelFormat = dspConfig.outputFormat;
	}

	// Volume slot 0 holds the master volume, slots 1 and 2 the two aux volumes
	if (dspConfig.masterVolumeDirty) {
		mixer.volumes[0] = dspConfig.masterVolume;
	}
	if (dspConfig.auxVolume0Dirty) {
		mixer.volumes[1] = dspConfig.auxVolumes[0];
	}
	if (dspConfig.auxVolume1Dirty) {
		mixer.volumes[2] = dspConfig.auxVolumes[1];
	}

	// Any non-zero enable value turns the corresponding aux stage on
	if (dspConfig.auxBusEnable0Dirty) {
		mixer.enableAuxStages[0] = dspConfig.auxBusEnable[0] != 0;
	}
	if (dspConfig.auxBusEnable1Dirty) {
		mixer.enableAuxStages[1] = dspConfig.auxBusEnable[1] != 0;
	}

	dspConfig.dirtyRaw = 0;
}
HLE_DSP::SampleBuffer HLE_DSP::decodePCM8(const u8* data, usize sampleCount, Source& source) {
SampleBuffer decodedSamples(sampleCount);
@ -585,7 +632,7 @@ namespace Audio {
AAC::Message response;
switch (request.command) {
case AAC::Command::EncodeDecode:
case AAC::Command::EncodeDecode: {
// Dummy response to stop games from hanging
response.resultCode = AAC::ResultCode::Success;
response.decodeResponse.channelCount = 2;
@ -596,10 +643,13 @@ namespace Audio {
response.command = request.command;
response.mode = request.mode;
// We've already got an AAC decoder but it's currently disabled until mixing & output is properly implemented
// TODO: Uncomment this when the time comes
// aacDecoder->decode(response, request, [this](u32 paddr) { return getPointerPhys<u8>(paddr); });
// TODO: Make this a toggle in config.toml. Currently we have it off by default until we finish the DSP mixer.
constexpr bool enableAAC = false;
if (enableAAC) {
aacDecoder->decode(response, request, [this](u32 paddr) { return getPointerPhys<u8>(paddr); });
}
break;
}
case AAC::Command::Init:
case AAC::Command::Shutdown:

View file

@ -136,7 +136,7 @@ void Kernel::mapMemoryBlock() {
break;
case KernelHandles::FontSharedMemHandle:
mem.copySharedFont(ptr);
mem.copySharedFont(ptr, addr);
break;
case KernelHandles::CSNDSharedMemHandle:

View file

@ -7,6 +7,7 @@
#include "config_mem.hpp"
#include "resource_limits.hpp"
#include "services/fonts.hpp"
#include "services/ptm.hpp"
CMRC_DECLARE(ConsoleFonts);
@ -51,7 +52,7 @@ void Memory::reset() {
if (e.handle == KernelHandles::FontSharedMemHandle) {
// Read font size from the cmrc filesystem the font is stored in
auto fonts = cmrc::ConsoleFonts::get_filesystem();
e.size = fonts.open("CitraSharedFontUSRelocated.bin").size();
e.size = fonts.open("SharedFontReplacement.bin").size();
}
e.mapped = false;
@ -520,10 +521,13 @@ Regions Memory::getConsoleRegion() {
return region;
}
void Memory::copySharedFont(u8* pointer) {
void Memory::copySharedFont(u8* pointer, u32 vaddr) {
auto fonts = cmrc::ConsoleFonts::get_filesystem();
auto font = fonts.open("CitraSharedFontUSRelocated.bin");
auto font = fonts.open("SharedFontReplacement.bin");
std::memcpy(pointer, font.begin(), font.size());
// Relocate shared font to the address it's being loaded to
HLE::Fonts::relocateSharedFont(pointer, vaddr);
}
std::optional<u64> Memory::getProgramID() {

View file

@ -73,10 +73,7 @@ void GLStateManager::resetVAO() {
}
// Resets buffer-binding state: zeroes the cached VBO/UBO handles and binds buffer 0 to the
// GL_ARRAY_BUFFER and GL_UNIFORM_BUFFER targets so the cache and the GL context agree.
void GLStateManager::resetBuffers() {
boundVBO = 0;
boundUBO = 0;
glBindBuffer(GL_ARRAY_BUFFER, 0);
glBindBuffer(GL_UNIFORM_BUFFER, 0);
}

View file

@ -2,13 +2,16 @@
#include <stb_image_write.h>
#include <bit>
#include <cmrc/cmrc.hpp>
#include "config.hpp"
#include "PICA/float_types.hpp"
#include "PICA/pica_frag_uniforms.hpp"
#include "PICA/gpu.hpp"
#include "PICA/pica_frag_uniforms.hpp"
#include "PICA/pica_simd.hpp"
#include "PICA/regs.hpp"
#include "PICA/shader_decompiler.hpp"
#include "config.hpp"
#include "math_util.hpp"
CMRC_DECLARE(RendererGL);
@ -24,7 +27,7 @@ void RendererGL::reset() {
colourBufferCache.reset();
textureCache.reset();
clearShaderCache();
shaderCache.clear();
// Init the colour/depth buffer settings to some random defaults on reset
colourBufferLoc = 0;
@ -77,40 +80,56 @@ void RendererGL::initGraphicsContextInternal() {
gl.useProgram(displayProgram);
glUniform1i(OpenGL::uniformLocation(displayProgram, "u_texture"), 0); // Init sampler object
// Create stream buffers for vertex, index and uniform buffers
static constexpr usize hwIndexBufferSize = 2_MB;
static constexpr usize hwVertexBufferSize = 16_MB;
hwIndexBuffer = StreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, hwIndexBufferSize);
hwVertexBuffer = StreamBuffer::Create(GL_ARRAY_BUFFER, hwVertexBufferSize);
// Allocate memory for the shadergen fragment uniform UBO
glGenBuffers(1, &shadergenFragmentUBO);
gl.bindUBO(shadergenFragmentUBO);
glBufferData(GL_UNIFORM_BUFFER, sizeof(PICA::FragmentUniforms), nullptr, GL_DYNAMIC_DRAW);
vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW);
gl.bindVBO(vbo);
vao.create();
gl.bindVAO(vao);
// Allocate memory for the accelerated vertex shader uniform UBO
glGenBuffers(1, &hwShaderUniformUBO);
gl.bindUBO(hwShaderUniformUBO);
glBufferData(GL_UNIFORM_BUFFER, PICAShader::totalUniformSize(), nullptr, GL_DYNAMIC_DRAW);
vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize * 2, GL_STREAM_DRAW);
vbo.bind();
// Initialize the VAO used when not using hw shaders
defaultVAO.create();
gl.bindVAO(defaultVAO);
// Position (x, y, z, w) attributes
vao.setAttributeFloat<float>(0, 4, sizeof(Vertex), offsetof(Vertex, s.positions));
vao.enableAttribute(0);
defaultVAO.setAttributeFloat<float>(0, 4, sizeof(Vertex), offsetof(Vertex, s.positions));
defaultVAO.enableAttribute(0);
// Quaternion attribute
vao.setAttributeFloat<float>(1, 4, sizeof(Vertex), offsetof(Vertex, s.quaternion));
vao.enableAttribute(1);
defaultVAO.setAttributeFloat<float>(1, 4, sizeof(Vertex), offsetof(Vertex, s.quaternion));
defaultVAO.enableAttribute(1);
// Colour attribute
vao.setAttributeFloat<float>(2, 4, sizeof(Vertex), offsetof(Vertex, s.colour));
vao.enableAttribute(2);
defaultVAO.setAttributeFloat<float>(2, 4, sizeof(Vertex), offsetof(Vertex, s.colour));
defaultVAO.enableAttribute(2);
// UV 0 attribute
vao.setAttributeFloat<float>(3, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord0));
vao.enableAttribute(3);
defaultVAO.setAttributeFloat<float>(3, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord0));
defaultVAO.enableAttribute(3);
// UV 1 attribute
vao.setAttributeFloat<float>(4, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord1));
vao.enableAttribute(4);
defaultVAO.setAttributeFloat<float>(4, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord1));
defaultVAO.enableAttribute(4);
// UV 0 W-component attribute
vao.setAttributeFloat<float>(5, 1, sizeof(Vertex), offsetof(Vertex, s.texcoord0_w));
vao.enableAttribute(5);
defaultVAO.setAttributeFloat<float>(5, 1, sizeof(Vertex), offsetof(Vertex, s.texcoord0_w));
defaultVAO.enableAttribute(5);
// View
vao.setAttributeFloat<float>(6, 3, sizeof(Vertex), offsetof(Vertex, s.view));
vao.enableAttribute(6);
defaultVAO.setAttributeFloat<float>(6, 3, sizeof(Vertex), offsetof(Vertex, s.view));
defaultVAO.enableAttribute(6);
// UV 2 attribute
vao.setAttributeFloat<float>(7, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord2));
vao.enableAttribute(7);
defaultVAO.setAttributeFloat<float>(7, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord2));
defaultVAO.enableAttribute(7);
// Initialize the VAO used for hw shaders
hwShaderVAO.create();
dummyVBO.create();
dummyVAO.create();
@ -165,8 +184,18 @@ void RendererGL::initGraphicsContextInternal() {
OpenGL::clearColor();
OpenGL::setViewport(oldViewport[0], oldViewport[1], oldViewport[2], oldViewport[3]);
// Initialize fixed attributes
for (int i = 0; i < fixedAttrValues.size(); i++) {
fixedAttrValues[i] = {0.f, 0.f, 0.f, 0.f};
glVertexAttrib4f(i, 0.0, 0.0, 0.0, 0.0);
}
reset();
// Populate our driver info structure
driverInfo.supportsExtFbFetch = (GLAD_GL_EXT_shader_framebuffer_fetch != 0);
driverInfo.supportsArmFbFetch = (GLAD_GL_ARM_shader_framebuffer_fetch != 0);
// Initialize the default vertex shader used with shadergen
std::string defaultShadergenVSSource = fragShaderGen.getDefaultVertexShader();
defaultShadergenVs.create({defaultShadergenVSSource.c_str(), defaultShadergenVSSource.size()}, OpenGL::Vertex);
@ -414,29 +443,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
OpenGL::Triangle,
};
bool usingUbershader = enableUbershader;
if (usingUbershader) {
const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0;
const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1;
// Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using more than N lights via shadergen
// This way we generate fewer shaders overall than with full shadergen, but don't tank performance
if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) {
usingUbershader = false;
}
}
if (usingUbershader) {
gl.useProgram(triangleProgram);
} else {
OpenGL::Program& program = getSpecializedShader();
gl.useProgram(program);
}
const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
gl.disableScissor();
gl.bindVBO(vbo);
gl.bindVAO(vao);
// If we're using accelerated shaders, the hw VAO, VBO and EBO objects will have already been bound in prepareForDraw
if (!usingAcceleratedShader) {
vbo.bind();
gl.bindVAO(defaultVAO);
}
gl.enableClipPlane(0); // Clipping plane 0 is always enabled
if (regs[PICA::InternalRegs::ClipEnable] & 1) {
@ -454,38 +468,9 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
const int depthFunc = getBits<4, 3>(depthControl);
const int colourMask = getBits<8, 4>(depthControl);
gl.setColourMask(colourMask & 1, colourMask & 2, colourMask & 4, colourMask & 8);
static constexpr std::array<GLenum, 8> depthModes = {GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL};
// Update ubershader uniforms
if (usingUbershader) {
const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1;
if (oldDepthScale != depthScale) {
oldDepthScale = depthScale;
glUniform1f(ubershaderData.depthScaleLoc, depthScale);
}
if (oldDepthOffset != depthOffset) {
oldDepthOffset = depthOffset;
glUniform1f(ubershaderData.depthOffsetLoc, depthOffset);
}
if (oldDepthmapEnable != depthMapEnable) {
oldDepthmapEnable = depthMapEnable;
glUniform1i(ubershaderData.depthmapEnableLoc, depthMapEnable);
}
// Upload PICA Registers as a single uniform. The shader needs access to the rasterizer registers (for depth, starting from index 0x48)
// The texturing and the fragment lighting registers. Therefore we upload them all in one go to avoid multiple slow uniform updates
glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]);
setupUbershaderTexEnv();
}
bindTexturesToSlots();
if (gpu.fogLUTDirty) {
updateFogLUT();
}
@ -528,8 +513,32 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
setupStencilTest(stencilEnable);
vbo.bufferVertsSub(vertices);
OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
if (!usingAcceleratedShader) {
vbo.bufferVertsSub(vertices);
OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
} else {
if (performIndexedRender) {
// When doing indexed rendering, use glDrawRangeElementsBaseVertex to issue the indexed draw
hwIndexBuffer->Bind();
if (glDrawRangeElementsBaseVertex != nullptr) [[likely]] {
glDrawRangeElementsBaseVertex(
primitiveTopology, minimumIndex, maximumIndex, GLsizei(vertices.size()), usingShortIndices ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE,
hwIndexBufferOffset, -GLint(minimumIndex)
);
} else {
// If glDrawRangeElementsBaseVertex is not available then prepareForDraw will have subtracted the base vertex from the index buffer
// for us, so just use glDrawRangeElements
glDrawRangeElements(
primitiveTopology, 0, GLint(maximumIndex - minimumIndex), GLsizei(vertices.size()),
usingShortIndices ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE, hwIndexBufferOffset
);
}
} else {
// When doing non-indexed rendering, just use glDrawArrays
OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
}
}
}
void RendererGL::display() {
@ -836,34 +845,53 @@ std::optional<ColourBuffer> RendererGL::getColourBuffer(u32 addr, PICA::ColorFmt
}
OpenGL::Program& RendererGL::getSpecializedShader() {
constexpr uint uboBlockBinding = 2;
constexpr uint vsUBOBlockBinding = 1;
constexpr uint fsUBOBlockBinding = 2;
PICA::FragmentConfig fsConfig(regs);
// If we're not on GLES, ignore the logic op configuration and don't generate redundant shaders for it, since we use hw logic ops
#ifndef USING_GLES
fsConfig.outConfig.logicOpMode = PICA::LogicOpMode(0);
#endif
CachedProgram& programEntry = shaderCache[fsConfig];
OpenGL::Shader& fragShader = shaderCache.fragmentShaderCache[fsConfig];
if (!fragShader.exists()) {
std::string fs = fragShaderGen.generate(fsConfig);
fragShader.create({fs.c_str(), fs.size()}, OpenGL::Fragment);
}
// Get the handle of the current vertex shader
OpenGL::Shader& vertexShader = usingAcceleratedShader ? *generatedVertexShader : defaultShadergenVs;
// And form the key for looking up a shader program
const u64 programKey = (u64(vertexShader.handle()) << 32) | u64(fragShader.handle());
CachedProgram& programEntry = shaderCache.programCache[programKey];
OpenGL::Program& program = programEntry.program;
if (!program.exists()) {
std::string fs = fragShaderGen.generate(fsConfig);
OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment);
program.create({defaultShadergenVs, fragShader});
program.create({vertexShader, fragShader});
gl.useProgram(program);
fragShader.free();
// Init sampler objects. Texture 0 goes in texture unit 0, texture 1 in TU 1, texture 2 in TU 2, and the light maps go in TU 3
glUniform1i(OpenGL::uniformLocation(program, "u_tex0"), 0);
glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1);
glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2);
glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3);
// Set up the binding for our UBO. Sadly we can't specify it in the shader like normal people,
// Set up the binding for our UBOs. Sadly we can't specify it in the shader like normal people,
// As it's an OpenGL 4.2 feature that MacOS doesn't support...
uint uboIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms");
glUniformBlockBinding(program.handle(), uboIndex, uboBlockBinding);
uint fsUBOIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms");
glUniformBlockBinding(program.handle(), fsUBOIndex, fsUBOBlockBinding);
if (usingAcceleratedShader) {
uint vertexUBOIndex = glGetUniformBlockIndex(program.handle(), "PICAShaderUniforms");
glUniformBlockBinding(program.handle(), vertexUBOIndex, vsUBOBlockBinding);
}
}
glBindBufferBase(GL_UNIFORM_BUFFER, fsUBOBlockBinding, shadergenFragmentUBO);
if (usingAcceleratedShader) {
glBindBufferBase(GL_UNIFORM_BUFFER, vsUBOBlockBinding, hwShaderUniformUBO);
}
glBindBufferBase(GL_UNIFORM_BUFFER, uboBlockBinding, shadergenFragmentUBO);
// Upload uniform data to our shader's UBO
PICA::FragmentUniforms uniforms;
@ -953,6 +981,101 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
return program;
}
// Selects and binds the shader program for the upcoming draw.
// Steps, in order:
//  1. Decide between the ubershader and a specialized (shadergen) program.
//  2. Decide whether the vertex shader can run on the GPU ("accelerated"); if so, fetch or build the
//     recompiled vertex shader, upload its uniforms and upload vertex/index data.
//  3. Bind the chosen program and, for the ubershader, refresh its uniforms.
// `accel` may be null, in which case acceleration is never used.
// Returns whether the accelerated vertex shader path is used for this draw.
bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) {
// First we figure out if we will be using an ubershader
bool usingUbershader = emulatorConfig->useUbershaders;
if (usingUbershader) {
const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0;
const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1;
// Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using more than N lights via shadergen
// This way we generate fewer shaders overall than with full shadergen, but don't tank performance
if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) {
usingUbershader = false;
}
}
// Then we figure out if we will use hw accelerated shaders, and try to fetch our shader
// TODO: Ubershader support for accelerated shaders
usingAcceleratedShader = emulatorConfig->accelerateShaders && !usingUbershader && accel != nullptr && accel->canBeAccelerated;
if (usingAcceleratedShader) {
PICA::VertConfig vertexConfig(shaderUnit.vs, regs, usingUbershader);
// Indexing the cache creates an empty optional for configurations we have not seen before
std::optional<OpenGL::Shader>& shader = shaderCache.vertexShaderCache[vertexConfig];
// If the optional is empty, we have never tried to recompile the shader before. Try to recompile it and see if it works.
if (!shader.has_value()) {
// Initialize shader to a "null" shader (handle == 0)
// The optional now has a value, so a failed recompilation is remembered and not retried on later draws
shader = OpenGL::Shader();
std::string picaShaderSource = PICA::ShaderGen::decompileShader(
shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint,
Helpers::isAndroid() ? PICA::ShaderGen::API::GLES : PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL
);
// Empty source means compilation error, if the source is not empty then we convert the recompiled PICA code into a valid shader and upload
// it to the GPU
if (!picaShaderSource.empty()) {
std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, vertexConfig, usingUbershader);
shader->create({vertexShaderSource}, OpenGL::Vertex);
}
}
// Shader generation did not work out, so set usingAcceleratedShader to false
if (!shader->exists()) {
usingAcceleratedShader = false;
} else {
// Remember the vertex shader for this draw
generatedVertexShader = &(*shader);
gl.bindUBO(hwShaderUniformUBO);
// Only re-upload the vertex shader uniforms when they are flagged dirty
if (shaderUnit.vs.uniformsDirty) {
shaderUnit.vs.uniformsDirty = false;
glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer());
}
// Cache the index information of this draw in renderer state
performIndexedRender = accel->indexed;
minimumIndex = GLsizei(accel->minimumIndex);
maximumIndex = GLsizei(accel->maximumIndex);
// Upload vertex data and index buffer data to our GPU
accelerateVertexUpload(shaderUnit, accel);
}
}
if (!usingUbershader) {
// Runs after the block above, so usingAcceleratedShader and generatedVertexShader are already settled for this draw
OpenGL::Program& program = getSpecializedShader();
gl.useProgram(program);
} else { // Bind ubershader & load ubershader uniforms
gl.useProgram(triangleProgram);
const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1;
// Depth uniforms are only re-sent when their value differs from the last one uploaded
if (oldDepthScale != depthScale) {
oldDepthScale = depthScale;
glUniform1f(ubershaderData.depthScaleLoc, depthScale);
}
if (oldDepthOffset != depthOffset) {
oldDepthOffset = depthOffset;
glUniform1f(ubershaderData.depthOffsetLoc, depthOffset);
}
if (oldDepthmapEnable != depthMapEnable) {
oldDepthmapEnable = depthMapEnable;
glUniform1i(ubershaderData.depthmapEnableLoc, depthMapEnable);
}
// Upload PICA Registers as a single uniform. The shader needs access to the rasterizer registers (for depth, starting from index 0x48)
// The texturing and the fragment lighting registers. Therefore we upload them all in one go to avoid multiple slow uniform updates
glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]);
setupUbershaderTexEnv();
}
return usingAcceleratedShader;
}
void RendererGL::screenshot(const std::string& name) {
constexpr uint width = 400;
constexpr uint height = 2 * 240;
@ -966,7 +1089,7 @@ void RendererGL::screenshot(const std::string& name) {
// Flip the image vertically
for (int y = 0; y < height; y++) {
memcpy(&flippedPixels[y * width * 4], &pixels[(height - y - 1) * width * 4], width * 4);
std::memcpy(&flippedPixels[y * width * 4], &pixels[(height - y - 1) * width * 4], width * 4);
// Swap R and B channels
for (int x = 0; x < width; x++) {
std::swap(flippedPixels[y * width * 4 + x * 4 + 0], flippedPixels[y * width * 4 + x * 4 + 2]);
@ -978,21 +1101,12 @@ void RendererGL::screenshot(const std::string& name) {
stbi_write_png(name.c_str(), width, height, 4, flippedPixels.data(), 0);
}
// Frees the GL program held by every shader cache entry, then empties the cache.
void RendererGL::clearShaderCache() {
	for (auto& [config, cachedProgram] : shaderCache) {
		cachedProgram.program.free();
	}

	shaderCache.clear();
}
void RendererGL::deinitGraphicsContext() {
// Invalidate all surface caches since they'll no longer be valid
textureCache.reset();
depthBufferCache.reset();
colourBufferCache.reset();
clearShaderCache();
shaderCache.clear();
// All other GL objects should be invalidated automatically and be recreated by the next call to initGraphicsContext
// TODO: Make it so that depth and colour buffers get written back to 3DS memory
@ -1041,3 +1155,99 @@ void RendererGL::initUbershader(OpenGL::Program& program) {
glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2);
glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3);
}
// Uploads the data for a hardware-accelerated draw and configures vertex fetching for it.
//  - For indexed draws, streams the index buffer into hwIndexBuffer and records its offset and index width.
//  - Streams the data of every attribute loader into hwVertexBuffer.
//  - Binds hwShaderVAO, enables/disables the attributes whose state changed since the previous call, and sets
//    either the fixed value or the vertex pointer of each of the 16 PICA input registers.
// `accel` must be non-null. `shaderUnit` is currently unused.
void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) {
	// GL component type for each PICA attribute type, indexed by the attribute's `type` field
	static constexpr GLenum attributeFormats[4] = {
		GL_BYTE,           // 0: Signed byte
		GL_UNSIGNED_BYTE,  // 1: Unsigned byte
		GL_SHORT,          // 2: Short
		GL_FLOAT,          // 3: Float
	};

	// Update index buffer if necessary
	if (accel->indexed) {
		usingShortIndices = accel->useShortIndices;

		const u32 indexCount = regs[PICA::InternalRegs::VertexCountReg];
		const usize indexBufferSize = indexCount * (usingShortIndices ? sizeof(u16) : sizeof(u8));

		hwIndexBuffer->Bind();
		auto indexBufferRes = hwIndexBuffer->Map(4, indexBufferSize);
		hwIndexBufferOffset = reinterpret_cast<void*>(usize(indexBufferRes.buffer_offset));

		std::memcpy(indexBufferRes.pointer, accel->indexBuffer, indexBufferSize);

		// If we don't have glDrawRangeElementsBaseVertex, we must subtract the base index value from our index buffer manually
		if (glDrawRangeElementsBaseVertex == nullptr) [[unlikely]] {
			if (usingShortIndices) {
				PICA::IndexBuffer::subtractBaseIndex<true>((u8*)indexBufferRes.pointer, indexCount, accel->minimumIndex);
			} else {
				PICA::IndexBuffer::subtractBaseIndex<false>((u8*)indexBufferRes.pointer, indexCount, accel->minimumIndex);
			}
		}

		hwIndexBuffer->Unmap(indexBufferSize);
	}

	hwVertexBuffer->Bind();
	auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize);
	u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer);
	// Widen the offset before it is turned into a pointer-sized value, like the index buffer offset above
	const usize vertexBufferOffset = usize(vertexBufferRes.buffer_offset);

	gl.bindVAO(hwShaderVAO);

	// Enable or disable vertex attributes as needed
	const u32 currentAttributeMask = accel->enabledAttributeMask;
	// Use bitwise xor to calculate which attributes changed
	u32 attributeMaskDiff = currentAttributeMask ^ previousAttributeMask;

	while (attributeMaskDiff != 0) {
		// Get the index of the highest attribute that changed and clear it from the diff
		const u32 index = 31 - std::countl_zero<u32>(attributeMaskDiff);
		const u32 mask = 1u << index;
		attributeMaskDiff ^= mask;

		if ((currentAttributeMask & mask) != 0) {
			// Attribute was disabled and is now enabled
			hwShaderVAO.enableAttribute(index);
		} else {
			// Attribute was enabled and is now disabled
			hwShaderVAO.disableAttribute(index);
		}
	}

	previousAttributeMask = currentAttributeMask;

	// Upload the data for each (enabled) attribute loader into our vertex buffer
	for (int i = 0; i < accel->totalLoaderCount; i++) {
		auto& loader = accel->loaders[i];

		std::memcpy(vertexData, loader.data, loader.size);
		vertexData += loader.size;
	}

	hwVertexBuffer->Unmap(accel->vertexDataSize);

	// Iterate over the 16 PICA input registers and configure how they should be fetched.
	for (int i = 0; i < 16; i++) {
		const auto& attrib = accel->attributeInfo[i];
		const u32 attributeMask = 1u << i;

		if (accel->fixedAttributes & attributeMask) {
			auto& attrValue = fixedAttrValues[i];
			// This is a fixed attribute, so set its fixed value, but only if it actually needs to be updated
			if (attrValue[0] != attrib.fixedValue[0] || attrValue[1] != attrib.fixedValue[1] || attrValue[2] != attrib.fixedValue[2] ||
				attrValue[3] != attrib.fixedValue[3]) {
				std::memcpy(attrValue.data(), attrib.fixedValue.data(), sizeof(attrib.fixedValue));
				glVertexAttrib4f(i, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]);
			}
		} else if (accel->enabledAttributeMask & attributeMask) {
			// The "pointer" argument is a byte offset into the currently bound vertex buffer
			glVertexAttribPointer(
				i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride,
				reinterpret_cast<GLvoid*>(vertexBufferOffset + attrib.offset)
			);
		}
	}
}

109
src/core/services/fonts.cpp Normal file
View file

@ -0,0 +1,109 @@
// Copyright 2016 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
// Adapted from https://github.com/PabloMK7/citra/blob/master/src/core/hle/service/apt/bcfnt/bcfnt.cpp
#include "services/fonts.hpp"
#include <cstring>
namespace HLE::Fonts {
// Rebases the offsets stored inside a shared font image so that they are relative to `newAddress`.
//
// sharedFont: start of the shared font buffer; the CFNT header is read from byte offset 0x80.
// newAddress: the base the stored offsets are rebased onto.
//
// Works in two passes over the CFNT sections:
//  1) Find where the first CMAP/CWDH/TGLP sections actually live in the buffer and compare that with
//     the offsets recorded in the FINF section, which yields the base the file was previously built for.
//  2) Add (newAddress - previousBase) to every offset stored in the FINF, CMAP, CWDH and TGLP sections.
void relocateSharedFont(u8* sharedFont, u32 newAddress) {
	constexpr u32 sharedFontStartOffset = 0x80;
	const u8* cfntData = &sharedFont[sharedFontStartOffset];

	CFNT cfnt;
	std::memcpy(&cfnt, cfntData, sizeof(cfnt));

	// Offsets as recorded in the FINF section (minus the section header they point past)
	u32 assumedCmapOffset = 0;
	u32 assumedCwdhOffset = 0;
	u32 assumedTglpOffset = 0;

	// Offsets at which the first section of each kind was actually found in the buffer
	u32 firstCmapOffset = 0;
	u32 firstCwdhOffset = 0;
	u32 firstTglpOffset = 0;

	// First discover the location of sections so that the rebase offset can be auto-detected
	u32 currentOffset = sharedFontStartOffset + cfnt.headerSize;
	// Use the project's fixed-width u32 for the counter: plain `uint` is a non-standard typedef
	// that is not available on every toolchain.
	for (u32 block = 0; block < cfnt.numBlocks; ++block) {
		const u8* data = &sharedFont[currentOffset];

		SectionHeader sectionHeader;
		std::memcpy(&sectionHeader, data, sizeof(sectionHeader));

		if (firstCmapOffset == 0 && std::memcmp(sectionHeader.magic, "CMAP", 4) == 0) {
			firstCmapOffset = currentOffset;
		} else if (firstCwdhOffset == 0 && std::memcmp(sectionHeader.magic, "CWDH", 4) == 0) {
			firstCwdhOffset = currentOffset;
		} else if (firstTglpOffset == 0 && std::memcmp(sectionHeader.magic, "TGLP", 4) == 0) {
			firstTglpOffset = currentOffset;
		} else if (std::memcmp(sectionHeader.magic, "FINF", 4) == 0) {
			Fonts::FINF finf;
			std::memcpy(&finf, data, sizeof(finf));

			// The FINF offsets are adjusted by the size of a section header so they can be
			// compared against the section start offsets collected above
			assumedCmapOffset = finf.cmapOffset - sizeof(SectionHeader);
			assumedCwdhOffset = finf.cwdhOffset - sizeof(SectionHeader);
			assumedTglpOffset = finf.tglpOffset - sizeof(SectionHeader);
		}

		currentOffset += sectionHeader.sectionSize;
	}

	// All three section kinds must agree on the previous base, otherwise the file layout is not what we expect
	u32 previousBase = assumedCmapOffset - firstCmapOffset;
	if ((previousBase != assumedCwdhOffset - firstCwdhOffset) || (previousBase != assumedTglpOffset - firstTglpOffset)) {
		Helpers::warn("You shouldn't be seeing this. Shared Font file offsets might be borked?");
	}

	// Unsigned wrap-around is intended here: adding `offset` moves an offset from previousBase to newAddress
	u32 offset = newAddress - previousBase;

	// Reset pointer back to start of sections and do the actual rebase
	currentOffset = sharedFontStartOffset + cfnt.headerSize;
	for (u32 block = 0; block < cfnt.numBlocks; ++block) {
		u8* data = &sharedFont[currentOffset];

		SectionHeader sectionHeader;
		std::memcpy(&sectionHeader, data, sizeof(sectionHeader));

		if (std::memcmp(sectionHeader.magic, "FINF", 4) == 0) {
			Fonts::FINF finf;
			std::memcpy(&finf, data, sizeof(finf));

			// Relocate the offsets in the FINF section
			finf.cmapOffset += offset;
			finf.cwdhOffset += offset;
			finf.tglpOffset += offset;

			std::memcpy(data, &finf, sizeof(finf));
		} else if (std::memcmp(sectionHeader.magic, "CMAP", 4) == 0) {
			Fonts::CMAP cmap;
			std::memcpy(&cmap, data, sizeof(cmap));

			// Relocate the offsets in the CMAP section (a zero offset is left untouched)
			if (cmap.nextCmapOffset != 0) {
				cmap.nextCmapOffset += offset;
			}

			std::memcpy(data, &cmap, sizeof(cmap));
		} else if (std::memcmp(sectionHeader.magic, "CWDH", 4) == 0) {
			Fonts::CWDH cwdh;
			std::memcpy(&cwdh, data, sizeof(cwdh));

			// Relocate the offsets in the CWDH section (a zero offset is left untouched)
			if (cwdh.nextCwdhOffset != 0) {
				cwdh.nextCwdhOffset += offset;
			}

			std::memcpy(data, &cwdh, sizeof(cwdh));
		} else if (std::memcmp(sectionHeader.magic, "TGLP", 4) == 0) {
			Fonts::TGLP tglp;
			std::memcpy(&tglp, data, sizeof(tglp));

			// Relocate the offsets in the TGLP section
			tglp.sheetDataOffset += offset;

			std::memcpy(data, &tglp, sizeof(tglp));
		}

		currentOffset += sectionHeader.sectionSize;
	}
}
} // namespace HLE::Fonts

View file

@ -8,6 +8,7 @@
#include "renderer_gl/renderer_gl.hpp"
#include "services/hid.hpp"
#include "android_utils.hpp"
#include "sdl_sensors.hpp"
std::unique_ptr<Emulator> emulator = nullptr;
HIDService* hidService = nullptr;
@ -43,6 +44,7 @@ extern "C" {
AlberFunction(void, functionName) (JNIEnv* env, jobject obj, type value) { emulator->getConfig().settingName = value; }
MAKE_SETTING(setShaderJitEnabled, jboolean, shaderJitEnabled)
MAKE_SETTING(setAccurateShaderMulEnable, jboolean, accurateShaderMul)
#undef MAKE_SETTING
@ -87,6 +89,7 @@ AlberFunction(void, Finalize)(JNIEnv* env, jobject obj) {
emulator = nullptr;
hidService = nullptr;
renderer = nullptr;
romLoaded = false;
}
AlberFunction(jboolean, HasRomLoaded)(JNIEnv* env, jobject obj) { return romLoaded; }
@ -110,6 +113,19 @@ AlberFunction(void, TouchScreenUp)(JNIEnv* env, jobject obj) { hidService->relea
AlberFunction(void, KeyUp)(JNIEnv* env, jobject obj, jint keyCode) { hidService->releaseKey((u32)keyCode); }
AlberFunction(void, KeyDown)(JNIEnv* env, jobject obj, jint keyCode) { hidService->pressKey((u32)keyCode); }
// JNI entry point: converts the raw gyroscope reading and forwards it to the HID service.
// The converted vector's x/y/z components are used as pitch/roll/yaw respectively.
AlberFunction(void, SetGyro)(JNIEnv* env, jobject obj, jfloat roll, jfloat pitch, jfloat yaw) {
	const auto converted = Sensors::SDL::convertRotation({static_cast<float>(roll), static_cast<float>(pitch), static_cast<float>(yaw)});

	const s16 pitchValue = static_cast<s16>(converted.x);
	const s16 rollValue = static_cast<s16>(converted.y);
	const s16 yawValue = static_cast<s16>(converted.z);

	hidService->setPitch(pitchValue);
	hidService->setRoll(rollValue);
	hidService->setYaw(yawValue);
}
// JNI entry point: converts the raw accelerometer reading and forwards it to the HID service.
AlberFunction(void, SetAccel)(JNIEnv* env, jobject obj, jfloat rawX, jfloat rawY, jfloat rawZ) {
	// convertAcceleration takes the three axes as a float array
	float rawAxes[3] = {static_cast<float>(rawX), static_cast<float>(rawY), static_cast<float>(rawZ)};
	const auto converted = Sensors::SDL::convertAcceleration(rawAxes);

	hidService->setAccel(converted.x, converted.y, converted.z);
}
AlberFunction(void, SetCirclepadAxis)(JNIEnv* env, jobject obj, jint x, jint y) {
hidService->setCirclepadX((s16)x);
hidService->setCirclepadY((s16)y);
@ -139,4 +155,4 @@ int AndroidUtils::openDocument(const char* path, const char* perms) {
env->DeleteLocalRef(jmode);
return (int)result;
}
}

View file

@ -163,13 +163,14 @@ static int fetchVariableRange(std::string key, int min, int max) {
static void configInit() {
static const retro_variable values[] = {
{"panda3ds_use_shader_jit", EmulatorConfig::shaderJitDefault ? "Enable shader JIT; enabled|disabled"
: "Enable shader JIT; disabled|enabled"},
{"panda3ds_use_shader_jit", EmulatorConfig::shaderJitDefault ? "Enable shader JIT; enabled|disabled" : "Enable shader JIT; disabled|enabled"},
{"panda3ds_accelerate_shaders",
EmulatorConfig::accelerateShadersDefault ? "Run 3DS shaders on the GPU; enabled|disabled" : "Run 3DS shaders on the GPU; disabled|enabled"},
{"panda3ds_accurate_shader_mul", "Enable accurate shader multiplication; disabled|enabled"},
{"panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault ? "Use ubershaders (No stutter, maybe slower); enabled|disabled"
: "Use ubershaders (No stutter, maybe slower); disabled|enabled"},
{"panda3ds_use_vsync", "Enable VSync; enabled|disabled"},
{"panda3ds_dsp_emulation", "DSP emulation; Null|HLE|LLE"},
{"panda3ds_dsp_emulation", "DSP emulation; HLE|LLE|Null"},
{"panda3ds_use_audio", "Enable audio; disabled|enabled"},
{"panda3ds_use_virtual_sd", "Enable virtual SD card; enabled|disabled"},
{"panda3ds_write_protect_virtual_sd", "Write protect virtual SD card; disabled|enabled"},
@ -197,6 +198,8 @@ static void configUpdate() {
config.sdWriteProtected = fetchVariableBool("panda3ds_write_protect_virtual_sd", false);
config.accurateShaderMul = fetchVariableBool("panda3ds_accurate_shader_mul", false);
config.useUbershaders = fetchVariableBool("panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault);
config.accelerateShaders = fetchVariableBool("panda3ds_accelerate_shaders", EmulatorConfig::accelerateShadersDefault);
config.forceShadergenForLights = fetchVariableBool("panda3ds_ubershader_lighting_override", true);
config.lightShadergenThreshold = fetchVariableRange("panda3ds_ubershader_lighting_override_threshold", 1, 8);
config.discordRpcEnabled = false;

View file

@ -130,6 +130,32 @@ MAKE_MEMORY_FUNCTIONS(32)
MAKE_MEMORY_FUNCTIONS(64)
#undef MAKE_MEMORY_FUNCTIONS
// Lua thunk: reads the 32-bit word at the address given as argument 1, reinterprets its bits
// as a float and pushes the result. Returns 1 (one Lua return value).
static int readFloatThunk(lua_State* L) {
	const auto address = static_cast<u32>(lua_tonumber(L, 1));
	const u32 rawBits = LuaManager::g_emulator->getMemory().read32(address);
	const float value = Helpers::bit_cast<float, u32>(rawBits);

	lua_pushnumber(L, static_cast<lua_Number>(value));
	return 1;
}
// Lua thunk: narrows argument 2 to a float and writes its bit pattern as a 32-bit word
// to the address given as argument 1. Returns 0 (no Lua return values).
static int writeFloatThunk(lua_State* L) {
	const auto address = static_cast<u32>(lua_tonumber(L, 1));
	const auto value = static_cast<float>(lua_tonumber(L, 2));
	const u32 rawBits = Helpers::bit_cast<u32, float>(value);

	LuaManager::g_emulator->getMemory().write32(address, rawBits);
	return 0;
}
// Lua thunk: reads the 64-bit word at the address given as argument 1, reinterprets its bits
// as a double and pushes the result. Returns 1 (one Lua return value).
static int readDoubleThunk(lua_State* L) {
	const auto address = static_cast<u32>(lua_tonumber(L, 1));
	const u64 rawBits = LuaManager::g_emulator->getMemory().read64(address);
	const double value = Helpers::bit_cast<double, u64>(rawBits);

	lua_pushnumber(L, static_cast<lua_Number>(value));
	return 1;
}
// Lua thunk: takes argument 2 as a double and writes its bit pattern as a 64-bit word
// to the address given as argument 1. Returns 0 (no Lua return values).
static int writeDoubleThunk(lua_State* L) {
	const auto address = static_cast<u32>(lua_tonumber(L, 1));
	const auto value = static_cast<double>(lua_tonumber(L, 2));
	const u64 rawBits = Helpers::bit_cast<u64, double>(value);

	LuaManager::g_emulator->getMemory().write64(address, rawBits);
	return 0;
}
static int getAppIDThunk(lua_State* L) {
std::optional<u64> id = LuaManager::g_emulator->getMemory().getProgramID();
@ -248,10 +274,14 @@ static constexpr luaL_Reg functions[] = {
{ "__read16", read16Thunk },
{ "__read32", read32Thunk },
{ "__read64", read64Thunk },
{ "__readFloat", readFloatThunk },
{ "__readDouble", readDoubleThunk },
{ "__write8", write8Thunk} ,
{ "__write16", write16Thunk },
{ "__write32", write32Thunk },
{ "__write64", write64Thunk },
{ "__writeFloat", writeFloatThunk },
{ "__writeDouble", writeDoubleThunk },
{ "__getAppID", getAppIDThunk },
{ "__pause", pauseThunk },
{ "__resume", resumeThunk },
@ -273,10 +303,15 @@ void LuaManager::initializeThunks() {
read16 = function(addr) return GLOBALS.__read16(addr) end,
read32 = function(addr) return GLOBALS.__read32(addr) end,
read64 = function(addr) return GLOBALS.__read64(addr) end,
readFloat = function(addr) return GLOBALS.__readFloat(addr) end,
readDouble = function(addr) return GLOBALS.__readDouble(addr) end,
write8 = function(addr, value) GLOBALS.__write8(addr, value) end,
write16 = function(addr, value) GLOBALS.__write16(addr, value) end,
write32 = function(addr, value) GLOBALS.__write32(addr, value) end,
write64 = function(addr, value) GLOBALS.__write64(addr, value) end,
writeFloat = function(addr, value) GLOBALS.__writeFloat(addr, value) end,
writeDouble = function(addr, value) GLOBALS.__writeDouble(addr, value) end,
getAppID = function()
local ffi = require("ffi")

View file

@ -24,13 +24,16 @@ public class AlberDriver {
public static native void KeyUp(int code);
public static native void SetCirclepadAxis(int x, int y);
public static native void TouchScreenUp();
public static native void TouchScreenDown(int x, int y);
// Native (JNI) method; x and y are handed to the native implementation as-is.
public static native void TouchScreenDown(int x, int y);
public static native void SetGyro(float roll, float pitch, float yaw);
public static native void SetAccel(float x, float y, float z);
public static native void Pause();
public static native void Resume();
public static native void LoadLuaScript(String script);
public static native byte[] GetSmdh();
public static native void setShaderJitEnabled(boolean enable);
public static native void setAccurateShaderMulEnable(boolean enable);
public static int openDocument(String path, String mode) {
try {

View file

@ -3,11 +3,22 @@ package com.panda3ds.pandroid.app;
import android.app.ActivityManager;
import android.app.PictureInPictureParams;
import android.content.Intent;
import android.content.res.Configuration;
import android.hardware.Sensor;
import android.hardware.SensorEvent;
import android.hardware.SensorEventListener;
import android.hardware.SensorManager;
import android.opengl.Matrix;
import android.os.Build;
import android.os.Bundle;
import android.renderscript.Matrix3f;
import android.renderscript.Matrix4f;
import android.util.Log;
import android.util.Rational;
import android.view.Display;
import android.view.KeyEvent;
import android.view.MotionEvent;
import android.view.Surface;
import android.view.View;
import android.view.ViewGroup;
import android.view.WindowManager;
@ -25,6 +36,7 @@ import com.panda3ds.pandroid.app.game.EmulatorCallback;
import com.panda3ds.pandroid.data.config.GlobalConfig;
import com.panda3ds.pandroid.input.InputHandler;
import com.panda3ds.pandroid.input.InputMap;
import com.panda3ds.pandroid.math.Vector3;
import com.panda3ds.pandroid.utils.Constants;
import com.panda3ds.pandroid.view.PandaGlSurfaceView;
import com.panda3ds.pandroid.view.PandaLayoutController;
@ -32,7 +44,7 @@ import com.panda3ds.pandroid.view.ds.DsLayoutManager;
import com.panda3ds.pandroid.view.renderer.ConsoleRenderer;
import com.panda3ds.pandroid.view.utils.PerformanceView;
public class GameActivity extends BaseActivity implements EmulatorCallback {
public class GameActivity extends BaseActivity implements EmulatorCallback, SensorEventListener {
private final DrawerFragment drawerFragment = new DrawerFragment();
private final AlberInputListener inputListener = new AlberInputListener(this);
private ConsoleRenderer renderer;
@ -74,6 +86,19 @@ public class GameActivity extends BaseActivity implements EmulatorCallback {
((FrameLayout) findViewById(R.id.panda_gl_frame)).addView(view, new FrameLayout.LayoutParams(ViewGroup.LayoutParams.WRAP_CONTENT, ViewGroup.LayoutParams.WRAP_CONTENT));
}
swapScreens(GlobalConfig.get(GlobalConfig.KEY_CURRENT_DS_LAYOUT));
registerSensors();
}
/**
 * Subscribes this activity (as a SensorEventListener) to the default accelerometer and gyroscope.
 * A sensor is skipped when getDefaultSensor returns null for it.
 */
private void registerSensors() {
    SensorManager sensorManager = (SensorManager) getSystemService(SENSOR_SERVICE);

    Sensor accelerometer = sensorManager.getDefaultSensor(Sensor.TYPE_ACCELEROMETER);
    if (accelerometer != null) {
        // SENSOR_DELAY_GAME is the named constant for the sampling rate previously passed as a bare 1
        sensorManager.registerListener(this, accelerometer, SensorManager.SENSOR_DELAY_GAME);
    }

    Sensor gyroscope = sensorManager.getDefaultSensor(Sensor.TYPE_GYROSCOPE);
    if (gyroscope != null) {
        sensorManager.registerListener(this, gyroscope, SensorManager.SENSOR_DELAY_GAME);
    }
}
private void changeOverlayVisibility(boolean visible) {
@ -85,7 +110,7 @@ public class GameActivity extends BaseActivity implements EmulatorCallback {
@Override
protected void onResume() {
super.onResume();
getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
getWindow().getDecorView().setSystemUiVisibility(View.SYSTEM_UI_FLAG_FULLSCREEN | View.SYSTEM_UI_FLAG_HIDE_NAVIGATION);
getWindow().addFlags(WindowManager.LayoutParams.FLAG_FULLSCREEN);
InputHandler.reset();
@ -94,6 +119,7 @@ public class GameActivity extends BaseActivity implements EmulatorCallback {
if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O_MR1) {
getTheme().applyStyle(R.style.GameActivityNavigationBar, true);
}
registerSensors();
}
private void enablePIP() {
@ -113,6 +139,7 @@ public class GameActivity extends BaseActivity implements EmulatorCallback {
protected void onPause() {
super.onPause();
((SensorManager)getSystemService(SENSOR_SERVICE)).unregisterListener(this);
InputHandler.reset();
if (GlobalConfig.get(GlobalConfig.KEY_PICTURE_IN_PICTURE)) {
if (Build.VERSION.SDK_INT > Build.VERSION_CODES.O) {
@ -174,10 +201,48 @@ public class GameActivity extends BaseActivity implements EmulatorCallback {
/** Stops sensor delivery to this activity and finalizes the emulator core if a ROM is loaded. */
@Override
protected void onDestroy() {
    SensorManager sensorManager = (SensorManager) getSystemService(SENSOR_SERVICE);
    sensorManager.unregisterListener(this);

    boolean romLoaded = AlberDriver.HasRomLoaded();
    if (romLoaded) {
        AlberDriver.Finalize();
    }

    super.onDestroy();
}
/**
 * Maps the display rotation of the window's decor view to an angle in degrees:
 * ROTATION_90 -> 90, ROTATION_180 -> 180, ROTATION_270 -> -90, anything else -> 0.
 * Returns 0 when the decor view or its display is not available.
 */
private float getDeviceRotationAngle() {
    View decorView = getWindow().getDecorView();
    if (decorView == null) {
        return 0.0f;
    }

    Display display = decorView.getDisplay();
    if (display == null) {
        return 0.0f;
    }

    int rotation = display.getRotation();
    if (rotation == Surface.ROTATION_90) {
        return 90.0f;
    } else if (rotation == Surface.ROTATION_180) {
        return 180.0f;
    } else if (rotation == Surface.ROTATION_270) {
        return -90.0f;
    }
    return 0.0f;
}
/**
 * Forwards accelerometer and gyroscope readings to the emulator core once a ROM is loaded.
 * Each reading is rotated around the Z axis by the current display rotation before being sent.
 * Events from any other sensor type are ignored.
 */
@Override
public void onSensorChanged(SensorEvent event) {
    if (!AlberDriver.HasRomLoaded()) {
        return;
    }

    int sensorType = event.sensor.getType();
    boolean isAccelerometer = sensorType == Sensor.TYPE_ACCELEROMETER;
    boolean isGyroscope = sensorType == Sensor.TYPE_GYROSCOPE;
    if (!isAccelerometer && !isGyroscope) {
        return;
    }

    float[] values = event.values;
    Vector3 reading = new Vector3(values[0], values[1], values[2]);

    // Display rotation converted from degrees to radians
    float zRotation = (float) (getDeviceRotationAngle() * (Math.PI / 180.0f));
    reading.rotateByEuler(new Vector3(0, 0, zRotation));

    if (isAccelerometer) {
        AlberDriver.SetAccel(reading.x, reading.y, reading.z);
    } else {
        AlberDriver.SetGyro(reading.x, reading.y, reading.z);
    }
}
// SensorEventListener callback; accuracy changes are ignored (empty body).
@Override
public void onAccuracyChanged(Sensor sensor, int accuracy) {}
}

View file

@ -26,6 +26,10 @@ public abstract class BasePreferenceFragment extends PreferenceFragmentCompat {
((SwitchPreferenceCompat)findPreference(id)).setChecked(value);
}
// Sets the summary text of the preference looked up by the given key.
// NOTE(review): the result of findPreference is dereferenced without a null check; presumably
// callers only pass keys that exist in the loaded preference XML. Confirm.
protected void setSummaryValue(String id,String text) {
findPreference(id).setSummary(text);
}
protected void setActivityTitle(@StringRes int titleId) {
ActionBar header = ((AppCompatActivity) requireActivity()).getSupportActionBar();
if (header != null) {

View file

@ -22,6 +22,7 @@ public class AdvancedPreferences extends BasePreferenceFragment {
setItemClick("performanceMonitor", pref -> GlobalConfig.set(GlobalConfig.KEY_SHOW_PERFORMANCE_OVERLAY, ((SwitchPreferenceCompat) pref).isChecked()));
setItemClick("shaderJit", pref -> GlobalConfig.set(GlobalConfig.KEY_SHADER_JIT, ((SwitchPreferenceCompat) pref).isChecked()));
setItemClick("accurateShaderMul", pref -> GlobalConfig.set(GlobalConfig.KEY_ACCURATE_SHADER_MULTIPLY, ((SwitchPreferenceCompat) pref).isChecked()));
setItemClick("loggerService", pref -> {
boolean checked = ((SwitchPreferenceCompat) pref).isChecked();
Context ctx = PandroidApplication.getAppContext();
@ -46,5 +47,6 @@ public class AdvancedPreferences extends BasePreferenceFragment {
((SwitchPreferenceCompat) findPreference("performanceMonitor")).setChecked(GlobalConfig.get(GlobalConfig.KEY_SHOW_PERFORMANCE_OVERLAY));
((SwitchPreferenceCompat) findPreference("loggerService")).setChecked(GlobalConfig.get(GlobalConfig.KEY_LOGGER_SERVICE));
((SwitchPreferenceCompat) findPreference("shaderJit")).setChecked(GlobalConfig.get(GlobalConfig.KEY_SHADER_JIT));
((SwitchPreferenceCompat) findPreference("accurateShaderMul")).setChecked(GlobalConfig.get(GlobalConfig.KEY_ACCURATE_SHADER_MULTIPLY));
}
}

View file

@ -1,7 +1,13 @@
package com.panda3ds.pandroid.app.preferences;
import android.net.Uri;
import android.os.Bundle;
import android.util.Log;
import android.widget.Toast;
import androidx.activity.result.ActivityResultCallback;
import androidx.activity.result.ActivityResultLauncher;
import androidx.activity.result.contract.ActivityResultContracts;
import androidx.annotation.Nullable;
import androidx.preference.SwitchPreferenceCompat;
@ -10,8 +16,11 @@ import com.panda3ds.pandroid.app.PreferenceActivity;
import com.panda3ds.pandroid.app.base.BasePreferenceFragment;
import com.panda3ds.pandroid.app.preferences.screen_editor.ScreenLayoutsPreference;
import com.panda3ds.pandroid.data.config.GlobalConfig;
import com.panda3ds.pandroid.utils.FileUtils;
public class GeneralPreferences extends BasePreferenceFragment {
public class GeneralPreferences extends BasePreferenceFragment implements ActivityResultCallback<Uri> {
private final ActivityResultContracts.OpenDocument openFolderContract = new ActivityResultContracts.OpenDocument();
private ActivityResultLauncher<String[]> pickFileRequest;
@Override
public void onCreatePreferences(@Nullable Bundle savedInstanceState, @Nullable String rootKey) {
setPreferencesFromResource(R.xml.general_preference, rootKey);
@ -21,6 +30,11 @@ public class GeneralPreferences extends BasePreferenceFragment {
setItemClick("behavior.pictureInPicture", (pref)-> GlobalConfig.set(GlobalConfig.KEY_PICTURE_IN_PICTURE, ((SwitchPreferenceCompat)pref).isChecked()));
setActivityTitle(R.string.general);
refresh();
setItemClick("games.aes_key", pref -> pickFileRequest.launch(new String[]{ "text/plain" }));
setItemClick("games.seed_db", pref -> pickFileRequest.launch(new String[]{ "application/octet-stream" }));
pickFileRequest = registerForActivityResult(openFolderContract, this);
}
@Override
@ -31,5 +45,45 @@ public class GeneralPreferences extends BasePreferenceFragment {
/**
 * Syncs the preference widgets with the current state: the picture-in-picture switch with the
 * stored config value, and the key / seed database entries with whether the corresponding file
 * exists under the private "sysdata" folder.
 */
private void refresh() {
    setSwitchValue("behavior.pictureInPicture", GlobalConfig.get(GlobalConfig.KEY_PICTURE_IN_PICTURE));

    boolean hasAesKeys = FileUtils.exists(FileUtils.getPrivatePath()+"/sysdata/aes_keys.txt");
    int aesKeysStatus = hasAesKeys ? R.string.file_available : R.string.file_not_available;
    setSummaryValue("games.aes_key", String.format(getString(aesKeysStatus), "aes_keys.txt"));

    boolean hasSeedDb = FileUtils.exists(FileUtils.getPrivatePath()+"/sysdata/seeddb.bin");
    int seedDbStatus = hasSeedDb ? R.string.file_available : R.string.file_not_available;
    setSummaryValue("games.seed_db", String.format(getString(seedDbStatus), "seeddb.bin"));
}
/** Unregisters the file picker launcher, if one is registered, and drops the reference to it. */
@Override
public void onDestroy() {
    super.onDestroy();

    if (pickFileRequest == null) {
        return;
    }
    pickFileRequest.unregister();
    pickFileRequest = null;
}
/**
 * Handles the document chosen by the file picker.
 * Only files named "aes_keys.txt" or "seeddb.bin" that are smaller than 256 KiB are accepted;
 * they are copied into the private "sysdata" folder, replacing any existing file of the same name.
 * Anything else results in an "invalid file" toast. A null result is ignored.
 */
@Override
public void onActivityResult(Uri result) {
    if (result == null) {
        return;
    }

    String path = result.toString();
    String name = FileUtils.getName(path);
    Log.w("File", path + " -> " + name);

    boolean isKnownFile = "aes_keys.txt".equals(name) || "seeddb.bin".equals(name);
    if (isKnownFile && FileUtils.getLength(path) < 1024 * 256) {
        String sysdataFolder = FileUtils.getPrivatePath() + "/sysdata";
        if (!FileUtils.exists(sysdataFolder)) {
            FileUtils.createDir(FileUtils.getPrivatePath(), "sysdata");
        }

        // Remove a previously imported copy so the new file takes its place
        String destination = sysdataFolder + "/" + name;
        if (FileUtils.exists(destination)) {
            FileUtils.delete(destination);
        }

        FileUtils.copyFile(path, FileUtils.getPrivatePath() + "/sysdata/", name);
        Toast.makeText(getActivity(), String.format(getString(R.string.file_imported), name), Toast.LENGTH_LONG).show();
    } else {
        // Unknown file name, or a known name but too large
        Toast.makeText(getActivity(), R.string.invalid_file, Toast.LENGTH_LONG).show();
    }

    refresh();
}
}

View file

@ -23,7 +23,7 @@ public class ScreenEditorPreference extends Fragment {
@Override
public View onCreateView(@NonNull LayoutInflater inflater, @Nullable ViewGroup container, @Nullable Bundle savedInstanceState) {
layout = new LinearLayout(container.getContext());
layout.setSystemUiVisibility(View.SYSTEM_UI_FLAG_HIDE_NAVIGATION|View.SYSTEM_UI_FLAG_FULLSCREEN|View.SYSTEM_UI_FLAG_IMMERSIVE);
layout.setSystemUiVisibility(View.SYSTEM_UI_FLAG_FULLSCREEN|View.SYSTEM_UI_FLAG_IMMERSIVE);
return layout;
}

View file

@ -95,7 +95,7 @@ public class AppDataDocumentProvider extends DocumentsProvider {
private void includeFile(MatrixCursor cursor, File file) {
int flags = 0;
if (file.isDirectory()) {
flags = Document.FLAG_DIR_SUPPORTS_CREATE;
flags = Document.FLAG_DIR_SUPPORTS_CREATE | Document.FLAG_SUPPORTS_DELETE;
} else {
flags = Document.FLAG_SUPPORTS_WRITE | Document.FLAG_SUPPORTS_REMOVE | Document.FLAG_SUPPORTS_DELETE;
}

View file

@ -22,6 +22,7 @@ public class GlobalConfig {
public static DataModel data;
public static final Key<Boolean> KEY_SHADER_JIT = new Key<>("emu.shader_jit", true);
public static final Key<Boolean> KEY_ACCURATE_SHADER_MULTIPLY = new Key<>("emu.accurate_shader_mul", false);
public static final Key<Boolean> KEY_PICTURE_IN_PICTURE = new Key<>("app.behavior.pictureInPicture", false);
public static final Key<Boolean> KEY_SHOW_PERFORMANCE_OVERLAY = new Key<>("dev.performanceOverlay", false);
public static final Key<Boolean> KEY_LOGGER_SERVICE = new Key<>("dev.loggerService", false);

View file

@ -0,0 +1,31 @@
package com.panda3ds.pandroid.math;
/** Mutable quaternion with public float components x, y, z, w. */
public class Quaternion {
    public float x, y, z, w;

    public Quaternion(float x, float y, float z, float w) {
        this.x = x;
        this.y = y;
        this.z = z;
        this.w = w;
    }

    /**
     * Overwrites this quaternion with the rotation described by the given Euler angles.
     * The components of {@code euler} are fed to Math.cos / Math.sin, so they are in radians.
     *
     * @param euler per-axis rotation angles
     * @return this quaternion, to allow chaining
     */
    public Quaternion fromEuler(Vector3 euler) {
        // Half angles, computed in double precision like the trigonometry below
        double halfX = euler.x / 2.0;
        double halfY = euler.y / 2.0;
        double halfZ = euler.z / 2.0;

        double cosX = Math.cos(halfX);
        double cosY = Math.cos(halfY);
        double cosZ = Math.cos(halfZ);

        double sinX = Math.sin(halfX);
        double sinY = Math.sin(halfY);
        double sinZ = Math.sin(halfZ);

        this.x = (float) (sinX * cosY * cosZ + cosX * sinY * sinZ);
        this.y = (float) (cosX * sinY * cosZ - sinX * cosY * sinZ);
        this.z = (float) (cosX * cosY * sinZ + sinX * sinY * cosZ);
        this.w = (float) (cosX * cosY * cosZ - sinX * sinY * sinZ);
        return this;
    }
}

View file

@ -0,0 +1,32 @@
package com.panda3ds.pandroid.math;
/** Mutable 3-component float vector. */
public class Vector3 {
    // Quaternion instance reused by rotateByEuler; it is overwritten on every call
    private final Quaternion quaternion = new Quaternion(0, 0, 0, 0);
    public float x, y, z;

    public Vector3(float x, float y, float z) {
        this.x = x;
        this.y = y;
        this.z = z;
    }

    /**
     * Rotates this vector in place by the rotation described by the given Euler angles.
     * The angles are converted with {@link Quaternion#fromEuler}, then the vector is
     * transformed as q * v * conjugate(q).
     *
     * @param euler per-axis rotation angles, passed unchanged to Quaternion.fromEuler
     * @return this vector, to allow chaining
     */
    public Vector3 rotateByEuler(Vector3 euler) {
        this.quaternion.fromEuler(euler);

        float x = this.x, y = this.y, z = this.z;
        float qx = this.quaternion.x;
        float qy = this.quaternion.y;
        float qz = this.quaternion.z;
        float qw = this.quaternion.w;

        // First product: q * v, treating v as a quaternion with w = 0
        float ix = qw * x + qy * z - qz * y;
        float iy = qw * y + qz * x - qx * z;
        float iz = qw * z + qx * y - qy * x;
        // Scalar part is the negated dot product of q's vector part with v.
        // This used to read "-qx * x - qy * qz * z", which dropped the y term and
        // multiplied qy by qz, shrinking the result whenever the rotation was not the identity.
        float iw = -qx * x - qy * y - qz * z;

        // Second product: (q * v) * conjugate(q)
        this.x = ix * qw + iw * -qx + iy * -qz - iz * -qy;
        this.y = iy * qw + iw * -qy + iz * -qx - ix * -qz;
        this.z = iz * qw + iw * -qz + ix * -qy - iy * -qx;
        return this;
    }
}

View file

@ -230,6 +230,10 @@ public class FileUtils {
return parseFile(path).lastModified();
}
/** Returns the length reported by the document that {@code parseFile} resolves for the given path. */
public static long getLength(String path) {
    DocumentFile file = parseFile(path);
    return file.length();
}
public static String[] listFiles(String path) {
DocumentFile folder = parseFile(path);
DocumentFile[] files = folder.listFiles();

View file

@ -93,6 +93,7 @@ public class PandaGlRenderer implements GLSurfaceView.Renderer, ConsoleRenderer
AlberDriver.Initialize();
AlberDriver.setShaderJitEnabled(GlobalConfig.get(GlobalConfig.KEY_SHADER_JIT));
AlberDriver.setAccurateShaderMulEnable(GlobalConfig.get(GlobalConfig.KEY_ACCURATE_SHADER_MULTIPLY));
// If loading the ROM failed, display an error message and early exit
if (!AlberDriver.LoadRom(romPath)) {

View file

@ -90,4 +90,12 @@
<string name="behavior">Comportamento</string>
<string name="invalid_game">Jogo inválido</string>
<string name="tools">Ferramentas</string>
<string name="pref_accurate_shader_title">Multiplicação precisa de shader</string>
<string name="pref_accurate_shader_summary">Usar cálculos mais precisos para shaders</string>
<string name="pref_game_crypto_keys">Importar chaves</string>
<string name="file_available">%s disponível</string>
<string name="file_not_available">%s não disponível</string>
<string name="pref_game_seed_db_keys">Importar SeedDB</string>
<string name="invalid_file">Arquivo inválido</string>
<string name="file_imported">%s Importado</string>
</resources>

View file

@ -96,4 +96,12 @@
<string name="region_taiwan">Taiwan</string>
<string name="behavior">Behavior</string>
<string name="invalid_game">Invalid game</string>
<string name="pref_accurate_shader_title">Accurate shader multiplication</string>
<string name="pref_accurate_shader_summary">Can improve rendering at a small performance loss</string>
<string name="pref_game_crypto_keys">Import keys</string>
<string name="file_imported">%s imported</string>
<string name="file_available">%s available</string>
<string name="file_not_available">%s not available</string>
<string name="pref_game_seed_db_keys">Import SeedDB</string>
<string name="invalid_file">Invalid file</string>
</resources>

View file

@ -28,5 +28,11 @@
app:summary="@string/pref_shader_jit_summary"
app:iconSpaceReserved="false"/>
<SwitchPreferenceCompat
app:key="accurateShaderMul"
app:title="@string/pref_accurate_shader_title"
app:summary="@string/pref_accurate_shader_summary"
app:iconSpaceReserved="false"/>
</PreferenceCategory>
</PreferenceScreen>

View file

@ -23,6 +23,16 @@
app:title="@string/pref_game_folders"
app:summary="@string/pref_game_folders_summary"
app:iconSpaceReserved="false"/>
<Preference
android:key="games.aes_key"
app:title="@string/pref_game_crypto_keys"
app:summary="@string/pref_game_crypto_keys"
app:iconSpaceReserved="false"/>
<Preference
android:key="games.seed_db"
app:title="@string/pref_game_seed_db_keys"
app:summary="@string/pref_game_crypto_keys"
app:iconSpaceReserved="false"/>
</PreferenceCategory>
<PreferenceCategory
app:title="@string/behavior"

View file

@ -0,0 +1,288 @@
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#include "gl/stream_buffer.h"
#include <array>
#include <cstdio>
#include "align.hpp"
// Records the GL target, buffer object name and size. The buffer object is deleted by the destructor.
OpenGLStreamBuffer::OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size)
	: m_target(target), m_buffer_id(buffer_id), m_size(size) {}

// Deletes the GL buffer object handed to the constructor.
OpenGLStreamBuffer::~OpenGLStreamBuffer() {
	glDeleteBuffers(1, &m_buffer_id);
}

// Binds the buffer to its target.
void OpenGLStreamBuffer::Bind() {
	glBindBuffer(m_target, m_buffer_id);
}

// Binds buffer 0 to the target, i.e. unbinds whatever is currently bound there.
void OpenGLStreamBuffer::Unbind() {
	glBindBuffer(m_target, 0);
}
// Attaches a debug label to the GL buffer object. Compiled to a no-op unless GPU_DEBUG_INFO is defined,
// and skipped at runtime when glObjectLabel is not available.
void OpenGLStreamBuffer::SetDebugName(std::string_view name) {
#ifdef GPU_DEBUG_INFO
	if (!glObjectLabel) {
		return;
	}

	const auto labelLength = static_cast<GLsizei>(name.length());
	const auto* label = static_cast<const GLchar*>(name.data());
	glObjectLabel(GL_BUFFER, GetGLBufferId(), labelLength, label);
#endif
}
namespace {
// Uses glBufferSubData() to update. Preferred for drivers which don't support {ARB,EXT}_buffer_storage.
class BufferSubDataStreamBuffer final : public OpenGLStreamBuffer {
public:
~BufferSubDataStreamBuffer() override { Common::alignedFree(m_cpu_buffer); }
MappingResult Map(u32 alignment, u32 min_size) override { return MappingResult{static_cast<void*>(m_cpu_buffer), 0, 0, m_size / alignment}; }
u32 Unmap(u32 used_size) override {
if (used_size == 0) return 0;
glBindBuffer(m_target, m_buffer_id);
glBufferSubData(m_target, 0, used_size, m_cpu_buffer);
return 0;
}
u32 GetChunkSize() const override { return m_size; }
static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size) {
glGetError();
GLuint buffer_id;
glGenBuffers(1, &buffer_id);
glBindBuffer(target, buffer_id);
glBufferData(target, size, nullptr, GL_STREAM_DRAW);
GLenum err = glGetError();
if (err != GL_NO_ERROR) {
glBindBuffer(target, 0);
glDeleteBuffers(1, &buffer_id);
return {};
}
return std::unique_ptr<OpenGLStreamBuffer>(new BufferSubDataStreamBuffer(target, buffer_id, size));
}
private:
BufferSubDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : OpenGLStreamBuffer(target, buffer_id, size) {
m_cpu_buffer = static_cast<u8*>(Common::alignedMalloc(size, 32));
if (!m_cpu_buffer) Panic("Failed to allocate CPU storage for GL buffer");
}
u8* m_cpu_buffer;
};
// Uses BufferData() to orphan the buffer after every update. Used on Mali where BufferSubData forces a sync.
class BufferDataStreamBuffer final : public OpenGLStreamBuffer {
public:
~BufferDataStreamBuffer() override { Common::alignedFree(m_cpu_buffer); }
MappingResult Map(u32 alignment, u32 min_size) override { return MappingResult{static_cast<void*>(m_cpu_buffer), 0, 0, m_size / alignment}; }
u32 Unmap(u32 used_size) override {
if (used_size == 0) return 0;
glBindBuffer(m_target, m_buffer_id);
glBufferData(m_target, used_size, m_cpu_buffer, GL_STREAM_DRAW);
return 0;
}
u32 GetChunkSize() const override { return m_size; }
static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size) {
glGetError();
GLuint buffer_id;
glGenBuffers(1, &buffer_id);
glBindBuffer(target, buffer_id);
glBufferData(target, size, nullptr, GL_STREAM_DRAW);
GLenum err = glGetError();
if (err != GL_NO_ERROR) {
glBindBuffer(target, 0);
glDeleteBuffers(1, &buffer_id);
return {};
}
return std::unique_ptr<OpenGLStreamBuffer>(new BufferDataStreamBuffer(target, buffer_id, size));
}
private:
BufferDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : OpenGLStreamBuffer(target, buffer_id, size) {
m_cpu_buffer = static_cast<u8*>(Common::alignedMalloc(size, 32));
if (!m_cpu_buffer) Panic("Failed to allocate CPU storage for GL buffer");
}
u8* m_cpu_buffer;
};
// Base class for implementations which require syncing.
// The buffer is divided into NUM_SYNC_POINTS blocks of m_bytes_per_block bytes each, with one GL fence slot per block.
// m_used_block_index is the next block that still needs a fence inserted for the writes made into it;
// m_available_block_index is the first block whose fence has not been waited on yet.
class SyncingStreamBuffer : public OpenGLStreamBuffer {
public:
// Number of blocks / fence slots the buffer is split into
enum : u32 { NUM_SYNC_POINTS = 16 };
// Deletes the sync objects in slots [m_available_block_index, m_used_block_index]
virtual ~SyncingStreamBuffer() override {
for (u32 i = m_available_block_index; i <= m_used_block_index; i++) {
glDeleteSync(m_sync_objects[i]);
}
}
protected:
// Block size is size / NUM_SYNC_POINTS, rounded up
SyncingStreamBuffer(GLenum target, GLuint buffer_id, u32 size)
: OpenGLStreamBuffer(target, buffer_id, size), m_bytes_per_block((size + (NUM_SYNC_POINTS)-1) / NUM_SYNC_POINTS) {}
// Index of the block that contains the given byte offset
ALWAYS_INLINE u32 GetSyncIndexForOffset(u32 offset) { return offset / m_bytes_per_block; }
// Inserts a fence for every block from m_used_block_index up to (but not including) the block containing `offset`,
// advancing m_used_block_index as it goes. Warns if a slot already holds a fence.
ALWAYS_INLINE void AddSyncsForOffset(u32 offset) {
const u32 end = GetSyncIndexForOffset(offset);
for (; m_used_block_index < end; m_used_block_index++) {
if (m_sync_objects[m_used_block_index]) {
Helpers::warn("GL stream buffer: Fence slot we're trying to insert is already in use");
}
m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
}
}
// Waits on the fence with no timeout limit (GL_TIMEOUT_IGNORED), then deletes it and clears the slot
ALWAYS_INLINE void WaitForSync(GLsync& sync) {
glClientWaitSync(sync, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
glDeleteSync(sync);
sync = nullptr;
}
// Waits on the fences of every block from m_available_block_index through the block containing `offset`
// (capped at NUM_SYNC_POINTS), advancing m_available_block_index. Warns if a slot holds no fence.
ALWAYS_INLINE void EnsureSyncsWaitedForOffset(u32 offset) {
const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);
for (; m_available_block_index < end; m_available_block_index++) {
if (!m_sync_objects[m_available_block_index]) [[unlikely]] {
Helpers::warn("GL stream buffer: Fence slot we're trying to wait on is not in use");
}
WaitForSync(m_sync_objects[m_available_block_index]);
}
}
// Makes `size` bytes starting at m_position safe to write, wrapping m_position back to 0
// when the request would run past the end of the buffer.
void AllocateSpace(u32 size) {
// add sync objects for writes since the last allocation
AddSyncsForOffset(m_position);
// wait for sync objects for the space we want to use
EnsureSyncsWaitedForOffset(m_position + size);
// wrap-around?
if ((m_position + size) > m_size) {
// current position ... buffer end
AddSyncsForOffset(m_size);
// rewind, and try again
m_position = 0;
// wait for the sync at the start of the buffer
WaitForSync(m_sync_objects[0]);
m_available_block_index = 1;
// and however much more we need to satisfy the allocation
EnsureSyncsWaitedForOffset(size);
m_used_block_index = 0;
}
}
// Nominal size of one block (m_size / NUM_SYNC_POINTS, rounded down)
u32 GetChunkSize() const override { return m_size / NUM_SYNC_POINTS; }
// Current write position in bytes from the start of the buffer
u32 m_position = 0;
u32 m_used_block_index = 0;
// Starts at NUM_SYNC_POINTS: initially no block has a fence that needs waiting on
u32 m_available_block_index = NUM_SYNC_POINTS;
u32 m_bytes_per_block;
// One fence slot per block; value-initialized, so every slot starts out null
std::array<GLsync, NUM_SYNC_POINTS> m_sync_objects{};
};
// Stream buffer backed by immutable buffer storage (glBufferStorage / glBufferStorageEXT) that is mapped
// once with GL_MAP_PERSISTENT_BIT in Create() and unmapped in the destructor. Writes go straight through
// m_mapped_ptr; fencing is handled by SyncingStreamBuffer.
class BufferStorageStreamBuffer : public SyncingStreamBuffer {
  public:
	// Unmaps the persistent mapping. The buffer is bound to its target for the unmap and unbound afterwards.
	~BufferStorageStreamBuffer() override {
		glBindBuffer(m_target, m_buffer_id);
		glUnmapBuffer(m_target);
		glBindBuffer(m_target, 0);
	}

	// Reserves at least `min_size` bytes at an offset aligned to `alignment` and returns the write pointer,
	// the byte offset, the offset in units of `alignment`, and the space left (in units of `alignment`)
	// up to the end of the blocks that have been waited on.
	MappingResult Map(u32 alignment, u32 min_size) override {
		if (m_position > 0) m_position = Common::alignUp(m_position, alignment);

		AllocateSpace(min_size);
		if ((m_position + min_size) > (m_available_block_index * m_bytes_per_block)) [[unlikely]] {
			Helpers::panic("GL stream buffer: Invalid size passed to Map");
		}

		const u32 free_space_in_block = ((m_available_block_index * m_bytes_per_block) - m_position);
		return MappingResult{static_cast<void*>(m_mapped_ptr + m_position), m_position, m_position / alignment, free_space_in_block / alignment};
	}

	// Commits `used_size` bytes written since the last Map(). For non-coherent mappings the written range
	// is flushed explicitly. Returns the offset at which the committed data starts.
	u32 Unmap(u32 used_size) override {
		if ((m_position + used_size) > m_size) [[unlikely]] {
			Helpers::panic("GL stream buffer: Invalid size passed to Unmap");
		}

		if (!m_coherent) {
			// Use the direct-state-access entry point when available, otherwise bind and flush via the target.
			if (GLAD_GL_VERSION_4_5 || GLAD_GL_ARB_direct_state_access) {
				glFlushMappedNamedBufferRange(m_buffer_id, m_position, used_size);
			} else {
				Bind();
				glFlushMappedBufferRange(m_target, m_position, used_size);
			}
		}

		const u32 prev_position = m_position;
		m_position += used_size;
		return prev_position;
	}

	// Creates the buffer, allocates `size` bytes of immutable storage and maps it persistently.
	// `coherent` selects GL_MAP_COHERENT_BIT storage; otherwise the mapping uses GL_MAP_FLUSH_EXPLICIT_BIT.
	// Returns an empty pointer if storage allocation reports a GL error or the mapping comes back null,
	// so the caller can fall back to another implementation.
	static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size, bool coherent = true) {
		// Result intentionally ignored; presumably clears a stale error flag before the check below.
		glGetError();

		GLuint buffer_id;
		glGenBuffers(1, &buffer_id);
		glBindBuffer(target, buffer_id);

		const u32 flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
		const u32 map_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT);
		if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage)
			glBufferStorage(target, size, nullptr, flags);
		else if (GLAD_GL_EXT_buffer_storage)
			glBufferStorageEXT(target, size, nullptr, flags);

		GLenum err = glGetError();
		if (err != GL_NO_ERROR) {
			glBindBuffer(target, 0);
			glDeleteBuffers(1, &buffer_id);
			return {};
		}

		u8* mapped_ptr = static_cast<u8*>(glMapBufferRange(target, 0, size, map_flags));
		AssertMsg(mapped_ptr, "Persistent buffer was mapped");
		if (!mapped_ptr) {
			// NOTE(review): AssertMsg's failure behaviour is defined outside this file section. If it
			// does not abort, report failure the same way as above instead of handing out a null mapping.
			glBindBuffer(target, 0);
			glDeleteBuffers(1, &buffer_id);
			return {};
		}

		return std::unique_ptr<OpenGLStreamBuffer>(new BufferStorageStreamBuffer(target, buffer_id, size, mapped_ptr, coherent));
	}

  private:
	BufferStorageStreamBuffer(GLenum target, GLuint buffer_id, u32 size, u8* mapped_ptr, bool coherent)
		: SyncingStreamBuffer(target, buffer_id, size), m_mapped_ptr(mapped_ptr), m_coherent(coherent) {}

	u8* m_mapped_ptr;  // base of the persistent mapping (offset 0 of the buffer)
	bool m_coherent;   // false => written ranges must be flushed explicitly in Unmap()
};
} // namespace
// Picks a stream buffer implementation for `target` with `size` bytes.
// A persistently mapped buffer-storage implementation is tried first when glad reports GL 4.4,
// ARB_buffer_storage or EXT_buffer_storage; if that is unavailable or creation returns an empty
// pointer, the glBufferData-based implementation is used.
std::unique_ptr<OpenGLStreamBuffer> OpenGLStreamBuffer::Create(GLenum target, u32 size) {
	const bool has_buffer_storage = GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage;
	if (has_buffer_storage) {
		std::unique_ptr<OpenGLStreamBuffer> persistent = BufferStorageStreamBuffer::Create(target, size);
		if (persistent) {
			return persistent;
		}
	}

	// BufferSubData is slower on all drivers except NVIDIA...
#if 0
	const char* vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
	if (std::strcmp(vendor, "ARM") == 0 || std::strcmp(vendor, "Qualcomm") == 0) {
		// Mali and Adreno drivers can't do sub-buffer tracking...
		return BufferDataStreamBuffer::Create(target, size);
	}

	return BufferSubDataStreamBuffer::Create(target, size);
#else
	return BufferDataStreamBuffer::Create(target, size);
#endif
}

View file

@ -0,0 +1,53 @@
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#pragma once
#include <glad/gl.h>
// Comment to avoid clang-format reordering the glad header
#include <memory>
#include <string_view>
#include <tuple>
#include <vector>
#include "duckstation_compat.h"
#include "helpers.hpp"
// Abstract interface for a GL buffer that is written through Map()/Unmap() pairs.
// Instances are obtained from the static Create() factory; the constructor is protected.
class OpenGLStreamBuffer {
  public:
	virtual ~OpenGLStreamBuffer();

	// Plain accessors for the values passed to the constructor.
	ALWAYS_INLINE GLuint GetGLBufferId() const {
		return m_buffer_id;
	}
	ALWAYS_INLINE GLenum GetGLTarget() const {
		return m_target;
	}
	ALWAYS_INLINE u32 GetSize() const {
		return m_size;
	}

	// Defined out of line; presumably bind/unbind the buffer on its target.
	void Bind();
	void Unbind();

	void SetDebugName(std::string_view name);

	// Result of Map(): where to write and how much room is available.
	struct MappingResult {
		void* pointer;      // write pointer
		u32 buffer_offset;  // byte offset of `pointer` within the buffer
		u32 index_aligned;  // offset / alignment, suitable for base vertex
		u32 space_aligned;  // remaining space / alignment
	};

	// Reserves at least `min_size` bytes at an offset aligned to `alignment`.
	virtual MappingResult Map(u32 alignment, u32 min_size) = 0;

	/// Returns the position in the buffer *before* the start of used_size.
	virtual u32 Unmap(u32 used_size) = 0;

	/// Returns the minimum granularity of blocks which sync objects will be created around.
	virtual u32 GetChunkSize() const = 0;

	// Factory: returns a concrete implementation for `target` holding `size` bytes.
	static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size);

  protected:
	OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size);

	GLenum m_target;
	GLuint m_buffer_id;
	u32 m_size;
};

1
third_party/fmt vendored Submodule

@ -0,0 +1 @@
Subproject commit f8581bcecf317e8753887b68187c9ef1ba0524f4