Mirror of https://github.com/wheremyfoodat/Panda3DS.git (synced 2025-06-03 12:27:21 +12:00)

Merge branch 'master' into metal2
Commit 02f8250aff: 69 changed files with 2906 additions and 319 deletions
include/PICA/draw_acceleration.hpp (new file, 45 lines)

@@ -0,0 +1,45 @@
#pragma once

#include <array>

#include "helpers.hpp"

namespace PICA {
    struct DrawAcceleration {
        static constexpr u32 maxAttribCount = 16;
        static constexpr u32 maxLoaderCount = 12;

        struct AttributeInfo {
            u32 offset;
            u32 stride;

            u8 type;
            u8 componentCount;

            std::array<float, 4> fixedValue;  // For fixed attributes
        };

        struct Loader {
            // Data to upload for this loader
            u8* data;
            usize size;
        };

        u8* indexBuffer;

        // Minimum and maximum index in the index buffer for a draw call
        u16 minimumIndex, maximumIndex;
        u32 totalAttribCount;
        u32 totalLoaderCount;
        u32 enabledAttributeMask;
        u32 fixedAttributes;
        u32 vertexDataSize;

        std::array<AttributeInfo, maxAttribCount> attributeInfo;
        std::array<Loader, maxLoaderCount> loaders;

        bool canBeAccelerated;
        bool indexed;
        bool useShortIndices;
    };
} // namespace PICA
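For context, a minimal sketch of how a backend could walk a filled-in DrawAcceleration; the two helper functions are assumed placeholders for illustration, not part of this commit:

// Hypothetical sketch only: uploadVertexData() and setFixedAttr() stand in for
// whatever the real backend does with the data described by DrawAcceleration.
#include "PICA/draw_acceleration.hpp"

void uploadVertexData(const u8* data, usize size);                 // assumed helper
void setFixedAttr(u32 index, const std::array<float, 4>& value);   // assumed helper

void consumeDraw(const PICA::DrawAcceleration& accel) {
    // Each loader points at a contiguous chunk of guest vertex data to upload
    for (u32 i = 0; i < accel.totalLoaderCount; i++) {
        const auto& loader = accel.loaders[i];
        uploadVertexData(loader.data, loader.size);
    }

    // Attributes flagged in fixedAttributes carry a constant vec4 instead of per-vertex data
    for (u32 i = 0; i < accel.totalAttribCount; i++) {
        if (accel.fixedAttributes & (1u << i)) {
            setFixedAttr(i, accel.attributeInfo[i].fixedValue);
        }
    }
}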
@@ -2,7 +2,7 @@
#include "helpers.hpp"
#include "vertex_loader_rec.hpp"

// Common file for our PICA JITs (From vertex config -> CPU assembly and from PICA shader -> CPU assembly)
// Common file for our PICA JITs (From PICA shader -> CPU assembly)

namespace Dynapica {
#ifdef PANDA3DS_DYNAPICA_SUPPORTED
@@ -1,6 +1,7 @@
#pragma once
#include <array>

#include "PICA/draw_acceleration.hpp"
#include "PICA/dynapica/shader_rec.hpp"
#include "PICA/float_types.hpp"
#include "PICA/pica_vertex.hpp"

@@ -13,6 +14,12 @@
#include "memory.hpp"
#include "renderer.hpp"

enum class ShaderExecMode {
    Interpreter,  // Interpret shaders on the CPU
    JIT,          // Recompile shaders to CPU machine code
    Hardware,     // Recompile shaders to host shaders and run them on the GPU
};

class GPU {
    static constexpr u32 regNum = 0x300;
    static constexpr u32 extRegNum = 0x1000;

@@ -45,7 +52,7 @@ class GPU {
    uint immediateModeVertIndex;
    uint immediateModeAttrIndex;  // Index of the immediate mode attribute we're uploading

    template <bool indexed, bool useShaderJIT>
    template <bool indexed, ShaderExecMode mode>
    void drawArrays();

    // Silly method of avoiding linking problems. TODO: Change to something less silly
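To illustrate how a template parameter like ShaderExecMode can replace the old boolean, here is a minimal sketch of dispatching on the mode with if constexpr; it is not the actual drawArrays implementation, just the dispatch pattern:

// Sketch only: compile-time dispatch on ShaderExecMode. drawArraysSketch is an
// illustrative stand-in, not the emulator's real drawArrays.
#include "PICA/gpu.hpp"

template <bool indexed, ShaderExecMode mode>
void drawArraysSketch() {
    if constexpr (mode == ShaderExecMode::Interpreter) {
        // Interpret the PICA vertex shader on the CPU for every vertex
    } else if constexpr (mode == ShaderExecMode::JIT) {
        // Call into shader code recompiled to host machine code
    } else {
        // ShaderExecMode::Hardware: the host GPU runs a generated vertex shader,
        // so the CPU only needs to hand over raw attribute/index data
    }
}

// A caller would pick the instantiation once per draw, e.g.:
// drawArraysSketch<true, ShaderExecMode::Hardware>();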
@@ -81,6 +88,7 @@ class GPU {
    std::unique_ptr<Renderer> renderer;
    PICA::Vertex getImmediateModeVertex();

    void getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed);
  public:
    // 256 entries per LUT with each LUT as its own row forming a 2D image 256 * LUT_COUNT
    // Encoded in PICA native format

@@ -17,6 +17,7 @@ namespace PICA {
        // enable == off means a CompareFunction of Always
        BitField<0, 3, CompareFunction> alphaTestFunction;
        BitField<3, 1, u32> depthMapEnable;
        BitField<4, 4, LogicOpMode> logicOpMode;
    };
};

@@ -214,6 +215,10 @@ namespace PICA {
        (alphaTestConfig & 1) ? static_cast<PICA::CompareFunction>(alphaTestFunction) : PICA::CompareFunction::Always;
    outConfig.depthMapEnable = regs[InternalRegs::DepthmapEnable] & 1;

    // Shows if blending is enabled. If it is not enabled, then logic ops are enabled instead
    const bool blendingEnabled = (regs[InternalRegs::ColourOperation] & (1 << 8)) != 0;
    outConfig.logicOpMode = blendingEnabled ? LogicOpMode::Copy : LogicOpMode(Helpers::getBits<0, 4>(regs[InternalRegs::LogicOp]));

    texConfig.texUnitConfig = regs[InternalRegs::TexUnitCfg];
    texConfig.texEnvUpdateBuffer = regs[InternalRegs::TexEnvUpdateBuffer];
include/PICA/pica_simd.hpp (new file, 274 lines)

@@ -0,0 +1,274 @@
#pragma once
#include <algorithm>
#include <limits>
#include <utility>

#include "helpers.hpp"

#if defined(_M_AMD64) || defined(__x86_64__)
#define PICA_SIMD_X64
#include <immintrin.h>
#elif defined(_M_ARM64) || defined(__aarch64__)
#define PICA_SIMD_ARM64
#include <arm_neon.h>
#endif

// Optimized functions for analyzing PICA index buffers (Finding minimum and maximum index values inside them)
namespace PICA::IndexBuffer {
    // Non-SIMD, portable algorithm
    template <bool useShortIndices>
    std::pair<u16, u16> analyzePortable(u8* indexBuffer, u32 vertexCount) {
        u16 minimumIndex = std::numeric_limits<u16>::max();
        u16 maximumIndex = 0;

        // Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them
        if constexpr (useShortIndices) {
            u16* indexBuffer16 = reinterpret_cast<u16*>(indexBuffer);

            for (u32 i = 0; i < vertexCount; i++) {
                u16 index = indexBuffer16[i];
                minimumIndex = std::min(minimumIndex, index);
                maximumIndex = std::max(maximumIndex, index);
            }
        } else {
            for (u32 i = 0; i < vertexCount; i++) {
                u16 index = u16(indexBuffer[i]);
                minimumIndex = std::min(minimumIndex, index);
                maximumIndex = std::max(maximumIndex, index);
            }
        }

        return {minimumIndex, maximumIndex};
    }

#ifdef PICA_SIMD_ARM64
    template <bool useShortIndices>
    std::pair<u16, u16> analyzeNEON(u8* indexBuffer, u32 vertexCount) {
        // We process 16 bytes per iteration, which is 8 vertices if we're using u16 indices or 16 vertices if we're using u8 indices
        constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16;

        if (vertexCount < vertsPerLoop) {
            return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
        }

        u16 minimumIndex, maximumIndex;

        if constexpr (useShortIndices) {
            // 16-bit indices
            uint16x8_t minima = vdupq_n_u16(0xffff);
            uint16x8_t maxima = vdupq_n_u16(0);

            while (vertexCount >= vertsPerLoop) {
                const uint16x8_t data = vld1q_u16(reinterpret_cast<u16*>(indexBuffer));
                minima = vminq_u16(data, minima);
                maxima = vmaxq_u16(data, maxima);

                indexBuffer += 16;
                vertexCount -= vertsPerLoop;
            }

            // Do horizontal min/max operations to get the actual minimum and maximum from all the vertices we processed with SIMD
            // We want to gather the actual minimum and maximum in the bottom lane of the minima/maxima vectors
            // uint16x4_t foldedMinima1 = vmin_u16(vget_high_u16(minima), vget_low_u16(minima));
            // uint16x4_t foldedMaxima1 = vmax_u16(vget_high_u16(maxima), vget_low_u16(maxima));

            uint16x8_t foldedMinima1 = vpminq_u16(minima, minima);
            uint16x8_t foldedMinima2 = vpminq_u16(foldedMinima1, foldedMinima1);
            uint16x8_t foldedMinima3 = vpminq_u16(foldedMinima2, foldedMinima2);

            uint16x8_t foldedMaxima1 = vpmaxq_u16(maxima, maxima);
            uint16x8_t foldedMaxima2 = vpmaxq_u16(foldedMaxima1, foldedMaxima1);
            uint16x8_t foldedMaxima3 = vpmaxq_u16(foldedMaxima2, foldedMaxima2);

            minimumIndex = vgetq_lane_u16(foldedMinima3, 0);
            maximumIndex = vgetq_lane_u16(foldedMaxima3, 0);
        } else {
            // 8-bit indices
            uint8x16_t minima = vdupq_n_u8(0xff);
            uint8x16_t maxima = vdupq_n_u8(0);

            while (vertexCount >= vertsPerLoop) {
                uint8x16_t data = vld1q_u8(indexBuffer);
                minima = vminq_u8(data, minima);
                maxima = vmaxq_u8(data, maxima);

                indexBuffer += 16;
                vertexCount -= vertsPerLoop;
            }

            // Do a similar horizontal min/max as in the u16 case, except now we're working on uint8x16 instead of uint16x8 so we need 4 folds
            uint8x16_t foldedMinima1 = vpminq_u8(minima, minima);
            uint8x16_t foldedMinima2 = vpminq_u8(foldedMinima1, foldedMinima1);
            uint8x16_t foldedMinima3 = vpminq_u8(foldedMinima2, foldedMinima2);
            uint8x16_t foldedMinima4 = vpminq_u8(foldedMinima3, foldedMinima3);

            uint8x16_t foldedMaxima1 = vpmaxq_u8(maxima, maxima);
            uint8x16_t foldedMaxima2 = vpmaxq_u8(foldedMaxima1, foldedMaxima1);
            uint8x16_t foldedMaxima3 = vpmaxq_u8(foldedMaxima2, foldedMaxima2);
            uint8x16_t foldedMaxima4 = vpmaxq_u8(foldedMaxima3, foldedMaxima3);

            minimumIndex = u16(vgetq_lane_u8(foldedMinima4, 0));
            maximumIndex = u16(vgetq_lane_u8(foldedMaxima4, 0));
        }

        // If any indices could not be processed because the buffer size is not 16-byte aligned, process them the naive way
        // Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them
        while (vertexCount > 0) {
            if constexpr (useShortIndices) {
                u16 index = *reinterpret_cast<u16*>(indexBuffer);
                minimumIndex = std::min(minimumIndex, index);
                maximumIndex = std::max(maximumIndex, index);
                indexBuffer += 2;
            } else {
                u16 index = u16(*indexBuffer++);
                minimumIndex = std::min(minimumIndex, index);
                maximumIndex = std::max(maximumIndex, index);
            }

            vertexCount -= 1;
        }

        return {minimumIndex, maximumIndex};
    }
#endif

#if defined(PICA_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
    template <bool useShortIndices>
    std::pair<u16, u16> analyzeSSE4_1(u8* indexBuffer, u32 vertexCount) {
        // We process 16 bytes per iteration, which is 8 vertices if we're using u16
        // indices or 16 vertices if we're using u8 indices
        constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16;

        if (vertexCount < vertsPerLoop) {
            return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
        }

        u16 minimumIndex, maximumIndex;

        if constexpr (useShortIndices) {
            // Calculate the horizontal minimum/maximum value across an SSE vector of 16-bit unsigned integers.
            // Based on https://stackoverflow.com/a/22259607
            auto horizontalMin16 = [](__m128i vector) -> u16 { return u16(_mm_cvtsi128_si32(_mm_minpos_epu16(vector))); };

            auto horizontalMax16 = [](__m128i vector) -> u16 {
                // We have an instruction to compute horizontal minimum but not maximum, so we use it.
                // To use it, we have to subtract each value from 0xFFFF (which we do with an xor), then execute a horizontal minimum
                __m128i flipped = _mm_xor_si128(vector, _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu));
                u16 min = u16(_mm_cvtsi128_si32(_mm_minpos_epu16(flipped)));
                return u16(min ^ 0xffff);
            };

            // 16-bit indices
            // Initialize the minima vector to all FFs (So 0xFFFF for each 16-bit lane)
            // And the maxima vector to all 0s (0 for each 16-bit lane)
            __m128i minima = _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu);
            __m128i maxima = _mm_set_epi32(0, 0, 0, 0);

            while (vertexCount >= vertsPerLoop) {
                const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(indexBuffer));
                minima = _mm_min_epu16(data, minima);
                maxima = _mm_max_epu16(data, maxima);

                indexBuffer += 16;
                vertexCount -= vertsPerLoop;
            }

            minimumIndex = u16(horizontalMin16(minima));
            maximumIndex = u16(horizontalMax16(maxima));
        } else {
            // Calculate the horizontal minimum/maximum value across an SSE vector of 8-bit unsigned integers.
            // Based on https://stackoverflow.com/a/22259607
            auto horizontalMin8 = [](__m128i vector) -> u8 {
                vector = _mm_min_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(3, 2, 3, 2)));
                vector = _mm_min_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(1, 1, 1, 1)));
                vector = _mm_min_epu8(vector, _mm_shufflelo_epi16(vector, _MM_SHUFFLE(1, 1, 1, 1)));
                vector = _mm_min_epu8(vector, _mm_srli_epi16(vector, 8));
                return u8(_mm_cvtsi128_si32(vector));
            };

            auto horizontalMax8 = [](__m128i vector) -> u8 {
                vector = _mm_max_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(3, 2, 3, 2)));
                vector = _mm_max_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(1, 1, 1, 1)));
                vector = _mm_max_epu8(vector, _mm_shufflelo_epi16(vector, _MM_SHUFFLE(1, 1, 1, 1)));
                vector = _mm_max_epu8(vector, _mm_srli_epi16(vector, 8));
                return u8(_mm_cvtsi128_si32(vector));
            };

            // 8-bit indices
            // Initialize the minima vector to all FFs (So 0xFF for each 8-bit lane)
            // And the maxima vector to all 0s (0 for each 8-bit lane)
            __m128i minima = _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu);
            __m128i maxima = _mm_set_epi32(0, 0, 0, 0);

            while (vertexCount >= vertsPerLoop) {
                const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(indexBuffer));
                minima = _mm_min_epu8(data, minima);
                maxima = _mm_max_epu8(data, maxima);

                indexBuffer += 16;
                vertexCount -= vertsPerLoop;
            }

            minimumIndex = u16(horizontalMin8(minima));
            maximumIndex = u16(horizontalMax8(maxima));
        }

        // If any indices could not be processed because the buffer size
        // is not 16-byte aligned, process them the naive way
        // Calculate the minimum and maximum indices used in the index
        // buffer, so we'll only upload them
        while (vertexCount > 0) {
            if constexpr (useShortIndices) {
                u16 index = *reinterpret_cast<u16*>(indexBuffer);
                minimumIndex = std::min(minimumIndex, index);
                maximumIndex = std::max(maximumIndex, index);
                indexBuffer += 2;
            } else {
                u16 index = u16(*indexBuffer++);
                minimumIndex = std::min(minimumIndex, index);
                maximumIndex = std::max(maximumIndex, index);
            }

            vertexCount -= 1;
        }

        return {minimumIndex, maximumIndex};
    }
#endif

    // Analyzes a PICA index buffer to get the minimum and maximum indices in the
    // buffer, and returns them in a pair in the form [min, max]. Takes a template
    // parameter to decide whether the indices in the buffer are u8 or u16
    template <bool useShortIndices>
    std::pair<u16, u16> analyze(u8* indexBuffer, u32 vertexCount) {
#if defined(PICA_SIMD_ARM64)
        return analyzeNEON<useShortIndices>(indexBuffer, vertexCount);
#elif defined(PICA_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
        // Annoyingly, MSVC refuses to define __SSE4_1__ even when we're building with AVX
        return analyzeSSE4_1<useShortIndices>(indexBuffer, vertexCount);
#else
        return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
#endif
    }

    // In some really unfortunate scenarios (eg Android Studio emulator), we don't have access to glDrawRangeElementsBaseVertex
    // So we need to subtract the base vertex index from every index in the index buffer ourselves
    // This is not really common, so we do it without SIMD for the moment, just to be able to run on Android Studio
    template <bool useShortIndices>
    void subtractBaseIndex(u8* indexBuffer, u32 indexCount, u16 baseIndex) {
        // Subtract the base index from every index in the buffer
        if constexpr (useShortIndices) {
            u16* indexBuffer16 = reinterpret_cast<u16*>(indexBuffer);

            for (u32 i = 0; i < indexCount; i++) {
                indexBuffer16[i] -= baseIndex;
            }
        } else {
            u8 baseIndex8 = u8(baseIndex);

            for (u32 i = 0; i < indexCount; i++) {
                indexBuffer[i] -= baseIndex8;
            }
        }
    }
} // namespace PICA::IndexBuffer
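A quick usage sketch of the analyzer above; the buffer contents here are made up purely for illustration:

// Illustrative only: feed a small hand-written index buffer through the analyzer.
#include <cstdio>

#include "PICA/pica_simd.hpp"

int main() {
    u8 indices[] = {7, 2, 9, 4, 9, 3};  // u8 indices, so useShortIndices = false
    auto [minIndex, maxIndex] = PICA::IndexBuffer::analyze<false>(indices, 6);

    // Prints "min = 2, max = 9"; a renderer would then upload only vertices [2, 9],
    // and may call subtractBaseIndex(indices, 6, minIndex) when base-vertex draws are unavailable
    std::printf("min = %u, max = %u\n", unsigned(minIndex), unsigned(maxIndex));
    return 0;
}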
include/PICA/pica_vert_config.hpp (new file, 57 lines)

@@ -0,0 +1,57 @@
#pragma once
#include <array>
#include <cassert>
#include <cstring>
#include <type_traits>
#include <unordered_map>

#include "PICA/pica_hash.hpp"
#include "PICA/regs.hpp"
#include "PICA/shader.hpp"
#include "bitfield.hpp"
#include "helpers.hpp"

namespace PICA {
    // Configuration struct used
    struct VertConfig {
        PICAHash::HashType shaderHash;
        PICAHash::HashType opdescHash;
        u32 entrypoint;

        // PICA registers for configuring shader output->fragment semantic mapping
        std::array<u32, 7> outmaps{};
        u16 outputMask;
        u8 outputCount;
        bool usingUbershader;

        // Pad to 56 bytes so that the compiler won't insert unnecessary padding, which in turn will affect our unordered_map lookup
        // As the padding will get hashed and memcmp'd...
        u32 pad{};

        bool operator==(const VertConfig& config) const {
            // Hash function and equality operator required by std::unordered_map
            return std::memcmp(this, &config, sizeof(VertConfig)) == 0;
        }

        VertConfig(PICAShader& shader, const std::array<u32, 0x300>& regs, bool usingUbershader) : usingUbershader(usingUbershader) {
            shaderHash = shader.getCodeHash();
            opdescHash = shader.getOpdescHash();
            entrypoint = shader.entrypoint;

            outputCount = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
            outputMask = regs[PICA::InternalRegs::VertexShaderOutputMask];
            for (int i = 0; i < outputCount; i++) {
                // Mask out unused bits
                outmaps[i] = regs[PICA::InternalRegs::ShaderOutmap0 + i] & 0x1F1F1F1F;
            }
        }
    };
} // namespace PICA

static_assert(sizeof(PICA::VertConfig) == 56);

// Override std::hash for our vertex config class
template <>
struct std::hash<PICA::VertConfig> {
    std::size_t operator()(const PICA::VertConfig& config) const noexcept { return PICAHash::computeHash((const char*)&config, sizeof(config)); }
};
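Because VertConfig hashes and compares over its raw bytes, it can key a cache directly; a minimal sketch of that pattern, with the cached value type being a placeholder rather than the renderer's real type:

// Sketch only: caching a compiled vertex shader per VertConfig.
// CompiledShader is a stand-in for whatever the backend actually stores.
#include <unordered_map>

#include "PICA/pica_vert_config.hpp"

struct CompiledShader {
    unsigned handle = 0;  // e.g. a host shader handle
};

std::unordered_map<PICA::VertConfig, CompiledShader> vertexShaderCache;

CompiledShader& lookup(PICAShader& shader, const std::array<u32, 0x300>& regs, bool usingUbershader) {
    PICA::VertConfig config(shader, regs, usingUbershader);

    // operator== does a memcmp over the full 56 bytes, which is why the struct pads itself explicitly
    return vertexShaderCache[config];
}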
@@ -396,6 +396,25 @@ namespace PICA {
        GreaterOrEqual = 7,
    };

    enum class LogicOpMode : u32 {
        Clear = 0,
        And = 1,
        ReverseAnd = 2,
        Copy = 3,
        Set = 4,
        InvertedCopy = 5,
        Nop = 6,
        Invert = 7,
        Nand = 8,
        Or = 9,
        Nor = 10,
        Xor = 11,
        Equiv = 12,
        InvertedAnd = 13,
        ReverseOr = 14,
        InvertedOr = 15,
    };

    enum class FogMode : u32 {
        Disabled = 0,
        Fog = 5,
@@ -107,6 +107,11 @@ class PICAShader {
    alignas(16) std::array<vec4f, 16> inputs;  // Attributes passed to the shader
    alignas(16) std::array<vec4f, 16> outputs;
    alignas(16) vec4f dummy = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()});  // Dummy register used by the JIT

    // We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
    // We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
    // Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
    using Hash = PICAHash::HashType;

  protected:
    std::array<u32, 128> operandDescriptors;

@@ -125,14 +130,13 @@ class PICAShader {
    std::array<CallInfo, 4> callInfo;
    ShaderType type;

    // We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
    // We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
    // Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
    using Hash = PICAHash::HashType;

    Hash lastCodeHash = 0;    // Last hash computed for the shader code (Used for the JIT caching mechanism)
    Hash lastOpdescHash = 0;  // Last hash computed for the operand descriptors (Also used for the JIT)

  public:
    bool uniformsDirty = false;

  protected:
    bool codeHashDirty = false;
    bool opdescHashDirty = false;

@@ -284,6 +288,7 @@ class PICAShader {
                uniform[2] = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16));
                uniform[3] = f24::fromRaw(floatUniformBuffer[0] >> 8);
            }
            uniformsDirty = true;
        }
    }

@@ -295,6 +300,12 @@ class PICAShader {
        u[1] = getBits<8, 8>(word);
        u[2] = getBits<16, 8>(word);
        u[3] = getBits<24, 8>(word);
        uniformsDirty = true;
    }

    void uploadBoolUniform(u32 value) {
        boolUniform = value;
        uniformsDirty = true;
    }

    void run();

@@ -302,6 +313,10 @@ class PICAShader {

    Hash getCodeHash();
    Hash getOpdescHash();

    // Returns how big the PICA uniforms are combined. Used for hw accelerated shaders where we upload the uniforms to our GPU.
    static constexpr usize totalUniformSize() { return sizeof(floatUniforms) + sizeof(intUniforms) + sizeof(boolUniform); }
    void* getUniformPointer() { return static_cast<void*>(&floatUniforms); }
};

static_assert(
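The totalUniformSize()/getUniformPointer() pair exists so the uniforms can be copied to the host GPU in one block when uniformsDirty is set. A minimal sketch of that idea, assuming the uniform arrays sit back to back in the class (as the getter implies) and using an assumed generic upload helper rather than any specific graphics API:

// Sketch only: how a backend might flush dirty PICA uniforms in a single copy.
// uploadToUniformBuffer() is an assumed helper, not part of the commit.
#include "PICA/shader.hpp"

void uploadToUniformBuffer(const void* data, usize size);  // assumed helper

void flushShaderUniforms(PICAShader& shader) {
    if (shader.uniformsDirty) {
        // One upload of totalUniformSize() bytes covers float, int and bool uniforms together
        uploadToUniformBuffer(shader.getUniformPointer(), PICAShader::totalUniformSize());
        shader.uniformsDirty = false;
    }
}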
@@ -1,8 +1,11 @@
#pragma once
#include <fmt/format.h>

#include <map>
#include <set>
#include <string>
#include <tuple>
#include <map>
#include <utility>
#include <vector>

#include "PICA/shader.hpp"

@@ -41,9 +44,12 @@ namespace PICA::ShaderGen {
        explicit Function(u32 start, u32 end) : start(start), end(end) {}
        bool operator<(const Function& other) const { return AddressRange(start, end) < AddressRange(other.start, other.end); }

        std::string getIdentifier() const { return "func_" + std::to_string(start) + "_to_" + std::to_string(end); }
        std::string getForwardDecl() const { return "void " + getIdentifier() + "();\n"; }
        std::string getCallStatement() const { return getIdentifier() + "()"; }
        std::string getIdentifier() const { return fmt::format("fn_{}_{}", start, end); }
        // To handle weird control flow, we have to return from each function a bool that indicates whether or not the shader reached an end
        // instruction and should thus terminate. This is necessary for games like Rayman and Gravity Falls, which have "END" instructions called
        // from within functions deep in the callstack
        std::string getForwardDecl() const { return fmt::format("bool fn_{}_{}();\n", start, end); }
        std::string getCallStatement() const { return fmt::format("fn_{}_{}()", start, end); }
    };

    std::set<Function> functions{};

@@ -93,9 +99,11 @@ namespace PICA::ShaderGen {

    API api;
    Language language;
    bool compilationError = false;

    void compileInstruction(u32& pc, bool& finished);
    void compileRange(const AddressRange& range);
    // Compiles the range "range" and returns the end PC and whether we're "finished" with the program (ie whether an END instruction was hit)
    std::pair<u32, bool> compileRange(const AddressRange& range);
    void callFunction(const Function& function);
    const Function* findFunction(const AddressRange& range);

@@ -105,6 +113,7 @@ namespace PICA::ShaderGen {
    std::string getDest(u32 dest) const;
    std::string getSwizzlePattern(u32 swizzle) const;
    std::string getDestSwizzle(u32 destinationMask) const;
    const char* getCondition(u32 cond, u32 refX, u32 refY);

    void setDest(u32 operandDescriptor, const std::string& dest, const std::string& value);
    // Returns if the instruction uses the typical register encodings most instructions use
@@ -3,6 +3,7 @@

#include "PICA/gpu.hpp"
#include "PICA/pica_frag_config.hpp"
#include "PICA/pica_vert_config.hpp"
#include "PICA/regs.hpp"
#include "PICA/shader_gen_types.hpp"
#include "helpers.hpp"

@@ -25,11 +26,14 @@ namespace PICA::ShaderGen {
    bool isSamplerEnabled(u32 environmentID, u32 lutID);

    void compileFog(std::string& shader, const PICA::FragmentConfig& config);
    void compileLogicOps(std::string& shader, const PICA::FragmentConfig& config);

  public:
    FragmentGenerator(API api, Language language) : api(api), language(language) {}
    std::string generate(const PICA::FragmentConfig& config);
    std::string generate(const PICA::FragmentConfig& config, void* driverInfo = nullptr);
    std::string getDefaultVertexShader();
    // For when PICA shader acceleration is enabled. Turn the PICA shader source into a proper vertex shader
    std::string getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader);

    void setTarget(API api, Language language) {
        this->api = api;
@@ -2,10 +2,9 @@
#include "PICA/shader.hpp"

class ShaderUnit {

  public:
    PICAShader vs;  // Vertex shader
    PICAShader gs;  // Geometry shader
  public:
    PICAShader vs;  // Vertex shader
    PICAShader gs;  // Geometry shader

    ShaderUnit() : vs(ShaderType::Vertex), gs(ShaderType::Geometry) {}
    void reset();
include/align.hpp (new file, 99 lines)

@@ -0,0 +1,99 @@
// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)

#pragma once

#include <cstdlib>

#include "helpers.hpp"

#ifdef _WIN32
#include <malloc.h>
#endif

namespace Common {
    template <typename T>
    constexpr bool isAligned(T value, unsigned int alignment) {
        return (value % static_cast<T>(alignment)) == 0;
    }

    template <typename T>
    constexpr T alignUp(T value, unsigned int alignment) {
        return (value + static_cast<T>(alignment - 1)) / static_cast<T>(alignment) * static_cast<T>(alignment);
    }

    template <typename T>
    constexpr T alignDown(T value, unsigned int alignment) {
        return value / static_cast<T>(alignment) * static_cast<T>(alignment);
    }

    template <typename T>
    constexpr bool isAlignedPow2(T value, unsigned int alignment) {
        return (value & static_cast<T>(alignment - 1)) == 0;
    }

    template <typename T>
    constexpr T alignUpPow2(T value, unsigned int alignment) {
        return (value + static_cast<T>(alignment - 1)) & static_cast<T>(~static_cast<T>(alignment - 1));
    }

    template <typename T>
    constexpr T alignDownPow2(T value, unsigned int alignment) {
        return value & static_cast<T>(~static_cast<T>(alignment - 1));
    }

    template <typename T>
    constexpr bool isPow2(T value) {
        return (value & (value - 1)) == 0;
    }

    template <typename T>
    constexpr T previousPow2(T value) {
        if (value == static_cast<T>(0)) return 0;

        value |= (value >> 1);
        value |= (value >> 2);
        value |= (value >> 4);
        if constexpr (sizeof(T) >= 16) value |= (value >> 8);
        if constexpr (sizeof(T) >= 32) value |= (value >> 16);
        if constexpr (sizeof(T) >= 64) value |= (value >> 32);
        return value - (value >> 1);
    }

    template <typename T>
    constexpr T nextPow2(T value) {
        // https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
        if (value == static_cast<T>(0)) return 0;

        value--;
        value |= (value >> 1);
        value |= (value >> 2);
        value |= (value >> 4);
        if constexpr (sizeof(T) >= 16) value |= (value >> 8);
        if constexpr (sizeof(T) >= 32) value |= (value >> 16);
        if constexpr (sizeof(T) >= 64) value |= (value >> 32);
        value++;
        return value;
    }

    ALWAYS_INLINE static void* alignedMalloc(size_t size, size_t alignment) {
#ifdef _WIN32
        return _aligned_malloc(size, alignment);
#else
        // Unaligned sizes are slow on macOS.
#ifdef __APPLE__
        if (isPow2(alignment)) size = (size + alignment - 1) & ~(alignment - 1);
#endif
        void* ret = nullptr;
        return (posix_memalign(&ret, alignment, size) == 0) ? ret : nullptr;
#endif
    }

    ALWAYS_INLINE static void alignedFree(void* ptr) {
#ifdef _MSC_VER
        _aligned_free(ptr);
#else
        free(ptr);
#endif
    }
} // namespace Common
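A few worked values for the alignment helpers above, written as compile-time checks and computed by hand from the definitions:

// Small sanity checks, assuming align.hpp is on the include path.
#include "align.hpp"

static_assert(Common::alignUpPow2<u32>(13, 16) == 16);   // round 13 up to the next multiple of 16
static_assert(Common::alignDownPow2<u32>(13, 16) == 0);  // round 13 down
static_assert(Common::isAlignedPow2<u32>(24, 8));        // 24 & 7 == 0
static_assert(Common::nextPow2<u32>(17) == 32);          // smallest power of two >= 17
static_assert(Common::previousPow2<u32>(17) == 16);      // largest power of two <= 17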
@@ -324,8 +324,8 @@ namespace Audio::HLE {
        BitField<15, 1, u32> outputBufferCountDirty;
        BitField<16, 1, u32> masterVolumeDirty;

        BitField<24, 1, u32> auxReturnVolume0Dirty;
        BitField<25, 1, u32> auxReturnVolume1Dirty;
        BitField<24, 1, u32> auxVolume0Dirty;
        BitField<25, 1, u32> auxVolume1Dirty;
        BitField<26, 1, u32> outputFormatDirty;
        BitField<27, 1, u32> clippingModeDirty;
        BitField<28, 1, u32> headphonesConnectedDirty;

@@ -337,7 +337,7 @@ namespace Audio::HLE {
        /// The DSP has three intermediate audio mixers. This controls the volume level (0.0-1.0) for
        /// each at the final mixer.
        float_le masterVolume;
        std::array<float_le, 2> auxReturnVolume;
        std::array<float_le, 2> auxVolumes;

        u16_le outputBufferCount;
        u16 pad1[2];

@@ -422,7 +422,7 @@ namespace Audio::HLE {

    struct DspStatus {
        u16_le unknown;
        u16_le dropped_frames;
        u16_le droppedFrames;
        u16 pad0[0xE];
    };
    ASSERT_DSP_STRUCT(DspStatus, 32);

@@ -95,8 +95,7 @@ namespace Audio {
        DSPSource() { reset(); }
    };

    class HLE_DSP : public DSPCore {
        // The audio frame types are public in case we want to use them for unit tests
    class DSPMixer {
      public:
        template <typename T, usize channelCount = 1>
        using Sample = std::array<T, channelCount>;

@@ -113,6 +112,43 @@ namespace Audio {
        template <typename T>
        using QuadFrame = Frame<T, 4>;

      private:
        using ChannelFormat = HLE::DspConfiguration::OutputFormat;
        // The audio from each DSP voice is converted to quadraphonic and then fed into 3 intermediate mixing stages
        // Two of these intermediate mixers (second and third) are used for effects, including custom effects done on the CPU
        static constexpr usize mixerStageCount = 3;

      public:
        ChannelFormat channelFormat = ChannelFormat::Stereo;
        std::array<float, mixerStageCount> volumes;
        std::array<bool, 2> enableAuxStages;

        void reset() {
            channelFormat = ChannelFormat::Stereo;

            volumes.fill(0.0);
            enableAuxStages.fill(false);
        }
    };

    class HLE_DSP : public DSPCore {
        // The audio frame types are public in case we want to use them for unit tests
      public:
        template <typename T, usize channelCount = 1>
        using Sample = DSPMixer::Sample<T, channelCount>;

        template <typename T, usize channelCount>
        using Frame = DSPMixer::Frame<T, channelCount>;

        template <typename T>
        using MonoFrame = DSPMixer::MonoFrame<T>;

        template <typename T>
        using StereoFrame = DSPMixer::StereoFrame<T>;

        template <typename T>
        using QuadFrame = DSPMixer::QuadFrame<T>;

        using Source = Audio::DSPSource;
        using SampleBuffer = Source::SampleBuffer;

@@ -131,6 +167,7 @@ namespace Audio {
        std::array<Source, Audio::HLE::sourceCount> sources;  // DSP voices
        Audio::HLE::DspMemory dspRam;

        Audio::DSPMixer mixer;
        std::unique_ptr<Audio::AAC::Decoder> aacDecoder;

        void resetAudioPipe();

@@ -175,10 +212,13 @@ namespace Audio {

        void handleAACRequest(const AAC::Message& request);
        void updateSourceConfig(Source& source, HLE::SourceConfiguration::Configuration& config, s16_le* adpcmCoefficients);
        void updateMixerConfig(HLE::SharedMemory& sharedMem);
        void generateFrame(StereoFrame<s16>& frame);
        void generateFrame(DSPSource& source);
        void outputFrame();

        // Perform the final mix, mixing the quadraphonic samples from all voices into the output audio frame
        void performMix(Audio::HLE::SharedMemory& readRegion, Audio::HLE::SharedMemory& writeRegion);

        // Decode an entire buffer worth of audio
        void decodeBuffer(DSPSource& source);
@@ -20,18 +20,20 @@ struct EmulatorConfig {
#else
    static constexpr bool ubershaderDefault = true;
#endif

    static constexpr bool accelerateShadersDefault = true;

    bool shaderJitEnabled = shaderJitDefault;
    bool discordRpcEnabled = false;
    bool useUbershaders = ubershaderDefault;
    bool accelerateShaders = accelerateShadersDefault;
    bool accurateShaderMul = false;
    bool discordRpcEnabled = false;

    // Toggles whether to force shadergen when there's more than N lights active and we're using the ubershader, for better performance
    bool forceShadergenForLights = true;
    int lightShadergenThreshold = 1;

    RendererType rendererType = RendererType::OpenGL;
    Audio::DSPCore::Type dspType = Audio::DSPCore::Type::Null;
    Audio::DSPCore::Type dspType = Audio::DSPCore::Type::HLE;

    bool sdCardInserted = true;
    bool sdWriteProtected = false;
@@ -298,5 +298,5 @@ private:

    bool allocateMainThreadStack(u32 size);
    Regions getConsoleRegion();
    void copySharedFont(u8* ptr);
    void copySharedFont(u8* ptr, u32 vaddr);
};
@@ -35,4 +35,35 @@ namespace Renderdoc {
    static void setOutputDir(const std::string& path, const std::string& prefix) {}
    static constexpr bool isSupported() { return false; }
} // namespace Renderdoc
#endif
#endif

namespace Renderdoc {
    // RAII scope class that encloses a Renderdoc capture, as long as it's triggered by triggerCapture
    struct Scope {
        Scope() { Renderdoc::startCapture(); }
        ~Scope() { Renderdoc::endCapture(); }

        Scope(const Scope&) = delete;
        Scope& operator=(const Scope&) = delete;

        Scope(Scope&&) = delete;
        Scope& operator=(const Scope&&) = delete;
    };

    // RAII scope class that encloses a Renderdoc capture. Unlike regular Scope it doesn't wait for a trigger, it will always issue the capture
    // trigger on its own and take a capture
    struct InstantScope {
        InstantScope() {
            Renderdoc::triggerCapture();
            Renderdoc::startCapture();
        }

        ~InstantScope() { Renderdoc::endCapture(); }

        InstantScope(const InstantScope&) = delete;
        InstantScope& operator=(const InstantScope&) = delete;

        InstantScope(InstantScope&&) = delete;
        InstantScope& operator=(const InstantScope&&) = delete;
    };
} // namespace Renderdoc
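Using these scopes is plain RAII; a short illustrative snippet, with the work inside the scope being a placeholder (include the project's Renderdoc header for it to build):

// Illustrative only: capture one block of GPU work with the RAII helper above.
void renderDebugFrame() {
    Renderdoc::InstantScope capture;  // triggers and starts a capture immediately

    // ... issue the draw calls to be captured here ...

}  // the destructor calls Renderdoc::endCapture() when the scope closes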
@@ -1,9 +1,10 @@
#pragma once
#include <array>
#include <optional>
#include <span>
#include <string>
#include <optional>

#include "PICA/draw_acceleration.hpp"
#include "PICA/pica_vertex.hpp"
#include "PICA/regs.hpp"
#include "helpers.hpp"

@@ -22,9 +23,11 @@ enum class RendererType : s8 {
};

struct EmulatorConfig;
class GPU;
struct SDL_Window;

class GPU;
class ShaderUnit;

class Renderer {
  protected:
    GPU& gpu;

@@ -78,7 +81,11 @@ class Renderer {
    virtual std::string getUbershader() { return ""; }
    virtual void setUbershader(const std::string& shader) {}

    virtual void setUbershaderSetting(bool value) {}
    // This function is called on every draw call before parsing vertex data.
    // It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between
    // ubershaders and shadergen, and so on.
    // Returns whether this draw is eligible for using hardware-accelerated shaders or if shaders should run on the CPU
    virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) { return false; }

    // Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window
#ifdef PANDA3DS_FRONTEND_QT
include/renderer_gl/gl_driver.hpp (new file, 12 lines)

@@ -0,0 +1,12 @@
#pragma once

// Information about our OpenGL/OpenGL ES driver that we should keep track of
// Stuff like whether specific extensions are supported, and potentially things like OpenGL context information
namespace OpenGL {
    struct Driver {
        bool supportsExtFbFetch = false;
        bool supportsArmFbFetch = false;

        bool supportFbFetch() const { return supportsExtFbFetch || supportsArmFbFetch; }
    };
} // namespace OpenGL
@@ -38,7 +38,6 @@ struct GLStateManager {

    GLuint stencilMask;
    GLuint boundVAO;
    GLuint boundVBO;
    GLuint currentProgram;
    GLuint boundUBO;

@@ -173,13 +172,6 @@ struct GLStateManager {
        }
    }

    void bindVBO(GLuint handle) {
        if (boundVBO != handle) {
            boundVBO = handle;
            glBindBuffer(GL_ARRAY_BUFFER, handle);
        }
    }

    void useProgram(GLuint handle) {
        if (currentProgram != handle) {
            currentProgram = handle;

@@ -195,7 +187,6 @@ struct GLStateManager {
    }

    void bindVAO(const OpenGL::VertexArray& vao) { bindVAO(vao.handle()); }
    void bindVBO(const OpenGL::VertexBuffer& vbo) { bindVBO(vbo.handle()); }
    void useProgram(const OpenGL::Program& program) { useProgram(program.handle()); }

    void setColourMask(bool r, bool g, bool b, bool a) {
@@ -3,15 +3,21 @@
#include <array>
#include <cstring>
#include <functional>
#include <memory>
#include <optional>
#include <span>
#include <unordered_map>
#include <utility>

#include "PICA/float_types.hpp"
#include "PICA/pica_frag_config.hpp"
#include "PICA/pica_hash.hpp"
#include "PICA/pica_vert_config.hpp"
#include "PICA/pica_vertex.hpp"
#include "PICA/regs.hpp"
#include "PICA/shader_gen.hpp"
#include "gl/stream_buffer.h"
#include "gl_driver.hpp"
#include "gl_state.hpp"
#include "helpers.hpp"
#include "logger.hpp"

@@ -28,9 +34,11 @@ class RendererGL final : public Renderer {
    OpenGL::Program triangleProgram;
    OpenGL::Program displayProgram;

    OpenGL::VertexArray vao;
    // VAO for when not using accelerated vertex shaders. Contains attribute declarations matching to the PICA fixed function fragment attributes
    OpenGL::VertexArray defaultVAO;
    // VAO for when using accelerated vertex shaders. The PICA vertex shader inputs are passed as attributes without CPU processing.
    OpenGL::VertexArray hwShaderVAO;
    OpenGL::VertexBuffer vbo;
    bool enableUbershader = true;

    // Data
    struct {

@@ -53,6 +61,21 @@ class RendererGL final : public Renderer {
    float oldDepthScale = -1.0;
    float oldDepthOffset = 0.0;
    bool oldDepthmapEnable = false;
    // Set by prepareForDraw, tells us whether the current draw is using hw-accelerated shader
    bool usingAcceleratedShader = false;
    bool performIndexedRender = false;
    bool usingShortIndices = false;

    // Set by prepareForDraw, metadata for indexed renders
    GLuint minimumIndex = 0;
    GLuint maximumIndex = 0;
    void* hwIndexBufferOffset = nullptr;

    // When doing hw shaders, we cache which attributes are enabled in our VAO to avoid having to enable/disable all attributes on each draw
    u32 previousAttributeMask = 0;

    // Cached pointer to the current vertex shader when using HW accelerated shaders
    OpenGL::Shader* generatedVertexShader = nullptr;

    SurfaceCache<DepthBuffer, 16, true> depthBufferCache;
    SurfaceCache<ColourBuffer, 16, true> colourBufferCache;

@@ -70,18 +93,58 @@ class RendererGL final : public Renderer {
    // We can compile this once and then link it with all other generated fragment shaders
    OpenGL::Shader defaultShadergenVs;
    GLuint shadergenFragmentUBO;
    // UBO for uploading the PICA uniforms when using hw shaders
    GLuint hwShaderUniformUBO;

    using StreamBuffer = OpenGLStreamBuffer;
    std::unique_ptr<StreamBuffer> hwVertexBuffer;
    std::unique_ptr<StreamBuffer> hwIndexBuffer;

    // Cache of fixed attribute values so that we don't do any duplicate updates
    std::array<std::array<float, 4>, 16> fixedAttrValues;

    // Cached recompiled fragment shader
    struct CachedProgram {
        OpenGL::Program program;
    };
    std::unordered_map<PICA::FragmentConfig, CachedProgram> shaderCache;

    struct ShaderCache {
        std::unordered_map<PICA::VertConfig, std::optional<OpenGL::Shader>> vertexShaderCache;
        std::unordered_map<PICA::FragmentConfig, OpenGL::Shader> fragmentShaderCache;

        // Program cache indexed by GLuints for the vertex and fragment shader to use
        // Top 32 bits are the vertex shader GLuint, bottom 32 bits are the fs GLuint
        std::unordered_map<u64, CachedProgram> programCache;

        void clear() {
            for (auto& it : programCache) {
                CachedProgram& cachedProgram = it.second;
                cachedProgram.program.free();
            }

            for (auto& it : vertexShaderCache) {
                if (it.second.has_value()) {
                    it.second->free();
                }
            }

            for (auto& it : fragmentShaderCache) {
                it.second.free();
            }

            programCache.clear();
            vertexShaderCache.clear();
            fragmentShaderCache.clear();
        }
    };
    ShaderCache shaderCache;

    OpenGL::Framebuffer getColourFBO();
    OpenGL::Texture getTexture(Texture& tex);
    OpenGL::Program& getSpecializedShader();

    PICA::ShaderGen::FragmentGenerator fragShaderGen;
    OpenGL::Driver driverInfo;

    MAKE_LOG_FUNCTION(log, rendererLogger)
    void setupBlending();

@@ -93,6 +156,8 @@ class RendererGL final : public Renderer {
    void updateFogLUT();
    void initGraphicsContextInternal();

    void accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel);

  public:
    RendererGL(GPU& gpu, const std::array<u32, regNum>& internalRegs, const std::array<u32, extRegNum>& externalRegs)
        : Renderer(gpu, internalRegs, externalRegs), fragShaderGen(PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL) {}
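The program cache key packs the vertex and fragment shader handles into one u64, as the comment in the ShaderCache struct describes. A minimal sketch of building that key, using plain u32 values in place of GLuint so it stands alone:

// Sketch only: forming the 64-bit program-cache key from two 32-bit shader handles.
#include "helpers.hpp"

u64 makeProgramKey(u32 vsHandle, u32 fsHandle) {
    // Top 32 bits: vertex shader handle, bottom 32 bits: fragment shader handle
    return (u64(vsHandle) << 32) | u64(fsHandle);
}

// Lookup would then be roughly: auto& entry = shaderCache.programCache[makeProgramKey(vs, fs)];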
@@ -110,15 +175,13 @@ class RendererGL final : public Renderer {
    virtual bool supportsShaderReload() override { return true; }
    virtual std::string getUbershader() override;
    virtual void setUbershader(const std::string& shader) override;

    virtual void setUbershaderSetting(bool value) override { enableUbershader = value; }
    virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) override;

    std::optional<ColourBuffer> getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true);

    // Note: The caller is responsible for deleting the currently bound FBO before calling this
    void setFBO(uint handle) { screenFramebuffer.m_handle = handle; }
    void resetStateManager() { gl.reset(); }
    void clearShaderCache();
    void initUbershader(OpenGL::Program& program);

#ifdef PANDA3DS_FRONTEND_QT
@@ -2,31 +2,37 @@

#include <cmath>
#include <glm/glm.hpp>
#include <numbers>

#include "helpers.hpp"
#include "services/hid.hpp"

// Convert SDL sensor readings to 3DS format
// We use the same code for Android as well, since the values we get from Android are in the same format as SDL (m/s^2 for acceleration, rad/s for
// rotation)
namespace Sensors::SDL {
    // Convert the rotation data we get from SDL sensor events to rotation data we can feed right to HID
    // Returns [pitch, roll, yaw]
    static glm::vec3 convertRotation(glm::vec3 rotation) {
        // Convert the rotation from rad/s to deg/s and scale by the gyroscope coefficient in HID
        constexpr float scale = 180.f / std::numbers::pi * HIDService::gyroscopeCoeff;
        // The axes are also inverted, so invert scale before the multiplication.
        return rotation * -scale;
    }
    // Convert the rotation data we get from SDL sensor events to rotation data we can feed right to HID
    // Returns [pitch, roll, yaw]
    static glm::vec3 convertRotation(glm::vec3 rotation) {
        // Annoyingly, Android doesn't support the <numbers> header yet so we define pi ourselves
        static constexpr double pi = 3.141592653589793;
        // Convert the rotation from rad/s to deg/s and scale by the gyroscope coefficient in HID
        constexpr float scale = 180.f / pi * HIDService::gyroscopeCoeff;
        // The axes are also inverted, so invert scale before the multiplication.
        return rotation * -scale;
    }

    static glm::vec3 convertAcceleration(float* data) {
        // Set our cap to ~9 m/s^2. The 3DS sensors cap at -930 and +930, so values above this value will get clamped to 930
        // At rest (3DS laid flat on table), hardware reads around ~0 for x and z axis, and around ~480 for y axis due to gravity.
        // This code tries to mimic this approximately, with offsets based on measurements from my DualShock 4.
        static constexpr float accelMax = 9.f;
    static glm::vec3 convertAcceleration(float* data) {
        // Set our cap to ~9 m/s^2. The 3DS sensors cap at -930 and +930, so values above this value will get clamped to 930
        // At rest (3DS laid flat on table), hardware reads around ~0 for x and z axis, and around ~480 for y axis due to gravity.
        // This code tries to mimic this approximately, with offsets based on measurements from my DualShock 4.
        static constexpr float accelMax = 9.f;
        // We define standard gravity (g) ourselves instead of using the SDL one in order for the code to work on Android too.
        static constexpr float standardGravity = 9.80665f;

        s16 x = std::clamp<s16>(s16(data[0] / accelMax * 930.f), -930, +930);
        s16 y = std::clamp<s16>(s16(data[1] / (SDL_STANDARD_GRAVITY * accelMax) * 930.f - 350.f), -930, +930);
        s16 z = std::clamp<s16>(s16((data[2] - 2.1f) / accelMax * 930.f), -930, +930);
        s16 x = std::clamp<s16>(s16(data[0] / accelMax * 930.f), -930, +930);
        s16 y = std::clamp<s16>(s16(data[1] / (standardGravity * accelMax) * 930.f - 350.f), -930, +930);
        s16 z = std::clamp<s16>(s16((data[2] - 2.1f) / accelMax * 930.f), -930, +930);

        return glm::vec3(x, y, z);
    }
    return glm::vec3(x, y, z);
}
} // namespace Sensors::SDL
include/services/fonts.hpp (new file, 84 lines)

@@ -0,0 +1,84 @@
// Copyright 2016 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.

// Adapted from https://github.com/PabloMK7/citra/blob/master/src/core/hle/service/apt/bcfnt/bcfnt.h

#pragma once

#include <memory>

#include "helpers.hpp"
#include "swap.hpp"

namespace HLE::Fonts {
    struct CFNT {
        u8 magic[4];
        u16_le endianness;
        u16_le headerSize;
        u32_le version;
        u32_le fileSize;
        u32_le numBlocks;
    };

    struct SectionHeader {
        u8 magic[4];
        u32_le sectionSize;
    };

    struct FINF {
        u8 magic[4];
        u32_le sectionSize;
        u8 fontType;
        u8 lineFeed;
        u16_le alterCharIndex;
        u8 default_width[3];
        u8 encoding;
        u32_le tglpOffset;
        u32_le cwdhOffset;
        u32_le cmapOffset;
        u8 height;
        u8 width;
        u8 ascent;
        u8 reserved;
    };

    struct TGLP {
        u8 magic[4];
        u32_le sectionSize;
        u8 cellWidth;
        u8 cellHeight;
        u8 baselinePosition;
        u8 maxCharacterWidth;
        u32_le sheetSize;
        u16_le numSheets;
        u16_le sheetImageFormat;
        u16_le numColumns;
        u16_le numRows;
        u16_le sheetWidth;
        u16_le sheetHeight;
        u32_le sheetDataOffset;
    };

    struct CMAP {
        u8 magic[4];
        u32_le sectionSize;
        u16_le codeBegin;
        u16_le codeEnd;
        u16_le mappingMethod;
        u16_le reserved;
        u32_le nextCmapOffset;
    };

    struct CWDH {
        u8 magic[4];
        u32_le sectionSize;
        u16_le startIndex;
        u16_le endIndex;
        u32_le nextCwdhOffset;
    };

    // Relocates the internal addresses of the BCFNT Shared Font to the new base. The current base will
    // be auto-detected based on the file headers.
    void relocateSharedFont(u8* sharedFont, u32 newAddress);
} // namespace HLE::Fonts
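The relocation that declaration refers to boils down to rebasing every stored address; a hedged sketch of the arithmetic only, not the full relocateSharedFont implementation (which walks the CFNT/FINF/CMAP/CWDH chains):

// Sketch of the rebasing arithmetic only, with illustrative parameter names.
// 'storedAddress' would be an address field read from the font (e.g. FINF::cmapOffset),
// 'detectedBase' the base the font was previously relocated to, 'newAddress' the new base.
#include "helpers.hpp"

inline u32 rebase(u32 storedAddress, u32 detectedBase, u32 newAddress) {
    // Keep the offset within the font image the same, only swap the base address
    return storedAddress - detectedBase + newAddress;
}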