mirror of
https://github.com/wheremyfoodat/Panda3DS.git
synced 2025-04-06 06:05:40 +12:00
Merge pull request #640 from wheremyfoodat/wheremyfoodat-patch-2
Force-inline SIMD index buffer functions
This commit is contained in:
commit
43991b7653
2 changed files with 10 additions and 5 deletions
|
@ -3,6 +3,7 @@
|
|||
#include <limits>
|
||||
#include <utility>
|
||||
|
||||
#include "compiler_builtins.hpp"
|
||||
#include "helpers.hpp"
|
||||
|
||||
#if defined(_M_AMD64) || defined(__x86_64__)
|
||||
|
@ -43,7 +44,7 @@ namespace PICA::IndexBuffer {
|
|||
|
||||
#ifdef PICA_SIMD_ARM64
|
||||
template <bool useShortIndices>
|
||||
std::pair<u16, u16> analyzeNEON(u8* indexBuffer, u32 vertexCount) {
|
||||
ALWAYS_INLINE std::pair<u16, u16> analyzeNEON(u8* indexBuffer, u32 vertexCount) {
|
||||
// We process 16 bytes per iteration, which is 8 vertices if we're using u16 indices or 16 vertices if we're using u8 indices
|
||||
constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16;
|
||||
|
||||
|
@ -134,7 +135,7 @@ namespace PICA::IndexBuffer {
|
|||
|
||||
#if defined(PICA_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
|
||||
template <bool useShortIndices>
|
||||
std::pair<u16, u16> analyzeSSE4_1(u8* indexBuffer, u32 vertexCount) {
|
||||
ALWAYS_INLINE std::pair<u16, u16> analyzeSSE4_1(u8* indexBuffer, u32 vertexCount) {
|
||||
// We process 16 bytes per iteration, which is 8 vertices if we're using u16
|
||||
// indices or 16 vertices if we're using u8 indices
|
||||
constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16;
|
||||
|
|
|
@ -90,7 +90,11 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
|
|||
const u32 size = (attribInfo >> 2) + 1; // Total number of components
|
||||
|
||||
// Size of each component based on the attribute type
|
||||
static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
|
||||
[[maybe_unused]] static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
|
||||
// To avoid a multiplication, we shift left by the values below instead of multiplying by the sizes above
|
||||
// So multiplication by 1 becomes a shift by 0, mul by 2 becomes a shift by 1, and mul by 4 becomes a shift by 2
|
||||
static constexpr u32 sizeShiftPerComponent[4] = {0, 0, 1, 2};
|
||||
|
||||
const u32 inputReg = (inputAttrCfg >> (attributeIndex * 4)) & 0xf;
|
||||
// Mark the attribute as enabled
|
||||
accel.enabledAttributeMask |= 1 << inputReg;
|
||||
|
@ -100,7 +104,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
|
|||
attr.offset = attributeOffset + loaderOffset;
|
||||
attr.stride = loaderData.size;
|
||||
attr.type = attribType;
|
||||
attributeOffset += size * sizePerComponent[attribType];
|
||||
attributeOffset += size << sizeShiftPerComponent[attribType];
|
||||
}
|
||||
|
||||
loaderOffset += loader.size;
|
||||
|
@ -134,4 +138,4 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
|
|||
}
|
||||
|
||||
accel.canBeAccelerated = true;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue