Merge pull request #632 from wheremyfoodat/more-dsp

WIP: Finishing DSP mixer
This commit is contained in:
wheremyfoodat 2024-11-20 21:35:43 +02:00 committed by GitHub
commit b2c0f18e62
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 370 additions and 56 deletions

View file

@ -314,6 +314,7 @@ set(APPLET_SOURCE_FILES src/core/applets/applet.cpp src/core/applets/mii_selecto
)
set(AUDIO_SOURCE_FILES src/core/audio/dsp_core.cpp src/core/audio/null_core.cpp src/core/audio/teakra_core.cpp
src/core/audio/miniaudio_device.cpp src/core/audio/hle_core.cpp src/core/audio/aac_decoder.cpp
src/core/audio/audio_interpolation.cpp
)
set(RENDERER_SW_SOURCE_FILES src/core/renderer_sw/renderer_sw.cpp)
@ -354,6 +355,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp include/renderdoc.hpp
include/align.hpp include/audio/aac_decoder.hpp include/PICA/pica_simd.hpp include/services/fonts.hpp
include/audio/audio_interpolation.hpp include/audio/hle_mixer.hpp include/audio/dsp_simd.hpp
)
cmrc_add_resource_library(

View file

@ -0,0 +1,58 @@
// Copyright 2016 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <deque>
#include "audio/hle_mixer.hpp"
#include "helpers.hpp"
namespace Audio::Interpolation {
// A variable length buffer of signed PCM16 stereo samples.
using StereoBuffer16 = std::deque<std::array<s16, 2>>;
using StereoFrame16 = Audio::DSPMixer::StereoFrame<s16>;
struct State {
// Two historical samples.
std::array<s16, 2> xn1 = {}; //< x[n-1]
std::array<s16, 2> xn2 = {}; //< x[n-2]
// Current fractional position.
u64 fposition = 0;
};
/**
* No interpolation. This is equivalent to a zero-order hold. There is a two-sample predelay.
* @param state Interpolation state.
* @param input Input buffer.
* @param rate Stretch factor. Must be a positive non-zero value.
* rate > 1.0 performs decimation and rate < 1.0 performs upsampling.
* @param output The resampled audio buffer.
* @param outputi The index of output to start writing to.
*/
void none(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi);
/**
* Linear interpolation. This is equivalent to a first-order hold. There is a two-sample predelay.
* @param state Interpolation state.
* @param input Input buffer.
* @param rate Stretch factor. Must be a positive non-zero value.
* rate > 1.0 performs decimation and rate < 1.0 performs upsampling.
* @param output The resampled audio buffer.
* @param outputi The index of output to start writing to.
*/
void linear(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi);
/**
* Polyphase interpolation. This is currently stubbed to just perform linear interpolation
* @param state Interpolation state.
* @param input Input buffer.
* @param rate Stretch factor. Must be a positive non-zero value.
* rate > 1.0 performs decimation and rate < 1.0 performs upsampling.
* @param output The resampled audio buffer.
* @param outputi The index of output to start writing to.
*/
void polyphase(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi);
} // namespace Audio::Interpolation

View file

@ -0,0 +1,78 @@
#pragma once
#include "audio/hle_mixer.hpp"
#include "compiler_builtins.hpp"
#include "helpers.hpp"
#if defined(_M_AMD64) || defined(__x86_64__)
#define DSP_SIMD_X64
#include <immintrin.h>
#elif defined(_M_ARM64) || defined(__aarch64__)
#define DSP_SIMD_ARM64
#include <arm_neon.h>
#endif
// Optimized SIMD functions for mixing the stereo output of a DSP voice into a quadraphonic intermediate mix
namespace DSP::MixIntoQuad {
using IntermediateMix = Audio::DSPMixer::IntermediateMix;
using StereoFrame16 = Audio::DSPMixer::StereoFrame<s16>;
// Non-SIMD, portable algorithm
ALWAYS_INLINE static void mixPortable(IntermediateMix& mix, StereoFrame16& frame, const float* gains) {
for (usize sampleIndex = 0; sampleIndex < Audio::samplesInFrame; sampleIndex++) {
// Mono samples are in the format: (l, r)
// When converting to quad, gain0 and gain2 are applied to the left sample, gain1 and gain3 to the right one
mix[sampleIndex][0] += s32(frame[sampleIndex][0] * gains[0]);
mix[sampleIndex][1] += s32(frame[sampleIndex][1] * gains[1]);
mix[sampleIndex][2] += s32(frame[sampleIndex][0] * gains[2]);
mix[sampleIndex][3] += s32(frame[sampleIndex][1] * gains[3]);
}
}
#if defined(DSP_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
ALWAYS_INLINE static void mixSSE4_1(IntermediateMix& mix, StereoFrame16& frame, const float* gains) {
__m128 gains_ = _mm_load_ps(gains);
for (usize sampleIndex = 0; sampleIndex < Audio::samplesInFrame; sampleIndex++) {
// The stereo samples, repeated every 4 bytes inside the vector register
__m128i stereoSamples = _mm_castps_si128(_mm_load1_ps((float*)&frame[sampleIndex][0]));
__m128 currentFrame = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(stereoSamples));
__m128i offset = _mm_cvttps_epi32(_mm_mul_ps(currentFrame, gains_));
__m128i intermediateMixPrev = _mm_load_si128((__m128i*)&mix[sampleIndex][0]);
__m128i result = _mm_add_epi32(intermediateMixPrev, offset);
_mm_store_si128((__m128i*)&mix[sampleIndex][0], result);
}
}
#endif
#ifdef DSP_SIMD_ARM64
ALWAYS_INLINE static void mixNEON(IntermediateMix& mix, StereoFrame16& frame, const float* gains) {
float32x4_t gains_ = vld1q_f32(gains);
for (usize sampleIndex = 0; sampleIndex < Audio::samplesInFrame; sampleIndex++) {
// Load l and r samples and repeat them every 4 bytes
int32x4_t stereoSamples = vld1q_dup_s32((s32*)&frame[sampleIndex][0]);
// Expand the bottom 4 s16 samples into an int32x4 with sign extension, then convert them to float32x4
float32x4_t currentFrame = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(stereoSamples))));
// Multiply samples by their respective gains, truncate the result, and add it into the intermediate mix buffer
int32x4_t offset = vcvtq_s32_f32(vmulq_f32(currentFrame, gains_));
int32x4_t intermediateMixPrev = vld1q_s32((s32*)&mix[sampleIndex][0]);
int32x4_t result = vaddq_s32(intermediateMixPrev, offset);
vst1q_s32((s32*)&mix[sampleIndex][0], result);
}
}
#endif
// Mixes the stereo output of a DSP voice into a quadraphonic intermediate mix
static void mix(IntermediateMix& mix, StereoFrame16& frame, const float* gains) {
#if defined(DSP_SIMD_ARM64)
return mixNEON(mix, frame, gains);
#elif defined(DSP_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
return mixSSE4_1(mix, frame, gains);
#else
return mixPortable(mix, frame, gains);
#endif
}
} // namespace DSP::MixIntoQuad

View file

@ -8,14 +8,13 @@
#include "audio/aac.hpp"
#include "audio/aac_decoder.hpp"
#include "audio/audio_interpolation.hpp"
#include "audio/dsp_core.hpp"
#include "audio/dsp_shared_mem.hpp"
#include "audio/hle_mixer.hpp"
#include "memory.hpp"
namespace Audio {
using SampleFormat = HLE::SourceConfiguration::Configuration::Format;
using SourceType = HLE::SourceConfiguration::Configuration::MonoOrStereo;
struct DSPSource {
// Audio buffer information
// https://www.3dbrew.org/wiki/DSP_Memory_Region
@ -47,14 +46,29 @@ namespace Audio {
// Buffer of decoded PCM16 samples. TODO: Are there better alternatives to use over deque?
using SampleBuffer = std::deque<std::array<s16, 2>>;
using BufferQueue = std::priority_queue<Buffer>;
using InterpolationMode = HLE::SourceConfiguration::Configuration::InterpolationMode;
using InterpolationState = Audio::Interpolation::State;
// The samples this voice output for this audio frame.
// Aligned to 4 for SIMD purposes.
alignas(4) DSPMixer::StereoFrame<s16> currentFrame;
BufferQueue buffers;
SampleFormat sampleFormat = SampleFormat::ADPCM;
SourceType sourceType = SourceType::Stereo;
InterpolationMode interpolationMode = InterpolationMode::Linear;
InterpolationState interpolationState;
// There's one gain configuration for each of the 3 intermediate mixing stages
// And each gain configuration is composed of 4 gain values, one for each sample in a quad-channel sample
// Aligned to 16 for SIMD purposes
alignas(16) std::array<std::array<float, 4>, 3> gains;
// Of the 3 intermediate mix stages, typically only the first one is actually enabled and the other ones do nothing
// Ie their gain is vec4(0.0). We track which stages are disabled (have a gain of all 0s) using this bitfield and skip them
// In order to save up on CPU time.
uint enabledMixStages = 0;
std::array<float, 3> gain0, gain1, gain2;
u32 samplePosition; // Sample number into the current audio buffer
float rateMultiplier;
u16 syncCount;
@ -95,42 +109,6 @@ namespace Audio {
DSPSource() { reset(); }
};
class DSPMixer {
public:
template <typename T, usize channelCount = 1>
using Sample = std::array<T, channelCount>;
template <typename T, usize channelCount>
using Frame = std::array<Sample<T, channelCount>, 160>;
template <typename T>
using MonoFrame = Frame<T, 1>;
template <typename T>
using StereoFrame = Frame<T, 2>;
template <typename T>
using QuadFrame = Frame<T, 4>;
private:
using ChannelFormat = HLE::DspConfiguration::OutputFormat;
// The audio from each DSP voice is converted to quadraphonic and then fed into 3 intermediate mixing stages
// Two of these intermediate mixers (second and third) are used for effects, including custom effects done on the CPU
static constexpr usize mixerStageCount = 3;
public:
ChannelFormat channelFormat = ChannelFormat::Stereo;
std::array<float, mixerStageCount> volumes;
std::array<bool, 2> enableAuxStages;
void reset() {
channelFormat = ChannelFormat::Stereo;
volumes.fill(0.0);
enableAuxStages.fill(false);
}
};
class HLE_DSP : public DSPCore {
// The audio frame types are public in case we want to use them for unit tests
public:
@ -151,6 +129,7 @@ namespace Audio {
using Source = Audio::DSPSource;
using SampleBuffer = Source::SampleBuffer;
using IntermediateMix = DSPMixer::IntermediateMix;
private:
enum class DSPState : u32 {
@ -218,7 +197,7 @@ namespace Audio {
void outputFrame();
// Perform the final mix, mixing the quadraphonic samples from all voices into the output audio frame
void performMix(Audio::HLE::SharedMemory& readRegion, Audio::HLE::SharedMemory& writeRegion);
// Decode an entire buffer worth of audio
void decodeBuffer(DSPSource& source);
@ -245,5 +224,4 @@ namespace Audio {
void setSemaphore(u16 value) override {}
void setSemaphoreMask(u16 value) override {}
};
} // namespace Audio

View file

@ -0,0 +1,50 @@
#pragma once
#include <array>
#include "audio/dsp_shared_mem.hpp"
#include "helpers.hpp"
namespace Audio {
using SampleFormat = HLE::SourceConfiguration::Configuration::Format;
using SourceType = HLE::SourceConfiguration::Configuration::MonoOrStereo;
class DSPMixer {
public:
template <typename T, usize channelCount = 1>
using Sample = std::array<T, channelCount>;
template <typename T, usize channelCount>
using Frame = std::array<Sample<T, channelCount>, 160>;
template <typename T>
using MonoFrame = Frame<T, 1>;
template <typename T>
using StereoFrame = Frame<T, 2>;
template <typename T>
using QuadFrame = Frame<T, 4>;
// Internally the DSP uses four channels when mixing.
// Neatly, QuadFrame<s32> means that every sample is a uint32x4 value, which is particularly nice for SIMD mixing
using IntermediateMix = QuadFrame<s32>;
private:
using ChannelFormat = HLE::DspConfiguration::OutputFormat;
// The audio from each DSP voice is converted to quadraphonic and then fed into 3 intermediate mixing stages
// Two of these intermediate mixers (second and third) are used for effects, including custom effects done on the CPU
static constexpr usize mixerStageCount = 3;
public:
ChannelFormat channelFormat = ChannelFormat::Stereo;
std::array<float, mixerStageCount> volumes;
std::array<bool, 2> enableAuxStages;
void reset() {
channelFormat = ChannelFormat::Stereo;
volumes.fill(0.0);
enableAuxStages.fill(false);
}
};
} // namespace Audio

View file

@ -0,0 +1,73 @@
// Copyright 2016 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "audio/audio_interpolation.hpp"
#include <algorithm>
#include "helpers.hpp"
namespace Audio::Interpolation {
// Calculations are done in fixed point with 24 fractional bits.
// (This is not verified. This was chosen for minimal error.)
static constexpr u64 scaleFactor = 1 << 24;
static constexpr u64 scaleMask = scaleFactor - 1;
/// Here we step over the input in steps of rate, until we consume all of the input.
/// Three adjacent samples are passed to fn each step.
template <typename Function>
static void stepOverSamples(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi, Function fn) {
if (input.empty()) {
return;
}
input.insert(input.begin(), {state.xn2, state.xn1});
const u64 step_size = static_cast<u64>(rate * scaleFactor);
u64 fposition = state.fposition;
usize inputi = 0;
while (outputi < output.size()) {
inputi = static_cast<usize>(fposition / scaleFactor);
if (inputi + 2 >= input.size()) {
inputi = input.size() - 2;
break;
}
u64 fraction = fposition & scaleMask;
output[outputi++] = fn(fraction, input[inputi], input[inputi + 1], input[inputi + 2]);
fposition += step_size;
}
state.xn2 = input[inputi];
state.xn1 = input[inputi + 1];
state.fposition = fposition - inputi * scaleFactor;
input.erase(input.begin(), std::next(input.begin(), inputi + 2));
}
void none(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi) {
stepOverSamples(state, input, rate, output, outputi, [](u64 fraction, const auto& x0, const auto& x1, const auto& x2) { return x0; });
}
void linear(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi) {
// Note on accuracy: Some values that this produces are +/- 1 from the actual firmware.
stepOverSamples(state, input, rate, output, outputi, [](u64 fraction, const auto& x0, const auto& x1, const auto& x2) {
// This is a saturated subtraction. (Verified by black-box fuzzing.)
s64 delta0 = std::clamp<s64>(x1[0] - x0[0], -32768, 32767);
s64 delta1 = std::clamp<s64>(x1[1] - x0[1], -32768, 32767);
return std::array<s16, 2>{
static_cast<s16>(x0[0] + fraction * delta0 / scaleFactor),
static_cast<s16>(x0[1] + fraction * delta1 / scaleFactor),
};
});
}
void polyphase(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi) {
linear(state, input, rate, output, outputi);
}
} // namespace Audio::Interpolation

View file

@ -7,6 +7,7 @@
#include <utility>
#include "audio/aac_decoder.hpp"
#include "audio/dsp_simd.hpp"
#include "services/dsp.hpp"
namespace Audio {
@ -211,11 +212,11 @@ namespace Audio {
if (audioEnabled) {
// Wait until we've actually got room to push our frame
while (sampleBuffer.size() + 2 > sampleBuffer.Capacity()) {
while (sampleBuffer.size() + frame.size() * 2 > sampleBuffer.Capacity()) {
std::this_thread::sleep_for(std::chrono::milliseconds{1});
}
sampleBuffer.push(frame.data(), frame.size());
sampleBuffer.push(frame.data(), frame.size() * 2);
}
}
@ -229,6 +230,9 @@ namespace Audio {
read.dspConfiguration.dirtyRaw = 0;
read.dspConfiguration.dirtyRaw2 = 0;
// The intermediate mix buffer is aligned to 16 for SIMD purposes
alignas(16) std::array<IntermediateMix, 3> mixes{};
for (int i = 0; i < sourceCount; i++) {
// Update source configuration from the read region of shared memory
auto& config = read.sourceConfigurations.config[i];
@ -250,6 +254,27 @@ namespace Audio {
status.samplePosition = source.samplePosition;
source.isBufferIDDirty = false;
// If the source is still enabled, mix its output into the intermediate mix buffers
if (source.enabled) {
for (int mix = 0; mix < mixes.size(); mix++) {
// Check if this stage is passthrough, and if it is, then skip it
if ((source.enabledMixStages & (1u << mix)) == 0) {
continue;
}
IntermediateMix& intermediateMix = mixes[mix];
const std::array<float, 4>& gains = source.gains[mix];
DSP::MixIntoQuad::mix(intermediateMix, source.currentFrame, gains.data());
}
}
}
for (int i = 0; i < Audio::samplesInFrame; i++) {
auto& mix0 = mixes[0];
auto& sample = mix0[i];
frame[i] = {s16(sample[0]), s16(sample[1])};
}
performMix(read, write);
@ -300,6 +325,10 @@ namespace Audio {
source.sourceType = config.monoOrStereo;
}
if (config.interpolationDirty) {
source.interpolationMode = config.interpolationMode;
}
if (config.rateMultiplierDirty) {
source.rateMultiplier = (config.rateMultiplier > 0.f) ? config.rateMultiplier : 1.f;
}
@ -374,6 +403,28 @@ namespace Audio {
}
}
#define CONFIG_GAIN(index) \
if (config.gain##index##Dirty) { \
auto& dest = source.gains[index]; \
auto& sourceGain = config.gain[index]; \
\
dest[0] = float(sourceGain[0]); \
dest[1] = float(sourceGain[1]); \
dest[2] = float(sourceGain[2]); \
dest[3] = float(sourceGain[3]); \
\
if (dest[0] == 0.f && dest[1] == 0.f && dest[2] == 0.f && dest[3] == 0.f) { \
source.enabledMixStages &= ~(1u << index); \
} else { \
source.enabledMixStages |= (1u << index); \
} \
}
CONFIG_GAIN(0);
CONFIG_GAIN(1);
CONFIG_GAIN(2);
#undef CONFIG_GAIN
config.dirtyRaw = 0;
}
@ -433,6 +484,10 @@ namespace Audio {
}
void HLE_DSP::generateFrame(DSPSource& source) {
// Zero out all output samples at first. TODO: Don't zero out the entire frame initially, rather only zero-out the "unwritten" samples when
// the frame is done being processed.
source.currentFrame = {};
if (source.currentSamples.empty()) {
// There's no audio left to play, turn the voice off
if (source.buffers.empty()) {
@ -446,10 +501,10 @@ namespace Audio {
decodeBuffer(source);
} else {
uint maxSampleCount = uint(float(Audio::samplesInFrame) * source.rateMultiplier);
uint outputCount = 0;
usize outputCount = 0;
static constexpr usize maxSamples = Audio::samplesInFrame;
while (outputCount < maxSampleCount) {
while (outputCount < maxSamples) {
if (source.currentSamples.empty()) {
if (source.buffers.empty()) {
break;
@ -458,13 +513,28 @@ namespace Audio {
}
}
const uint sampleCount = std::min<s32>(maxSampleCount - outputCount, source.currentSamples.size());
switch (source.interpolationMode) {
case Source::InterpolationMode::Linear:
Audio::Interpolation::linear(
source.interpolationState, source.currentSamples, source.rateMultiplier, source.currentFrame, outputCount
);
break;
case Source::InterpolationMode::None:
Audio::Interpolation::none(
source.interpolationState, source.currentSamples, source.rateMultiplier, source.currentFrame, outputCount
);
break;
// samples.insert(samples.end(), source.currentSamples.begin(), source.currentSamples.begin() + sampleCount);
source.currentSamples.erase(source.currentSamples.begin(), std::next(source.currentSamples.begin(), sampleCount));
source.samplePosition += sampleCount;
outputCount += sampleCount;
case Source::InterpolationMode::Polyphase:
// Currently stubbed to be the same as linear
Audio::Interpolation::polyphase(
source.interpolationState, source.currentSamples, source.rateMultiplier, source.currentFrame, outputCount
);
break;
}
}
source.samplePosition += u32(outputCount * source.rateMultiplier);
}
}
@ -488,7 +558,7 @@ namespace Audio {
if (config.outputFormatDirty) {
mixer.channelFormat = config.outputFormat;
}
if (config.masterVolumeDirty) {
mixer.volumes[0] = config.masterVolume;
}
@ -496,7 +566,7 @@ namespace Audio {
if (config.auxVolume0Dirty) {
mixer.volumes[1] = config.auxVolumes[0];
}
if (config.auxVolume1Dirty) {
mixer.volumes[2] = config.auxVolumes[1];
}
@ -658,7 +728,7 @@ namespace Audio {
response = request;
response.resultCode = AAC::ResultCode::Success;
break;
default: Helpers::warn("Unknown AAC command type"); break;
}
@ -675,6 +745,7 @@ namespace Audio {
// Initialize these to some sane defaults
sampleFormat = SampleFormat::ADPCM;
sourceType = SourceType::Stereo;
interpolationMode = InterpolationMode::Linear;
samplePosition = 0;
previousBufferID = 0;
@ -683,6 +754,10 @@ namespace Audio {
rateMultiplier = 1.f;
buffers = {};
interpolationState = {};
currentSamples.clear();
gains.fill({});
enabledMixStages = 0;
}
} // namespace Audio