From 609eb6d880e808e7e1e5b8f0cb510a3a0bcd2065 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Fri, 26 Apr 2024 21:53:17 +0000
Subject: [PATCH] DSP HLE: Get ADPCM audio decoding working (#499)

* Start decoding ADPCM

* Fix accidentally skipping ADPCM samples

* DSP HLE: ADPCM weights are signed

* Format

* Format

* Fix broken amend
---
 include/audio/hle_core.hpp  |  45 +++++++++++--
 src/core/audio/hle_core.cpp | 127 +++++++++++++++++++++++++++++++++---
 2 files changed, 156 insertions(+), 16 deletions(-)
diff --git a/include/audio/hle_core.hpp b/include/audio/hle_core.hpp
index 6d7b3ad1..b533727b 100644
--- a/include/audio/hle_core.hpp
+++ b/include/audio/hle_core.hpp
@@ -1,6 +1,9 @@
 #pragma once
 #include <array>
+#include <cassert>
+#include <deque>
 #include <queue>
+#include <span>
 #include <vector>
 
 #include "audio/dsp_core.hpp"
@@ -18,7 +21,7 @@ namespace Audio {
 			u32 paddr;        // Physical address of the buffer
 			u32 sampleCount;  // Total number of samples
 			u8 adpcmScale;    // ADPCM predictor/scale
-			u8 pad1;           // Unknown
+			u8 pad1;          // Unknown
 
 			std::array<s16, 2> previousSamples;  // ADPCM y[n-1] and y[n-2]
 			bool adpcmDirty;
@@ -39,17 +42,40 @@ namespace Audio {
 				return this->bufferID > other.bufferID;
 			}
 		};
+		// Buffer of decoded PCM16 samples. TODO: Are there better alternatives to use over deque?
+		using SampleBuffer = std::deque<std::array<s16, 2>>;
 
 		using BufferQueue = std::priority_queue<Buffer>;
+		BufferQueue buffers;
 
 		std::array<float, 3> gain0, gain1, gain2;
 		u16 syncCount;
-		bool enabled;
+		bool enabled;  // Is the source enabled?
 
-		BufferQueue buffers;
+		// ADPCM decoding info:
+		// An array of fixed point S5.11 coefficients. These provide "weights" for the history samples
+		// The system describing how an ADPCM output sample is generated is
+		// y[n] = x[n] + 0.5 + coeff1 * y[n-1] + coeff2 * y[n-2]
+		// Where y[n] is the output sample we're generating, x[n] is the ADPCM "differential" of the current sample
+		// And coeff1/coeff2 are the coefficients from this array that are used for weighing the history samples
+		std::array<s16, 16> adpcmCoefficients;
+		s16 history1;  // y[n-1], the previous output sample
+		s16 history2;  // y[n-2], the previous previous output sample
+
+		SampleBuffer currentSamples;
 		int index = 0;  // Index of the voice in [0, 23] for debugging
 
 		void reset();
+		// Remove the buffer at the top of the priority queue and hand it back by value. The queue must not be empty
+		Buffer popBuffer() {
+			assert(!buffers.empty());
+
+			// top() only exposes a const reference, so copy the buffer out before popping it
+			Buffer front = buffers.top();
+			buffers.pop();
+			return front;
+		}
+
 		DSPSource() { reset(); }
 	};
 
@@ -61,7 +87,7 @@ namespace Audio {
 
 		template <typename T, usize channelCount>
 		using Frame = std::array<Sample<T, channelCount>, 160>;
-		
+
 		template <typename T>
 		using MonoFrame = Frame<T, 1>;
 
@@ -72,6 +98,8 @@ namespace Audio {
 		using QuadFrame = Frame<T, 4>;
 
 		using Source = Audio::DSPSource;
+		using SampleBuffer = Source::SampleBuffer;
+
 	  private:
 		enum class DSPState : u32 {
 			Off,
@@ -91,7 +119,7 @@ namespace Audio {
 		SourceType sourceType = SourceType::Stereo;
 
 		void resetAudioPipe();
-		bool loaded = false; // Have we loaded a component?
+		bool loaded = false;  // Have we loaded a component?
 
 		// Get the index for the current region we'll be reading. Returns the region with the highest frame counter
 		// Accounting for whether one of the frame counters has wrapped around
@@ -130,10 +158,13 @@ namespace Audio {
 			}
 		}
 
-		void updateSourceConfig(Source& source, HLE::SourceConfiguration::Configuration& config);
+		void updateSourceConfig(Source& source, HLE::SourceConfiguration::Configuration& config, s16_le* adpcmCoefficients);
 		void generateFrame(StereoFrame<s16>& frame);
 		void outputFrame();
-		void dumpBuffer(const Source::Buffer& buffer);
+		// Decode an entire buffer worth of audio
+		void decodeBuffer(DSPSource& source);
+		SampleBuffer decodeADPCM(const u8* data, usize sampleCount, Source& source);
+
 	  public:
 		HLE_DSP(Memory& mem, Scheduler& scheduler, DSPService& dspService);
 		~HLE_DSP() override {}
diff --git a/src/core/audio/hle_core.cpp b/src/core/audio/hle_core.cpp
index 245894ce..4ee5a1dc 100644
--- a/src/core/audio/hle_core.cpp
+++ b/src/core/audio/hle_core.cpp
@@ -1,5 +1,7 @@
 #include "audio/hle_core.hpp"
 
+#include <algorithm>
+#include <cassert>
 #include <thread>
 #include <utility>
 
@@ -105,7 +107,7 @@ namespace Audio {
 		outputFrame();
 		scheduler.addEvent(Scheduler::EventType::RunDSP, scheduler.currentTimestamp + Audio::cyclesPerFrame);
 	}
-	
+
 	u16 HLE_DSP::recvData(u32 regId) {
 		if (regId != 0) {
 			Helpers::panic("Audio: invalid register in HLE frontend");
@@ -139,14 +141,11 @@ namespace Audio {
 							// TODO: Other initialization stuff here
 							dspState = DSPState::On;
 							resetAudioPipe();
-							
+
 							dspService.triggerPipeEvent(DSPPipeType::Audio);
 							break;
 
-						case StateChange::Shutdown:
-							dspState = DSPState::Off;
-							break;
-
+						case StateChange::Shutdown: dspState = DSPState::Off; break;
 						default: Helpers::panic("Unimplemented DSP audio pipe state change %d", state);
 					}
 				}
@@ -210,7 +209,7 @@ namespace Audio {
 			// Update source configuration from the read region of shared memory
 			auto& config = read.sourceConfigurations.config[i];
 			auto& source = sources[i];
-			updateSourceConfig(source, config);
+			updateSourceConfig(source, config, read.adpcmCoefficients.coeff[i]);
 
 			// Generate audio
 			if (source.enabled && !source.buffers.empty()) {
@@ -229,7 +228,7 @@ namespace Audio {
 		}
 	}
 
-	void HLE_DSP::updateSourceConfig(Source& source, HLE::SourceConfiguration::Configuration& config) {
+	void HLE_DSP::updateSourceConfig(Source& source, HLE::SourceConfiguration::Configuration& config, s16_le* adpcmCoefficients) {
 		// Check if the any dirty bit is set, otherwise exit early
 		if (!config.dirtyRaw) {
 			return;
@@ -245,6 +244,15 @@ namespace Audio {
 			source.syncCount = config.syncCount;
 		}
 
+		if (config.adpcmCoefficientsDirty) {
+			config.adpcmCoefficientsDirty = 0;
+			// Convert the ADPCM coefficients in DSP shared memory from s16_le to s16 and cache them in source.adpcmCoefficients
+			std::transform(
+				adpcmCoefficients, adpcmCoefficients + source.adpcmCoefficients.size(), source.adpcmCoefficients.begin(),
+				[](const s16_le& input) -> s16 { return s16(input); }
+			);
+		}
+
 		if (config.resetFlag) {
 			config.resetFlag = 0;
 			source.reset();
@@ -254,7 +262,7 @@ namespace Audio {
 			config.partialResetFlag = 0;
 			source.buffers = {};
 		}
-		
+
 		// TODO: Should we check bufferQueueDirty here too?
 		if (config.formatDirty || config.embeddedBufferDirty) {
 			sampleFormat = config.format;
@@ -302,6 +310,107 @@ namespace Audio {
 		config.dirtyRaw = 0;
 	}
 
+	void HLE_DSP::decodeBuffer(DSPSource& source) {
+		// Bail out early when the source has nothing queued; there is no buffer to decode
+		if (source.buffers.empty()) {
+			return;
+		}
+
+		// Take the next buffer off the queue. A buffer flagged adpcmDirty carries its own y[n-1]/y[n-2] to seed the decoder history
+		const DSPSource::Buffer next = source.popBuffer();
+		if (next.adpcmDirty) {
+			source.history1 = next.previousSamples[0];
+			source.history2 = next.previousSamples[1];
+		}
+
+		// The buffer has already been dequeued at this point, so a null pointer here means the buffer is simply dropped
+		const u8* const bytes = getPointerPhys<u8>(next.paddr);
+		if (bytes == nullptr) {
+			return;
+		}
+
+		// Only ADPCM is decoded; the PCM formats and unknown formats just log a warning and leave currentSamples as it was
+		const auto format = next.format;
+		if (format == SampleFormat::ADPCM) source.currentSamples = decodeADPCM(bytes, next.sampleCount, source);
+		else if (format == SampleFormat::PCM8 || format == SampleFormat::PCM16) Helpers::warn("Unimplemented sample format!");
+		else Helpers::warn("Invalid DSP sample format");
+	}
+
+	HLE_DSP::SampleBuffer HLE_DSP::decodeADPCM(const u8* data, usize sampleCount, Source& source) {
+		// Decodes sampleCount 4-bit ADPCM samples starting at data into stereo PCM16 (each mono sample is written to both channels)
+		// The data is laid out in blocks: a 1-byte header holding the scale and predictor, followed by 7 bytes packing 14 4bpp samples
+		// source supplies the coefficient table and the y[n-1]/y[n-2] history, and receives the updated history when decoding is done
+		static constexpr usize samplesPerBlock = 14;
+
+		// How many ADPCM blocks we'll be consuming. It's sampleCount / samplesPerBlock, rounded up.
+		const usize blockCount = (sampleCount + (samplesPerBlock - 1)) / samplesPerBlock;
+		const usize outputSize = sampleCount + (sampleCount & 1);  // Samples are decoded in pairs, so round the output size up to even
+
+		usize outputCount = 0;  // How many stereo samples have we output thus far?
+		SampleBuffer decodedSamples(outputSize);
+
+		s16 history1 = source.history1;
+		s16 history2 = source.history2;
+
+		// Decode samples block by block. Stop when we reach sampleCount samples
+		for (usize blockIndex = 0; blockIndex < blockCount; blockIndex++) {
+			const u8 scaleAndPredictor = *data++;
+
+			// Kept signed so that multiplying it with the (signed) differential below doesn't mix signed and unsigned operands
+			const s32 scale = s32(1) << (scaleAndPredictor & 0xF);
+			// This is referred to as 4-bit in some documentation, but I am pretty sure that's a mistake
+			const u32 predictor = (scaleAndPredictor >> 4) & 0x7;
+
+			// Fixed point (s5.11) coefficients for the history samples, widened so the products below are computed in (at least) 64 bits
+			const long long weight1 = source.adpcmCoefficients[predictor * 2];
+			const long long weight2 = source.adpcmCoefficients[predictor * 2 + 1];
+
+			// Turns one raw 4-bit differential into a PCM16 sample and advances the history. Scale and weights are fixed for the block
+			const auto decode = [&](s32 nibble) -> s16 {
+				static constexpr s32 ONE = 0x800;     // 1.0 in S5.11 fixed point
+				static constexpr s32 HALF = ONE / 2;  // 0.5 similarly
+
+				// Sign extend our nibble from s4 to s32 without shifting into the sign bit: [0, 7] stays as is, [8, 15] maps to [-8, -1]
+				nibble = (nibble ^ 0x8) - 0x8;
+
+				// Scale the extended nibble by the scale specified in the ADPCM block header, to get the real value of the sample's differential
+				const long long diff = nibble * scale;
+
+				// Convert ADPCM to PCM using y[n] = x[n] + 0.5 + coeff1 * y[n - 1] + coeff2 * y[n - 2]
+				// The coefficients are in s5.11 fixed point so we also perform the proper conversions
+				// The sum is accumulated in a wider type: with large coefficients and history samples it can exceed the s32 range,
+				// which would be signed integer overflow. The result is clamped to the s16 range right after either way
+				const long long sum = diff * ONE + HALF + weight1 * history1 + weight2 * history2;
+				const s32 output = s32(std::clamp<long long>(sum >> 11, -32768, 32767));
+
+				// Write back new history samples
+				history2 = history1;     // y[n-2] = y[n-1]
+				history1 = s16(output);  // y[n-1] = y[n]
+
+				return s16(output);
+			};
+
+			// Decode samples in batches of 2
+			// Each 4 bit ADPCM differential corresponds to 1 mono sample which will be output from both the left and right channel
+			// So each byte of ADPCM data ends up generating 2 stereo samples
+			for (usize sampleIndex = 0; sampleIndex < samplesPerBlock && outputCount < sampleCount; sampleIndex += 2) {
+				const u8 samples = *data++;                   // Fetch the byte containing 2 4-bpp samples
+				const s32 topNibble = s32(samples) >> 4;      // First sample
+				const s32 bottomNibble = s32(samples) & 0xF;  // Second sample
+
+				// Decode and write first sample, then the second one. The order matters since each decode updates the history
+				decodedSamples[outputCount].fill(decode(topNibble));
+				decodedSamples[outputCount + 1].fill(decode(bottomNibble));
+				outputCount += 2;
+			}
+		}
+
+		// Store new history samples in the DSP source and return samples
+		source.history1 = history1;
+		source.history2 = history2;
+		return decodedSamples;
+	}
+
 	void DSPSource::reset() {
 		enabled = false;
 		syncCount = 0;