From 0996252c8fa254923a5b2cba890974b57c4d6b99 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 16 Jul 2023 20:19:52 +0300
Subject: [PATCH] Approximate VFP timings

---
 src/core/CPU/dynarmic_cycles.cpp | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/src/core/CPU/dynarmic_cycles.cpp b/src/core/CPU/dynarmic_cycles.cpp
index e85abc71..25b57efd 100644
--- a/src/core/CPU/dynarmic_cycles.cpp
+++ b/src/core/CPU/dynarmic_cycles.cpp
@@ -98,6 +98,23 @@ namespace {
 		return 1 + std::popcount(i.template Get<"x">()) / 2;
 	}
 
+	u64 LoadStoreVectorMultiple(auto i, bool is64Bit) {
+		const size_t rd = i.template Get<"d">();
+		size_t regs = i.template Get<"v">();
+		if (is64Bit) {
+			regs /= 2;
+		}
+
+		// Invalid configuration
+		if (regs == 0 || rd + regs > 32) {
+			return 1;
+		}
+
+		// 1 cycle base cost, then +1 cycle for every 2 registers in the register list.
+		// See FLDM/FSTM here: https://developer.arm.com/documentation/ddi0274/h/instruction-execution/execution-timing
+		return 1 + regs / 2;
+	}
+
 	u64 SupervisorCall(auto i) {
 		// Consume extra cycles for the GetSystemTick SVC since some games wait with it in a loop rather than
 		// Properly sleeping until a VBlank interrupt
@@ -389,6 +406,21 @@ namespace {
 		INST("RFE", "1111100--0-1----0000101000000000", 9) // v6
 		INST("SRS", "1111100--1-0110100000101000-----", 1) // v6
 
+		// We only approximate VFP timings: we assume a penalty for VFP memory loads and stores.
+		// However, we still consider arithmetic instructions to be 1 cycle, even relatively slow ones like vdiv.
+		// This is because VFP arithmetic instructions likely run asynchronously to the core integer pipeline, which
+		// means that a vdiv will possibly not stall. If we did apply a cycle penalty, the emulated CPU would likely run slower than real hardware.
+		// Approximating VFP timings more accurately will likely require a lot of baremetal research.
+		INST("VLDM.64", "cccc110pudw1nnnndddd1011vvvvvvvv", LoadStoreVectorMultiple(i, true)) // VFP v2
+		INST("VLDM.32", "cccc110pudw1nnnndddd1010vvvvvvvv", LoadStoreVectorMultiple(i, false)) // VFP v2
+		INST("VSTM.64", "cccc110pudw0nnnndddd1011vvvvvvvv", LoadStoreVectorMultiple(i, true)) // VFP v2
+		INST("VSTM.32", "cccc110pudw0nnnndddd1010vvvvvvvv", LoadStoreVectorMultiple(i, false)) // VFP v2
+
+		INST("VLDR.64", "cccc1101ud01nnnndddd1011vvvvvvvv", 2) // VFP v2
+		INST("VLDR.32", "cccc1101ud01nnnndddd1010vvvvvvvv", 2) // VFP v2
+		INST("VSTR.64", "cccc1101ud00nnnndddd1011vvvvvvvv", 2) // VFP v2
+		INST("VSTR.32", "cccc1101ud00nnnndddd1010vvvvvvvv", 2) // VFP v2
+
 		// clang-format on
 	};
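
Note (not part of the patch): the snippet below is a minimal standalone sketch of the FLDM/FSTM counting rule above, useful for eyeballing the expected costs. vfpLoadStoreMultipleCycles is a hypothetical helper that takes the raw d/imm8 fields directly instead of the emulator's decoder matcher object; only the arithmetic itself comes from the patch.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Mirrors the patch's LoadStoreVectorMultiple() formula: 1 base cycle plus 1 cycle
// per 2 transferred registers, with a 1 cycle fallback for invalid register lists.
// "imm8" is the raw 8-bit register count field ("v" in the decoder pattern).
static std::uint64_t vfpLoadStoreMultipleCycles(std::size_t rd, std::size_t imm8, bool is64Bit) {
	std::size_t regs = is64Bit ? imm8 / 2 : imm8;  // Doubles encode 2 words per register
	if (regs == 0 || rd + regs > 32) {
		return 1;  // Invalid configuration, charge the minimum
	}
	return 1 + regs / 2;
}

int main() {
	// VLDM.32 with 4 single registers (imm8 = 4): 1 + 4/2 = 3 cycles
	std::printf("4 singles: %llu cycles\n", static_cast<unsigned long long>(vfpLoadStoreMultipleCycles(0, 4, false)));
	// VLDM.64 with 8 double registers (imm8 = 16): regs = 8, so 1 + 8/2 = 5 cycles
	std::printf("8 doubles: %llu cycles\n", static_cast<unsigned long long>(vfpLoadStoreMultipleCycles(0, 16, true)));
	return 0;
}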