From 5d509181c6ef768aedfadd63d56b9b20c950fb5d Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 22 Jun 2026 17:10:42 -0500 Subject: [PATCH 1/7] Prevent event loss when a disk write fails during Flush() OfflineStorageHandler::Flush() moved records out of the in-memory queue with GetRecords() (which removes them) before handing them to the disk store via StoreRecords(). If the disk write failed for some or all records (e.g. a SQLite write error), those records had already been removed from memory and were never re-queued, so events were silently lost -- only the smaller totalSaved count hinted at it. Reserve-then-confirm-delete instead: - Reserve the records in memory (GetAndReserveRecords with a nominal lease) so the originals stay retrievable rather than being dropped outright. - Persist them one at a time, recording exactly which ids succeeded and failed. - DeleteRecords() only the persisted ids (now safely on disk). - ReleaseRecords() the failed ids back into the in-memory queue so they are retried on a subsequent flush instead of being lost. Added OfflineStorageHandlerFlushTests.FailedDiskWriteDuringFlushReturnsRecordsToMemory: records that the SQLite store rejects (timestamp<=0) remain in memory after Flush(). Verified the test fails against the previous GetRecords()-based Flush (GetRecordCount()==0) and passes with this change. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- lib/offline/OfflineStorageHandler.cpp | 57 ++++++++++++++------- tests/unittests/OfflineStorageTests.cpp | 66 +++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 17 deletions(-) diff --git a/lib/offline/OfflineStorageHandler.cpp b/lib/offline/OfflineStorageHandler.cpp index 9049339c4..9b3169e7d 100644 --- a/lib/offline/OfflineStorageHandler.cpp +++ b/lib/offline/OfflineStorageHandler.cpp @@ -177,28 +177,51 @@ namespace MAT_NS_BEGIN { size_t dbSizeBeforeFlush = m_offlineStorageMemory->GetSize(); if ((m_offlineStorageMemory) && (dbSizeBeforeFlush > 0) && (m_offlineStorageDisk)) { - // This will block on and then take a lock for the duration of this move, and - // StoreRecord() will then block until the move completes. - auto records = m_offlineStorageMemory->GetRecords(false, EventLatency_Unspecified); - std::vector ids; - - // TODO: [MG] - consider running the batch in transaction - // if (sqlite) - // sqlite->Execute("BEGIN"); - - size_t totalSaved = m_offlineStorageDisk->StoreRecords(records); - - // TODO: [MG] - consider running the batch in transaction - // if (sqlite) - // sqlite->Execute("END"); + // Reserve the records in memory (with a nominal lease) instead of + // removing them outright. The lease duration is not time-critical + // here because the records are persisted and resolved synchronously + // under m_flushLock; reserving simply keeps the originals retrievable + // so any that fail to persist can be returned to the queue. + const unsigned reserveLeaseMs = 120000; + std::vector records; + auto consumer = [&records](StorageRecord&& record) -> bool { + records.push_back(std::move(record)); + return true; + }; + m_offlineStorageMemory->GetAndReserveRecords(consumer, reserveLeaseMs, EventLatency_Unspecified); + + // Persist to disk one record at a time, tracking exactly which records + // were stored so we only drop the persisted ones from memory. + std::vector persistedIds; + std::vector failedIds; + persistedIds.reserve(records.size()); + for (auto& record : records) + { + if (m_offlineStorageDisk->StoreRecord(record)) + persistedIds.push_back(record.id); + else + failedIds.push_back(record.id); + } - // Delete records from reserved on flush HttpHeaders dummy; bool fromMemory = true; - m_offlineStorageMemory->DeleteRecords(ids, dummy, fromMemory); + + // Persisted records are now safely on disk: drop them from memory. + if (!persistedIds.empty()) + m_offlineStorageMemory->DeleteRecords(persistedIds, dummy, fromMemory); + + // Records that failed to persist are returned to the in-memory queue + // so a partial or total disk write failure does not silently lose + // events; they are retried on a subsequent flush. + if (!failedIds.empty()) + { + LOG_WARN("Flush: %zu of %zu records failed to persist to disk; returning them to the queue for retry", + failedIds.size(), records.size()); + m_offlineStorageMemory->ReleaseRecords(failedIds, false, dummy, fromMemory); + } // Notify event listener about the records cached - OnStorageRecordsSaved(totalSaved); + OnStorageRecordsSaved(persistedIds.size()); if (m_offlineStorageMemory->GetSize() > dbSizeBeforeFlush) { diff --git a/tests/unittests/OfflineStorageTests.cpp b/tests/unittests/OfflineStorageTests.cpp index bbb8da8e0..36da3029b 100644 --- a/tests/unittests/OfflineStorageTests.cpp +++ b/tests/unittests/OfflineStorageTests.cpp @@ -2,7 +2,13 @@ #include "common/Common.hpp" #include "common/MockIOfflineStorage.hpp" +#include "common/MockIOfflineStorageObserver.hpp" +#include "common/MockIRuntimeConfig.hpp" +#include "offline/OfflineStorageHandler.hpp" #include "offline/StorageObserver.hpp" +#include "NullObjects.hpp" + +#include using namespace testing; using namespace MAT; @@ -162,3 +168,63 @@ TEST_F(OfflineStorageTests, ReleaseRecordsIsForwarded) .WillOnce(Return()); EXPECT_THAT(offlineStorage.releaseRecordsIncRetryCount(ctx), true); } + +namespace +{ + // Dispatcher that drops queued work, so flushes only run when invoked directly. + class NoopTaskDispatcher : public ITaskDispatcher + { + public: + void Join() override {} + void Queue(Task* task) override { UNREFERENCED_PARAMETER(task); } + bool Cancel(Task* task, uint64_t waitTime = 0) override + { + UNREFERENCED_PARAMETER(task); + UNREFERENCED_PARAMETER(waitTime); + return true; + } + }; +} + +// Regression test: when records pulled from the in-memory queue fail to persist +// to disk during Flush(), they must be returned to the queue rather than lost. +TEST(OfflineStorageHandlerFlushTests, FailedDiskWriteDuringFlushReturnsRecordsToMemory) +{ + NullLogManager logManager; + NiceMock config; + NoopTaskDispatcher dispatcher; + NiceMock observer; + + ON_CALL(config, GetOfflineStorageMaximumSizeBytes()).WillByDefault(Return(32 * 4096)); + ON_CALL(config, GetMaximumRetryCount()).WillByDefault(Return(5)); + + std::ostringstream dbPath; + dbPath << GetTempDirectory() << "FlushReserveTest.db"; + std::remove(dbPath.str().c_str()); + config[CFG_STR_CACHE_FILE_PATH] = dbPath.str(); + config[CFG_INT_RAM_QUEUE_SIZE] = 1024 * 1024; // enable the in-memory queue + + OfflineStorageHandler handler(logManager, config, dispatcher); + handler.Initialize(observer); + + // timestamp <= 0 is accepted by the in-memory queue but rejected by the + // SQLite disk store, simulating a disk write failure during flush. + const size_t kCount = 5; + for (size_t i = 0; i < kCount; i++) + { + StorageRecord r("flush-id-" + std::to_string(i), "tenant-token", + EventLatency_Normal, EventPersistence_Normal, /*timestamp*/ 0, + std::vector{ 'x' }); + handler.StoreRecord(r); + } + EXPECT_EQ(handler.GetRecordCount(), kCount); + + handler.Flush(); + + // The disk rejected every record; with the fix they are returned to the + // in-memory queue rather than silently dropped. + EXPECT_EQ(handler.GetRecordCount(), kCount); + + handler.Shutdown(); + std::remove(dbPath.str().c_str()); +} From 202e8c3d67e2a6894abe94038c8224a71fd7f7a1 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 22 Jun 2026 17:44:00 -0500 Subject: [PATCH 2/7] Address Copilot round-1 comments OfflineStorageTests.cpp: add explicit #include for std::remove (was relying on transitive includes), and clean up the SQLite WAL companion files (-wal/-shm/-journal) in addition to the main .db via a RemoveDbFiles() helper, so the test does not leave temp files accumulating in the temp directory. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/unittests/OfflineStorageTests.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tests/unittests/OfflineStorageTests.cpp b/tests/unittests/OfflineStorageTests.cpp index 36da3029b..bb412f2af 100644 --- a/tests/unittests/OfflineStorageTests.cpp +++ b/tests/unittests/OfflineStorageTests.cpp @@ -8,6 +8,7 @@ #include "offline/StorageObserver.hpp" #include "NullObjects.hpp" +#include #include using namespace testing; @@ -171,6 +172,16 @@ TEST_F(OfflineStorageTests, ReleaseRecordsIsForwarded) namespace { + // Remove a SQLite db file along with its WAL-mode companion files + // (-wal/-shm/-journal), which would otherwise accumulate in the temp dir. + void RemoveDbFiles(const std::string& path) + { + std::remove(path.c_str()); + std::remove((path + "-wal").c_str()); + std::remove((path + "-shm").c_str()); + std::remove((path + "-journal").c_str()); + } + // Dispatcher that drops queued work, so flushes only run when invoked directly. class NoopTaskDispatcher : public ITaskDispatcher { @@ -200,7 +211,7 @@ TEST(OfflineStorageHandlerFlushTests, FailedDiskWriteDuringFlushReturnsRecordsTo std::ostringstream dbPath; dbPath << GetTempDirectory() << "FlushReserveTest.db"; - std::remove(dbPath.str().c_str()); + RemoveDbFiles(dbPath.str()); config[CFG_STR_CACHE_FILE_PATH] = dbPath.str(); config[CFG_INT_RAM_QUEUE_SIZE] = 1024 * 1024; // enable the in-memory queue @@ -226,5 +237,5 @@ TEST(OfflineStorageHandlerFlushTests, FailedDiskWriteDuringFlushReturnsRecordsTo EXPECT_EQ(handler.GetRecordCount(), kCount); handler.Shutdown(); - std::remove(dbPath.str().c_str()); + RemoveDbFiles(dbPath.str()); } From d79d6cf7be30368adcea42037ff55a12ef234d01 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 22 Jun 2026 18:00:48 -0500 Subject: [PATCH 3/7] Address Copilot round-2 comments OfflineStorageHandler::Flush(): null-guard the m_offlineStorageMemory->GetSize() read so Flush() is safe when the RAM queue is disabled (CFG_INT_RAM_QUEUE_SIZE==0) and only disk storage exists. OfflineStorageTests.cpp: use a unique per-run temp DB filename (append getUtcSystemTimeMs()) to avoid cross-run/parallel interference; keep cleanup best-effort. Per-record StoreRecord() in Flush is kept deliberately and now documented: a batched StoreRecords() only returns a count, so on a partial failure we could not tell which records to delete vs. retry, and re-storing already-persisted records would duplicate them (events table has no unique record_id constraint). The Room batched-JNI efficiency trade-off is noted for a possible follow-up (a batch disk-store API that reports per-record success). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- lib/offline/OfflineStorageHandler.cpp | 8 ++++++-- tests/unittests/OfflineStorageTests.cpp | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/lib/offline/OfflineStorageHandler.cpp b/lib/offline/OfflineStorageHandler.cpp index 9b3169e7d..d68d42132 100644 --- a/lib/offline/OfflineStorageHandler.cpp +++ b/lib/offline/OfflineStorageHandler.cpp @@ -174,7 +174,7 @@ namespace MAT_NS_BEGIN { // than the handle gets replaced by nullptr in this DeferredCallbackHandle obj. m_flushHandle.Cancel(); - size_t dbSizeBeforeFlush = m_offlineStorageMemory->GetSize(); + size_t dbSizeBeforeFlush = (m_offlineStorageMemory != nullptr) ? m_offlineStorageMemory->GetSize() : 0; if ((m_offlineStorageMemory) && (dbSizeBeforeFlush > 0) && (m_offlineStorageDisk)) { // Reserve the records in memory (with a nominal lease) instead of @@ -191,7 +191,11 @@ namespace MAT_NS_BEGIN { m_offlineStorageMemory->GetAndReserveRecords(consumer, reserveLeaseMs, EventLatency_Unspecified); // Persist to disk one record at a time, tracking exactly which records - // were stored so we only drop the persisted ones from memory. + // were stored so we only drop the persisted ones from memory. This + // intentionally favors correctness over batching: a batched StoreRecords() + // only returns a count, so on a partial failure we could not tell which + // records to keep vs. retry, and re-storing already-saved records would + // duplicate them (the events table has no unique record_id constraint). std::vector persistedIds; std::vector failedIds; persistedIds.reserve(records.size()); diff --git a/tests/unittests/OfflineStorageTests.cpp b/tests/unittests/OfflineStorageTests.cpp index bb412f2af..aa828dc6a 100644 --- a/tests/unittests/OfflineStorageTests.cpp +++ b/tests/unittests/OfflineStorageTests.cpp @@ -210,7 +210,7 @@ TEST(OfflineStorageHandlerFlushTests, FailedDiskWriteDuringFlushReturnsRecordsTo ON_CALL(config, GetMaximumRetryCount()).WillByDefault(Return(5)); std::ostringstream dbPath; - dbPath << GetTempDirectory() << "FlushReserveTest.db"; + dbPath << GetTempDirectory() << "FlushReserveTest-" << PAL::getUtcSystemTimeMs() << ".db"; RemoveDbFiles(dbPath.str()); config[CFG_STR_CACHE_FILE_PATH] = dbPath.str(); config[CFG_INT_RAM_QUEUE_SIZE] = 1024 * 1024; // enable the in-memory queue From 342be951e1b7457fa853a198ed17fa5e016b00ea Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 22 Jun 2026 20:30:39 -0500 Subject: [PATCH 4/7] Address Copilot round-3 comments OfflineStorage_SQLite::StoreRecord: return false when the INSERT execute() fails (was ignoring the result and returning true even on a real write failure). The Flush() reserve/confirm-delete logic in this PR relies on the disk backend reporting per-record failure; without this, a failed sqlite3_step would still be treated as persisted and dropped from memory. (This is the same one-line fix as PR #1491, included here so this PR is correct on its own; trivial overlap that resolves cleanly at merge.) OfflineStorageTests.cpp NoopTaskDispatcher: own queued tasks and delete them on Cancel()/Join()/destruction instead of dropping the Task* (PAL::scheduleTask allocates with new and expects the dispatcher to take ownership), so the helper cannot leak. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- lib/offline/OfflineStorage_SQLite.cpp | 8 +++++++- tests/unittests/OfflineStorageTests.cpp | 24 ++++++++++++++++++++---- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/lib/offline/OfflineStorage_SQLite.cpp b/lib/offline/OfflineStorage_SQLite.cpp index b9b2ed83d..b1c7c82b4 100644 --- a/lib/offline/OfflineStorage_SQLite.cpp +++ b/lib/offline/OfflineStorage_SQLite.cpp @@ -177,7 +177,13 @@ namespace MAT_NS_BEGIN { return false; } #endif - SqliteStatement(*m_db, m_stmtInsertEvent_id_tenant_prio_ts_data).execute(record.id, record.tenantToken, static_cast(record.latency), static_cast(record.persistence), record.timestamp, record.blob); + if (!SqliteStatement(*m_db, m_stmtInsertEvent_id_tenant_prio_ts_data).execute(record.id, record.tenantToken, static_cast(record.latency), static_cast(record.persistence), record.timestamp, record.blob)) + { + LOG_ERROR("Failed to store event %s:%s: database write failed", + tenantTokenToId(record.tenantToken).c_str(), record.id.c_str()); + m_observer->OnStorageFailed("Database write failed"); + return false; + } m_DbSizeEstimate += record.id.size() + record.tenantToken.size() + record.blob.size(); } diff --git a/tests/unittests/OfflineStorageTests.cpp b/tests/unittests/OfflineStorageTests.cpp index aa828dc6a..7c2950ca8 100644 --- a/tests/unittests/OfflineStorageTests.cpp +++ b/tests/unittests/OfflineStorageTests.cpp @@ -182,18 +182,34 @@ namespace std::remove((path + "-journal").c_str()); } - // Dispatcher that drops queued work, so flushes only run when invoked directly. + // No-op dispatcher that owns queued tasks and frees them, so flushes only + // run when invoked directly and scheduled tasks (if any) are not leaked. class NoopTaskDispatcher : public ITaskDispatcher { public: - void Join() override {} - void Queue(Task* task) override { UNREFERENCED_PARAMETER(task); } + void Join() override { clear(); } + void Queue(Task* task) override { m_tasks.push_back(task); } bool Cancel(Task* task, uint64_t waitTime = 0) override { - UNREFERENCED_PARAMETER(task); UNREFERENCED_PARAMETER(waitTime); + auto it = std::find(m_tasks.begin(), m_tasks.end(), task); + if (it != m_tasks.end()) + { + delete *it; + m_tasks.erase(it); + } return true; } + ~NoopTaskDispatcher() override { clear(); } + + private: + void clear() + { + for (auto* t : m_tasks) + delete t; + m_tasks.clear(); + } + std::vector m_tasks; }; } From 9d1b92a315e505c1b81aa3d57119de5602ee2bd6 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 22 Jun 2026 20:40:56 -0500 Subject: [PATCH 5/7] Address Copilot round-4 comment: don't carry reservation lease onto disk Flush() reserves records with a non-zero lease, which stamps reservedUntil on the records handed to the consumer. SQLite ignores reservedUntil on insert, but the Room backend persists it, so freshly flushed records would be written as "reserved" for ~2 minutes and become temporarily ineligible for upload selection. Clear record.reservedUntil before the disk StoreRecord() so the in-memory lease never leaks to durable storage. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- lib/offline/OfflineStorageHandler.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/offline/OfflineStorageHandler.cpp b/lib/offline/OfflineStorageHandler.cpp index d68d42132..828488a6c 100644 --- a/lib/offline/OfflineStorageHandler.cpp +++ b/lib/offline/OfflineStorageHandler.cpp @@ -201,6 +201,11 @@ namespace MAT_NS_BEGIN { persistedIds.reserve(records.size()); for (auto& record : records) { + // The in-memory reservation lease must not be carried onto disk: + // SQLite ignores reservedUntil, but the Room backend persists it, + // which would make freshly flushed records temporarily ineligible + // for upload selection. + record.reservedUntil = 0; if (m_offlineStorageDisk->StoreRecord(record)) persistedIds.push_back(record.id); else From 123440ea084fd3c904da33a278d0268a11a0ddd2 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 22 Jun 2026 20:49:57 -0500 Subject: [PATCH 6/7] Address Copilot round-5 comment: clarify test mechanism wording The test comment said it simulates a "disk write failure"; in fact StoreRecord() fails via the SQLite backend's input validation (timestamp <= 0), not a literal disk I/O error. Reword to describe the actual mechanism while noting it exercises the same Flush() failure-handling path (a failed record is returned to memory). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/unittests/OfflineStorageTests.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/unittests/OfflineStorageTests.cpp b/tests/unittests/OfflineStorageTests.cpp index 7c2950ca8..1bc834755 100644 --- a/tests/unittests/OfflineStorageTests.cpp +++ b/tests/unittests/OfflineStorageTests.cpp @@ -234,8 +234,10 @@ TEST(OfflineStorageHandlerFlushTests, FailedDiskWriteDuringFlushReturnsRecordsTo OfflineStorageHandler handler(logManager, config, dispatcher); handler.Initialize(observer); - // timestamp <= 0 is accepted by the in-memory queue but rejected by the - // SQLite disk store, simulating a disk write failure during flush. + // A timestamp <= 0 is accepted by the in-memory queue but rejected by the + // SQLite disk store's input validation, so its StoreRecord() returns false. + // This drives the same Flush() failure-handling path as a disk write failure + // (a failed record must be returned to memory, not dropped). const size_t kCount = 5; for (size_t i = 0; i < kCount; i++) { From ce957eeb3320556591a9fb6d744965cdbe3ac55d Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 22 Jun 2026 21:05:46 -0500 Subject: [PATCH 7/7] Address Copilot round-6: simplify Flush to drain-and-re-insert-failed Replace the reserve/confirm-delete approach with the simpler drain-and-re-insert variant Copilot originally suggested. Flush() now drains the in-memory queue into a single local batch (GetRecords), persists each record, and re-inserts only the records that fail to persist. This keeps the data-loss fix (failed records are returned to memory, not dropped) while resolving two issues with the reserve approach: - Peak memory: it no longer holds two full copies of every record (reserved map + local vector) during the disk-write loop -- only a single copy is in flight. - It no longer sets a reservation lease, so there is no reservedUntil to clear and no risk of the Room backend persisting a stale reservation. Per-record StoreRecord() is still used deliberately (documented): a batched StoreRecords() only returns a count, so on partial failure we could not tell which records to re-queue, and re-storing already-saved records would duplicate them (no unique record_id constraint). The Room per-record JNI cost remains a possible follow-up that would need a public IOfflineStorage batch-with-results API. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- lib/offline/OfflineStorageHandler.cpp | 72 +++++++++++---------------- 1 file changed, 28 insertions(+), 44 deletions(-) diff --git a/lib/offline/OfflineStorageHandler.cpp b/lib/offline/OfflineStorageHandler.cpp index 828488a6c..90f9c4a27 100644 --- a/lib/offline/OfflineStorageHandler.cpp +++ b/lib/offline/OfflineStorageHandler.cpp @@ -177,60 +177,44 @@ namespace MAT_NS_BEGIN { size_t dbSizeBeforeFlush = (m_offlineStorageMemory != nullptr) ? m_offlineStorageMemory->GetSize() : 0; if ((m_offlineStorageMemory) && (dbSizeBeforeFlush > 0) && (m_offlineStorageDisk)) { - // Reserve the records in memory (with a nominal lease) instead of - // removing them outright. The lease duration is not time-critical - // here because the records are persisted and resolved synchronously - // under m_flushLock; reserving simply keeps the originals retrievable - // so any that fail to persist can be returned to the queue. - const unsigned reserveLeaseMs = 120000; - std::vector records; - auto consumer = [&records](StorageRecord&& record) -> bool { - records.push_back(std::move(record)); - return true; - }; - m_offlineStorageMemory->GetAndReserveRecords(consumer, reserveLeaseMs, EventLatency_Unspecified); - - // Persist to disk one record at a time, tracking exactly which records - // were stored so we only drop the persisted ones from memory. This - // intentionally favors correctness over batching: a batched StoreRecords() - // only returns a count, so on a partial failure we could not tell which - // records to keep vs. retry, and re-storing already-saved records would - // duplicate them (the events table has no unique record_id constraint). - std::vector persistedIds; - std::vector failedIds; - persistedIds.reserve(records.size()); + // Drain the in-memory queue into a local batch. Records are removed + // from memory here; any that fail to persist below are re-inserted, so + // a disk write failure does not silently lose events. Draining (rather + // than reserving) keeps only a single copy of each record in flight and + // avoids stamping a reservation lease that the Room backend would + // persist to disk. + auto records = m_offlineStorageMemory->GetRecords(false, EventLatency_Unspecified); + + // Persist one record at a time so we know exactly which succeeded. A + // batched StoreRecords() only returns a count, so on a partial failure + // we could not tell which records to re-queue, and re-storing + // already-saved records would duplicate them (the events table has no + // unique record_id constraint). + size_t totalSaved = 0; + size_t totalFailed = 0; for (auto& record : records) { - // The in-memory reservation lease must not be carried onto disk: - // SQLite ignores reservedUntil, but the Room backend persists it, - // which would make freshly flushed records temporarily ineligible - // for upload selection. - record.reservedUntil = 0; if (m_offlineStorageDisk->StoreRecord(record)) - persistedIds.push_back(record.id); + { + ++totalSaved; + } else - failedIds.push_back(record.id); + { + // Return the record to the in-memory queue for retry on a + // subsequent flush instead of dropping it. + ++totalFailed; + m_offlineStorageMemory->StoreRecord(record); + } } - HttpHeaders dummy; - bool fromMemory = true; - - // Persisted records are now safely on disk: drop them from memory. - if (!persistedIds.empty()) - m_offlineStorageMemory->DeleteRecords(persistedIds, dummy, fromMemory); - - // Records that failed to persist are returned to the in-memory queue - // so a partial or total disk write failure does not silently lose - // events; they are retried on a subsequent flush. - if (!failedIds.empty()) + if (totalFailed > 0) { - LOG_WARN("Flush: %zu of %zu records failed to persist to disk; returning them to the queue for retry", - failedIds.size(), records.size()); - m_offlineStorageMemory->ReleaseRecords(failedIds, false, dummy, fromMemory); + LOG_WARN("Flush: %zu of %zu records failed to persist to disk; returned to the queue for retry", + totalFailed, records.size()); } // Notify event listener about the records cached - OnStorageRecordsSaved(persistedIds.size()); + OnStorageRecordsSaved(totalSaved); if (m_offlineStorageMemory->GetSize() > dbSizeBeforeFlush) {