From ae1bb809907fceec072201d10a75de435313c6f8 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 22 Jun 2026 15:04:21 -0500 Subject: [PATCH 1/4] Harden runtime task execution and offline-storage edge cases These are latent correctness/robustness fixes found during a repo-wide review. WorkerThread.cpp: wrap the queued-task invocation `(*item)()` in try/catch. A task runs arbitrary work (storage I/O, HTTP encode, user DebugEventListener callbacks); an exception escaping the loop unwinds out of the thread entry function and calls std::terminate, killing the host process. Contain it and log. TaskDispatcher_CAPI.cpp: same exception barrier around `(*m_task)()` in Task_CAPI::OnCallback(), which runs on the host's external dispatcher thread. capi.cpp (mat_open_core): on the EALREADY path (a guest instance with the same config is already open) set ctx->handle to the existing instance's handle before returning, so the caller is not left with an uninitialized/stale handle. OfflineStorageHandler.cpp (Flush): null-guard m_offlineStorageMemory->GetSize() read; the subsequent block already null-checks the pointer, so reading it first was inconsistent and a potential null deref. OfflineStorageHandler (StoreRecord): the per-instance RAM cache size limit was held in a function-local `static`, so the first LogManager's CFG_INT_RAM_QUEUE_SIZE leaked to every other LogManager instance. Compute it once per instance in Initialize() into a member (preserving the original "compute once" intent). Tests: - PalTests.WorkerThreadContainsThrowingTask: a task that throws std/non-std exceptions does not tear down the worker thread; follow-up tasks still run. - TaskDispatcherCAPITests.ExecuteCallbackThatThrowsIsContained: a throwing CAPI task callback does not propagate back into the host dispatcher thread. Files changed: - lib/pal/WorkerThread.cpp - lib/pal/TaskDispatcher_CAPI.cpp - lib/api/capi.cpp - lib/offline/OfflineStorageHandler.cpp - lib/offline/OfflineStorageHandler.hpp - tests/unittests/PalTests.cpp - tests/unittests/TaskDispatcherCAPITests.cpp Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- lib/api/capi.cpp | 5 ++- lib/offline/OfflineStorageHandler.cpp | 13 ++++--- lib/offline/OfflineStorageHandler.hpp | 1 + lib/pal/TaskDispatcher_CAPI.cpp | 8 +++- lib/pal/WorkerThread.cpp | 14 ++++++- tests/unittests/PalTests.cpp | 41 +++++++++++++++++++++ tests/unittests/TaskDispatcherCAPITests.cpp | 21 +++++++++++ 7 files changed, 95 insertions(+), 8 deletions(-) diff --git a/lib/api/capi.cpp b/lib/api/capi.cpp index 916c4ebda..ef683e5cb 100644 --- a/lib/api/capi.cpp +++ b/lib/api/capi.cpp @@ -91,7 +91,10 @@ evt_status_t mat_open_core( { if (client->ctx_data == config) { - // Guest instance with the same config is already open + // Guest instance with the same config is already open. + // Return its handle so the caller still gets a usable handle + // (rather than leaving ctx->handle uninitialized). + ctx->handle = code; return EALREADY; } // hash code is assigned to another client, increment and retry diff --git a/lib/offline/OfflineStorageHandler.cpp b/lib/offline/OfflineStorageHandler.cpp index 9049339c4..52ce15515 100644 --- a/lib/offline/OfflineStorageHandler.cpp +++ b/lib/offline/OfflineStorageHandler.cpp @@ -33,6 +33,7 @@ namespace MAT_NS_BEGIN { m_shutdownStarted(false), m_memoryDbSize(0), m_queryDbSize(0), + m_cacheMemorySizeLimitInBytes(0), m_isStorageFullNotificationSend(false) { // TODO: [MG] - OfflineStorage_SQLite.cpp is performing similar checks @@ -83,7 +84,7 @@ namespace MAT_NS_BEGIN { void OfflineStorageHandler::Initialize(IOfflineStorageObserver& observer) { m_observer = &observer; - uint32_t cacheMemorySizeLimitInBytes = m_config[CFG_INT_RAM_QUEUE_SIZE]; + m_cacheMemorySizeLimitInBytes = m_config[CFG_INT_RAM_QUEUE_SIZE]; m_offlineStorageDisk = OfflineStorageFactory::Create(m_logManager, m_config); if (m_offlineStorageDisk) @@ -94,7 +95,7 @@ namespace MAT_NS_BEGIN { // TODO: [MG] - consider passing m_offlineStorageDisk to m_offlineStorageMemory, // so that the Flush() op on memory storage leads to saving unflushed events to // disk. - if (cacheMemorySizeLimitInBytes > 0) + if (m_cacheMemorySizeLimitInBytes > 0) { m_offlineStorageMemory.reset(new MemoryStorage(m_logManager, m_config)); m_offlineStorageMemory->Initialize(*this); @@ -174,7 +175,7 @@ namespace MAT_NS_BEGIN { // than the handle gets replaced by nullptr in this DeferredCallbackHandle obj. m_flushHandle.Cancel(); - size_t dbSizeBeforeFlush = m_offlineStorageMemory->GetSize(); + size_t dbSizeBeforeFlush = (m_offlineStorageMemory != nullptr) ? m_offlineStorageMemory->GetSize() : 0; if ((m_offlineStorageMemory) && (dbSizeBeforeFlush > 0) && (m_offlineStorageDisk)) { // This will block on and then take a lock for the duration of this move, and @@ -233,8 +234,10 @@ namespace MAT_NS_BEGIN { return false; } - // Check cache size only once at start - static uint32_t cacheMemorySizeLimitInBytes = m_config[CFG_INT_RAM_QUEUE_SIZE]; + // Cache size limit is per-instance config computed once in Initialize(); + // it must NOT be a function-local static, which would share the first + // LogManager's value with every other LogManager instance. + uint32_t cacheMemorySizeLimitInBytes = m_cacheMemorySizeLimitInBytes; if (nullptr != m_offlineStorageMemory && !m_shutdownStarted) { diff --git a/lib/offline/OfflineStorageHandler.hpp b/lib/offline/OfflineStorageHandler.hpp index e7bdce4cb..3311d6b38 100644 --- a/lib/offline/OfflineStorageHandler.hpp +++ b/lib/offline/OfflineStorageHandler.hpp @@ -92,6 +92,7 @@ namespace MAT_NS_BEGIN { unsigned m_memoryDbSize; unsigned m_memoryDbSizeNotificationLimit; unsigned m_queryDbSize; + uint32_t m_cacheMemorySizeLimitInBytes; bool m_isStorageFullNotificationSend; protected: diff --git a/lib/pal/TaskDispatcher_CAPI.cpp b/lib/pal/TaskDispatcher_CAPI.cpp index f8e432c74..653e83382 100644 --- a/lib/pal/TaskDispatcher_CAPI.cpp +++ b/lib/pal/TaskDispatcher_CAPI.cpp @@ -37,7 +37,13 @@ namespace PAL_NS_BEGIN { void OnCallback() { if (m_task) { - (*m_task)(); + // The task is host/user code running on the external dispatcher's + // thread; an exception escaping here would terminate the process. + try { + (*m_task)(); + } + catch (...) { + } } ReleaseItem(); } diff --git a/lib/pal/WorkerThread.cpp b/lib/pal/WorkerThread.cpp index 2bdbf6c67..3275de9a8 100644 --- a/lib/pal/WorkerThread.cpp +++ b/lib/pal/WorkerThread.cpp @@ -238,7 +238,19 @@ namespace PAL_NS_BEGIN { // Item wasn't cancelled before it could be executed if (self->m_itemInProgress != nullptr) { LOG_TRACE("%10llu Execute item=%p type=%s\n", wakeupCount, item.get(), item.get()->TypeName.c_str() ); - (*item)(); + // A task can run arbitrary work (storage I/O, HTTP encode, and + // user DebugEventListener callbacks). An exception escaping here + // would unwind out of the thread entry function and call + // std::terminate, killing the host process. Contain it. + try { + (*item)(); + } + catch (const std::exception& ex) { + LOG_ERROR("Unhandled exception in worker task: %s", ex.what()); + } + catch (...) { + LOG_ERROR("Unhandled non-standard exception in worker task"); + } self->m_itemInProgress = nullptr; } diff --git a/tests/unittests/PalTests.cpp b/tests/unittests/PalTests.cpp index 0bf8a06a3..1890bd82a 100644 --- a/tests/unittests/PalTests.cpp +++ b/tests/unittests/PalTests.cpp @@ -5,9 +5,13 @@ #include "common/Common.hpp" #include "pal/PseudoRandomGenerator.hpp" +#include "pal/TaskDispatcher.hpp" +#include "pal/WorkerThread.hpp" #include "Version.hpp" +#include #include +#include #include #ifdef HAVE_MAT_LOGGING @@ -180,6 +184,43 @@ TEST_F(PalTests, SdkVersion) EXPECT_THAT(PAL::getSdkVersion(), Eq(v)); } +namespace +{ + class ThrowingTaskHelper + { + public: + void ThrowStdException() { throw std::runtime_error("worker task boom"); } + void ThrowNonStdException() { throw 123; } + void Signal(std::atomic* ran) { ran->store(true); } + }; +} + +// A task throwing an exception must be contained by the worker thread loop; +// otherwise the exception unwinds out of the thread entry function and calls +// std::terminate, killing the host process. +TEST_F(PalTests, WorkerThreadContainsThrowingTask) +{ + auto dispatcher = PAL::WorkerThreadFactory::Create(); + ThrowingTaskHelper helper; + std::atomic ranAfterStdThrow(false); + std::atomic ranAfterNonStdThrow(false); + + PAL::dispatchTask(dispatcher.get(), &helper, &ThrowingTaskHelper::ThrowStdException); + PAL::dispatchTask(dispatcher.get(), &helper, &ThrowingTaskHelper::Signal, &ranAfterStdThrow); + + PAL::dispatchTask(dispatcher.get(), &helper, &ThrowingTaskHelper::ThrowNonStdException); + PAL::dispatchTask(dispatcher.get(), &helper, &ThrowingTaskHelper::Signal, &ranAfterNonStdThrow); + + // Wait for the follow-up tasks to run, proving the thread survived each throw. + for (int i = 0; i < 500 && !(ranAfterStdThrow.load() && ranAfterNonStdThrow.load()); ++i) + PAL::sleep(10); + + EXPECT_TRUE(ranAfterStdThrow.load()); + EXPECT_TRUE(ranAfterNonStdThrow.load()); + + dispatcher->Join(); +} + #ifdef HAVE_MAT_LOGGING class LogInitTest : public Test { diff --git a/tests/unittests/TaskDispatcherCAPITests.cpp b/tests/unittests/TaskDispatcherCAPITests.cpp index 0867ad046..b227deb13 100644 --- a/tests/unittests/TaskDispatcherCAPITests.cpp +++ b/tests/unittests/TaskDispatcherCAPITests.cpp @@ -9,6 +9,8 @@ #include "pal/typename.hpp" #include "mat.h" +#include + using namespace testing; using namespace MAT; using namespace PAL; @@ -227,3 +229,22 @@ TEST(TaskDispatcherCAPITests, Join) EXPECT_EQ(wasJoined, true); } +TEST(TaskDispatcherCAPITests, ExecuteCallbackThatThrowsIsContained) +{ + TaskDispatcher_CAPI taskDispatcher(&OnTaskDispatcherQueue, &OnTaskDispatcherCancel, &OnTaskDispatcherJoin); + + AutoTestHelper testHelper; + testHelper->SetShouldExecute(true); + + bool wasExecuted = false; + testHelper->SetCallbackValidation([&wasExecuted](int /*param1*/, int /*param2*/) { + wasExecuted = true; + throw std::runtime_error("task threw"); + }); + + // The dispatcher must contain the exception so it never escapes back into + // the host's dispatcher thread (which would terminate the process). + EXPECT_NO_THROW(dispatchTask(&taskDispatcher, testHelper.get(), &TestHelper::Callback, 10 /*param1*/, 20 /*param2*/)); + EXPECT_EQ(wasExecuted, true); +} + From f913b5de66093daeb94dc7cc82578827c5051c58 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 22 Jun 2026 16:44:03 -0500 Subject: [PATCH 2/4] Address Copilot round-1 comments capi.cpp (mat_open_core, EALREADY path): also set ctx->result to match the returned status. The success path sets ctx->result before returning; the EALREADY early-return set ctx->handle but left ctx->result stale, inconsistent with the other entrypoints. TaskDispatcher_CAPI.cpp (Task_CAPI::OnCallback): log the contained exception instead of swallowing it silently, mirroring WorkerThread, so host apps/SDK logs can diagnose why a queued task failed. Added include for std::exception. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- lib/api/capi.cpp | 4 +++- lib/pal/TaskDispatcher_CAPI.cpp | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/api/capi.cpp b/lib/api/capi.cpp index ef683e5cb..688e6d95d 100644 --- a/lib/api/capi.cpp +++ b/lib/api/capi.cpp @@ -93,8 +93,10 @@ evt_status_t mat_open_core( { // Guest instance with the same config is already open. // Return its handle so the caller still gets a usable handle - // (rather than leaving ctx->handle uninitialized). + // (rather than leaving ctx->handle uninitialized), and set + // ctx->result to match the returned status like the other paths. ctx->handle = code; + ctx->result = static_cast(EALREADY); return EALREADY; } // hash code is assigned to another client, increment and retry diff --git a/lib/pal/TaskDispatcher_CAPI.cpp b/lib/pal/TaskDispatcher_CAPI.cpp index 653e83382..e75ee1924 100644 --- a/lib/pal/TaskDispatcher_CAPI.cpp +++ b/lib/pal/TaskDispatcher_CAPI.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -39,10 +40,15 @@ namespace PAL_NS_BEGIN { if (m_task) { // The task is host/user code running on the external dispatcher's // thread; an exception escaping here would terminate the process. + // Log it (mirroring WorkerThread) instead of swallowing silently. try { (*m_task)(); } + catch (const std::exception& ex) { + LOG_ERROR("Unhandled exception in CAPI task: %s", ex.what()); + } catch (...) { + LOG_ERROR("Unhandled non-standard exception in CAPI task"); } } ReleaseItem(); From 0606ada855bb381814479ac09a44feec0844401c Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 22 Jun 2026 17:18:20 -0500 Subject: [PATCH 3/4] Address Copilot round-2 comments WorkerThread.cpp: add explicit #include ; the round-1 catch block uses std::exception but the TU only relied on transitive includes. capi.cpp (mat_open_core): the early error returns (invalid config, and the two HttpClient/TaskDispatcher creation catch blocks) left ctx->result and ctx->handle untouched. Since the public mat.h inline helpers return ctx.handle after calling in, callers could observe a stale/uninitialized handle on error. Set ctx->result = EFAULT and ctx->handle = 0 on all three paths (ctx is guaranteed non-null by mat_open / mat_open_with_params). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- lib/api/capi.cpp | 10 +++++++++- lib/pal/WorkerThread.cpp | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/lib/api/capi.cpp b/lib/api/capi.cpp index 688e6d95d..3d7799fd5 100644 --- a/lib/api/capi.cpp +++ b/lib/api/capi.cpp @@ -77,7 +77,11 @@ evt_status_t mat_open_core( { if ((config == nullptr) || (config[0] == 0)) { - // Invalid configuration + // Invalid configuration. ctx is guaranteed non-null by the callers + // (mat_open / mat_open_with_params); set result and a known-invalid + // handle so callers don't observe a stale ctx->handle on error. + ctx->result = static_cast(EFAULT); + ctx->handle = 0; return EFAULT; } @@ -148,6 +152,8 @@ evt_status_t mat_open_core( } catch (...) { + ctx->result = static_cast(EFAULT); + ctx->handle = 0; return EFAULT; } } @@ -163,6 +169,8 @@ evt_status_t mat_open_core( } catch (...) { + ctx->result = static_cast(EFAULT); + ctx->handle = 0; return EFAULT; } } diff --git a/lib/pal/WorkerThread.cpp b/lib/pal/WorkerThread.cpp index 3275de9a8..3adfb9e61 100644 --- a/lib/pal/WorkerThread.cpp +++ b/lib/pal/WorkerThread.cpp @@ -6,6 +6,8 @@ #include "pal/WorkerThread.hpp" #include "pal/PAL.hpp" +#include + #if defined(MATSDK_PAL_CPP11) || defined(MATSDK_PAL_WIN32) /* Maximum scheduler interval for SDK is 1 hour required for clamping in case of monotonic clock drift */ From 5eafb16762a8d6c038c7017e81aeef967dc9c1f2 Mon Sep 17 00:00:00 2001 From: Bhagirath Mehta Date: Mon, 22 Jun 2026 17:27:49 -0500 Subject: [PATCH 4/4] Address Copilot round-3 comments capi.cpp (mat_open_core): if HttpClient_CAPI or TaskDispatcher_CAPI construction threw, clients[code] had already been populated (config + ctx_data assigned), but the catch returned EFAULT without removing it. That left an orphaned, half- initialized entry in the global clients map, so a later open with the same config would match ctx_data and wrongly return EALREADY. Call remove_client(code) on both creation-failure paths before returning EFAULT. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- lib/api/capi.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/api/capi.cpp b/lib/api/capi.cpp index 3d7799fd5..574a1f7eb 100644 --- a/lib/api/capi.cpp +++ b/lib/api/capi.cpp @@ -152,6 +152,9 @@ evt_status_t mat_open_core( } catch (...) { + // Roll back the partially-populated client so a later open with the + // same config does not find stale half-initialized state. + remove_client(code); ctx->result = static_cast(EFAULT); ctx->handle = 0; return EFAULT; @@ -169,6 +172,9 @@ evt_status_t mat_open_core( } catch (...) { + // Roll back the partially-populated client so a later open with the + // same config does not find stale half-initialized state. + remove_client(code); ctx->result = static_cast(EFAULT); ctx->handle = 0; return EFAULT;