
Commit aa6b186

fix(llama): Metal crash on exit, KV cache reuse, thread tuning, date context optimization
- Fix ggml_metal_device_free crash on quit by using _exit(0) after saving conversations, skipping C++ static destructors (known llama.cpp issue)
- Fix FolderManager() created in SwiftUI Menu body causing 800+ disk reads
- Move date/time from system prompt to userContext block for stable system prompt (enables KV cache prefix reuse for pure Transformer models)
- Add KV cache prefix reuse for non-hybrid/non-recurrent models
- Disable KV cache prefix reuse for hybrid models (Qwen3.5 Mamba+Transformer) where recurrent state buffer cannot be partially cleared
- Tune thread counts for Apple Silicon Metal offload (fewer threads, less contention when GPU does the heavy lifting)
- Enable flash attention, q8_0 KV cache quantization, ubatch sizing
- Fix thread assignment: n_threads for generation (fewer), n_threads_batch for prompt processing (more)
- Add EOG text detection for MLX streaming (gemma <end_of_turn> etc.)
- Add strong reference to EndpointManager in AppDelegate
- Handle unknown model endpoint types gracefully
1 parent 009b435 commit aa6b186

14 files changed

Lines changed: 330 additions & 124 deletions
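A note on the date/time move called out above: KV cache prefix reuse only pays off if the system prompt is byte-for-byte identical across requests, so anything volatile has to ride with the user turn instead. The sketch below illustrates the idea only; the type name, the `<userContext>` tag format, and the prompt text are hypothetical, not the app's actual prompt builder.

```swift
import Foundation

/// Hedged sketch (not the app's real prompt builder): keep the system prompt
/// byte-identical across requests and attach volatile data like the current
/// date/time to the user turn. Only an identical leading token sequence can
/// be reused from the KV cache, so a timestamp inside the system prompt
/// would invalidate the cached prefix on every message.
struct PromptSketch {
    /// Stable across requests, so its tokens stay cacheable.
    static let systemPrompt = "You are a helpful assistant."

    /// Volatile data goes after the stable prefix, e.g. in a userContext block.
    static func userTurn(_ message: String, now: Date = Date()) -> String {
        let stamp = ISO8601DateFormatter().string(from: now)
        return "<userContext>current date/time: \(stamp)</userContext>\n\(message)"
    }
}

let prompt = PromptSketch.systemPrompt + "\n" + PromptSketch.userTurn("Hello")
print(prompt)
```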

Sources/APIFramework/EndpointManager.swift

Lines changed: 5 additions & 0 deletions
@@ -1517,6 +1517,11 @@ extension EndpointManager: ConversationEngine.AIProviderProtocol {
         }

         logger.info("CLEANUP: Provider cleanup complete")
+
+        /// Free llama.cpp backend after all contexts are destroyed.
+        /// Prevents ggml_metal_device_free crash on exit.
+        LlamaContext.freeBackend()
+        logger.info("CLEANUP: llama.cpp backend freed")
     }
 }
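The commit message pairs this cleanup with an `_exit(0)` at quit so C++ static destructors (including `ggml_metal_device_free`) never run. A minimal sketch of that ordering, assuming the shutdown path described in the message; the function name and closure parameters are illustrative, not the app's AppDelegate API.

```swift
import Darwin

/// Hedged sketch of the shutdown ordering from the commit message:
/// 1. persist conversations, 2. free the llama.cpp backend exactly once
///    (after every LlamaContext is destroyed), 3. _exit(0) so the process
///    ends before C++ static destructors can run.
func terminateSkippingStaticDestructors(saveConversations: () -> Void,
                                        freeBackend: () -> Void) {
    saveConversations()   // flush app state to disk first
    freeBackend()         // e.g. LlamaContext.freeBackend()
    _exit(0)              // bypasses atexit handlers and C++ static destructors
}

// Usage (illustrative):
// terminateSkippingStaticDestructors(saveConversations: { /* persist */ },
//                                    freeBackend: { LlamaContext.freeBackend() })
```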

Sources/APIFramework/GitHubCopilotResponsesModels.swift

Lines changed: 9 additions & 0 deletions
@@ -30,6 +30,15 @@ public enum ModelSupportedEndpoint: String, Codable {
     case chatCompletions = "/chat/completions"
     case responses = "/responses"
     case messages = "/v1/messages"
+    case wsResponses = "ws:/responses"
+
+    public init(from decoder: Decoder) throws {
+        let container = try decoder.singleValueContainer()
+        let rawValue = try container.decode(String.self)
+        self = ModelSupportedEndpoint(rawValue: rawValue) ?? .unknown
+    }
+
+    case unknown = "_unknown"
 }

 /// GitHub Copilot model information.
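The custom `init(from:)` above keeps decoding from failing when the API starts advertising endpoint paths this build does not know about. A standalone snippet of the same fallback pattern, trimmed to two known cases so it compiles on its own; the unknown path in the sample JSON is made up.

```swift
import Foundation

/// Standalone copy of the decoding-fallback pattern from the diff above,
/// trimmed to a couple of cases so the snippet is self-contained.
enum EndpointSketch: String, Codable {
    case chatCompletions = "/chat/completions"
    case responses = "/responses"
    case unknown = "_unknown"

    init(from decoder: Decoder) throws {
        let raw = try decoder.singleValueContainer().decode(String.self)
        self = EndpointSketch(rawValue: raw) ?? .unknown
    }
}

// "/v5/some-future-endpoint" is a hypothetical value the enum has never seen.
let json = #"["/chat/completions", "/v5/some-future-endpoint"]"#.data(using: .utf8)!
let endpoints = try! JSONDecoder().decode([EndpointSketch].self, from: json)
print(endpoints)   // decodes to .chatCompletions and .unknown instead of throwing
```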

Sources/APIFramework/LibLlama.swift

Lines changed: 124 additions & 41 deletions
@@ -69,6 +69,13 @@ actor LlamaContext {

     /// Cancellation flag - set to true to abort generation.
     private var isCancelled: Bool = false
+    /// Whether this model uses recurrent/hybrid state (Mamba/SSM layers).
+    /// KV cache prefix reuse is unsafe for these models because the RS buffer
+    /// cannot be partially cleared.
+    var hasRecurrentState: Bool {
+        return llama_model_is_recurrent(model) || llama_model_is_hybrid(model)
+    }
+    private nonisolated(unsafe) var isDestroyed: Bool = false

     /// Performance tracking.
     private var generationStartTime: Date?
@@ -78,6 +85,9 @@ actor LlamaContext {
     /// Some models generate EOG tokens as text instead of special tokens.
     private var accumulatedText: String = ""

+    /// KV cache reuse: store previous prompt tokens to find common prefix.
+    private var previousPromptTokens: [llama_token] = []
+
     /// Max tokens limit for generation (set before calling completion_loop).
     /// This is the actual limit for how many tokens to generate, separate from context size.
     var maxTokensLimit: Int = 4096
@@ -120,18 +130,21 @@ actor LlamaContext {
     deinit {
         llamaLogger.info("LlamaContext deinit: Cleaning up resources")

-        /// Free resources in the correct order (inverse of allocation)
-        /// Do NOT call llama_backend_free() here - it should only be called once at app shutdown
-        /// See AppDelegate.applicationWillTerminate() for global cleanup
-
-        llama_sampler_free(sampling)
-        llama_batch_free(batch)
-        llama_free(context)
-        llama_model_free(model)
+        if !isDestroyed {
+            llama_sampler_free(sampling)
+            llama_batch_free(batch)
+            llama_free(context)
+            llama_model_free(model)
+        }

         llamaLogger.info("LlamaContext cleaned up successfully")
     }

+    /// Call at app shutdown after all contexts are destroyed.
+    public static func freeBackend() {
+        llama_backend_free()
+    }
+
     // MARK: - Factory Method

     /// Detect quantization type from model filename or metadata Returns estimated bytes per token for KV cache based on quantization.
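The `isDestroyed` flag pairs an explicit, deterministic teardown (the `destroy()` method added further down) with a `deinit` that becomes a no-op afterwards. A self-contained sketch of the pattern, using a stand-in handle type since the real code frees llama.cpp C pointers:

```swift
/// Stand-in for the llama.cpp pointers; not real API, just here so the
/// double-free guard is observable in a standalone snippet.
final class FakeHandle {
    private(set) var freed = false
    func free() { precondition(!freed, "double free"); freed = true }
}

final class ResourceOwnerSketch {
    private let handle = FakeHandle()
    private var isDestroyed = false

    /// Call deterministically (e.g. on model unload or right before exit).
    func destroy() {
        guard !isDestroyed else { return }
        handle.free()
        isDestroyed = true
    }

    deinit {
        // Only free if destroy() was never called; otherwise this would double-free.
        if !isDestroyed { handle.free() }
    }
}

let owner = ResourceOwnerSketch()
owner.destroy()   // explicit, ordered teardown; the later deinit is a no-op
```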
@@ -262,26 +275,15 @@ actor LlamaContext {
     ) -> (promptThreads: Int32, generationThreads: Int32) {
         let modelSizeGB = modelSize / (1024*1024*1024)

-        // Prompt processing: Use ALL available cores (batch processing benefits from parallelism)
-        let promptThreads = max(1, totalCores - 1) // Leave 1 for OS
-
-        // Token generation: Adjust based on model size
-        // Smaller models: more threads (compute-bound)
-        // Larger models: fewer threads (memory bandwidth-bound)
-        let genThreads: Int
-        if modelSizeGB >= 40 {
-            // 70B models: Limited by memory bandwidth, not compute
-            genThreads = min(4, totalCores / 2)
-            llamaLogger.info("THREAD_COUNT: Large model (>=40GB), gen_threads=\(genThreads)")
-        } else if modelSizeGB >= 20 {
-            // 30B models: Moderate parallelism
-            genThreads = min(8, (totalCores * 2) / 3)
-            llamaLogger.info("THREAD_COUNT: Medium model (20-40GB), gen_threads=\(genThreads)")
-        } else {
-            // 7B models: Can benefit from high parallelism
-            genThreads = min(12, totalCores - 2)
-            llamaLogger.info("THREAD_COUNT: Small model (<20GB), gen_threads=\(genThreads)")
-        }
+        // On Apple Silicon with Metal GPU offload (all layers on GPU):
+        // - CPU threads mostly marshal data to/from GPU, not doing compute
+        // - Too many threads causes contention and HURTS performance
+        // - Generation (single token): 1-2 threads (GPU-bound, not CPU-bound)
+        // - Prompt (batch): performance cores only (moderate parallelism helps batch prep)
+        let perfCores = max(1, totalCores / 2) // Apple Silicon: ~half cores are performance
+        let promptThreads = min(perfCores, 8)
+        let genThreads = min(perfCores, 4)
+        llamaLogger.info("THREAD_COUNT: Model \(modelSizeGB)GB, perfCores=\(perfCores)")

         llamaLogger.info("THREAD_COUNT: Final threads - prompt=\(promptThreads), generation=\(genThreads) (total_cores=\(totalCores))")
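For a quick sanity check of the new heuristic, here are the values it produces for a few hypothetical core counts (the real performance-core count comes from the hardware, not from `totalCores / 2`):

```swift
/// Standalone copy of the thread heuristic from the hunk above,
/// reduced to plain Ints so it can be run anywhere.
func threadCounts(totalCores: Int) -> (prompt: Int, generation: Int) {
    let perfCores = max(1, totalCores / 2)
    return (min(perfCores, 8), min(perfCores, 4))
}

print(threadCounts(totalCores: 10))   // (prompt: 5, generation: 4)
print(threadCounts(totalCores: 8))    // (prompt: 4, generation: 4)
print(threadCounts(totalCores: 16))   // (prompt: 8, generation: 4)
```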
@@ -377,10 +379,11 @@ actor LlamaContext {
         llamaLogger.info("Using adaptive threads: prompt=\(promptThreads), generation=\(generationThreads) (total_cores=\(totalCores))")

         var ctx_params = llama_context_default_params()
-        ctx_params.n_ctx = UInt32(n_ctx)
-        ctx_params.n_batch = UInt32(n_batch)
-        ctx_params.n_threads = promptThreads // Used for prompt evaluation
-        ctx_params.n_threads_batch = generationThreads // Used for token generation
+        ctx_params.n_ctx = UInt32(n_ctx)
+        ctx_params.n_batch = UInt32(n_batch)
+        ctx_params.n_ubatch = UInt32(min(n_batch, 512)) // Physical micro-batch size for better memory locality
+        ctx_params.n_threads = generationThreads // Single-token generation (memory-bandwidth bound, fewer threads)
+        ctx_params.n_threads_batch = promptThreads // Batch/prompt processing (compute-bound, more threads)

         /// CRITICAL FIX: Explicitly initialize samplers to NULL to prevent segfault
         /// Recent llama.cpp versions validate sampler chains during context init.
@@ -389,10 +392,25 @@ actor LlamaContext {
         ctx_params.samplers = nil
         ctx_params.n_samplers = 0

-        /// PERFORMANCE: Offload KV cache to GPU (LMStudio "Offload KV Cache to GPU") This stores the key/value cache on GPU instead of CPU RAM Dramatically improves performance for long contexts.
+        /// PERFORMANCE: Flash Attention - significantly reduces memory usage and improves speed.
+        /// Uses optimized attention kernels that avoid materializing the full attention matrix.
+        ctx_params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED
+
+        /// PERFORMANCE: KV cache quantization - q8_0 for both K and V tensors.
+        /// Reduces KV cache memory by ~50% vs f16 with minimal quality loss.
+        /// Equivalent to --cache-type-k q8_0 --cache-type-v q8_0 in llama-server.
+        ctx_params.type_k = GGML_TYPE_Q8_0
+        ctx_params.type_v = GGML_TYPE_Q8_0
+
+        /// PERFORMANCE: Offload KV cache to GPU (LMStudio "Offload KV Cache to GPU").
+        /// Stores the key/value cache on GPU instead of CPU RAM.
         ctx_params.offload_kqv = true

-        llamaLogger.info("SUCCESS: PERFORMANCE_OPTIMIZATIONS: mmap=true, mlock=true, offload_kqv=true (KV cache on GPU)")
+        /// PERFORMANCE: Enable full SWA cache for sliding window attention models.
+        /// Prevents quality degradation with models like Gemma, Mistral that use SWA.
+        ctx_params.swa_full = true
+
+        llamaLogger.info("SUCCESS: PERFORMANCE_OPTIMIZATIONS: mmap=true, mlock=true, offload_kqv=true, flash_attn=true, kv_type=q8_0, swa_full=true, ubatch=\(ctx_params.n_ubatch)")

         let context = llama_init_from_model(model, ctx_params)
         guard let context else {
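A rough sense of what the q8_0 setting buys: q8_0 stores 32 values in a 34-byte block (~1.06 bytes per value) versus 2 bytes per value for f16, which is where the "~50%" figure in the comment comes from. The sizing sketch below uses hypothetical model dimensions (32 layers, 8 KV heads of size 128, 8K context), not values read from any particular GGUF:

```swift
/// Back-of-envelope KV-cache sizing. K and V each store
/// nKVHeads * headDim values per layer per position.
func kvCacheBytes(nCtx: Int, nLayers: Int, nKVHeads: Int, headDim: Int,
                  bytesPerElement: Double) -> Double {
    return 2 * Double(nLayers * nKVHeads * headDim * nCtx) * bytesPerElement
}

let f16  = kvCacheBytes(nCtx: 8192, nLayers: 32, nKVHeads: 8, headDim: 128,
                        bytesPerElement: 2.0)
let q8_0 = kvCacheBytes(nCtx: 8192, nLayers: 32, nKVHeads: 8, headDim: 128,
                        bytesPerElement: 34.0 / 32.0) // 34-byte block per 32 values
print(f16 / 1_048_576, q8_0 / 1_048_576)   // ~1024 MiB vs ~544 MiB, roughly half
```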
@@ -445,20 +463,68 @@ actor LlamaContext {
         tokens_list = tokenize(text: text, add_bos: true)
         temporary_invalid_cchars = []

-        /// Clear KV cache before each new completion Without this, llama.cpp fails with "sequence positions remain consecutive" error because it thinks sequence 0 already has tokens from previous request Use llama_memory_seq_rm with positions -1 to -1 to clear entire sequence.
-        let memory = llama_get_memory(context)
-        _ = llama_memory_seq_rm(memory, 0, -1, -1)
-
         /// Reset completion state.
         is_done = false
         n_decode = 0
-        n_cur = 0
         accumulatedText = ""

         /// Start performance tracking.
         generationStartTime = Date()
         tokensGenerated = 0

+        /// KV cache prefix reuse: find how many tokens match the previous prompt.
+        /// This avoids reprocessing the system prompt + tools on every message.
+        /// NOTE: Prefix reuse is disabled for hybrid Mamba/Transformer models (qwen35, etc.)
+        /// because their recurrent state (RS buffer) cannot be partially cleared.
+        /// The RS buffer tracks state across all positions and becomes inconsistent
+        /// after llama_memory_seq_rm, causing decode failures.
+        let hasRecurrentState = llama_model_is_recurrent(model) || llama_model_is_hybrid(model)
+        var commonPrefix = 0
+        if !hasRecurrentState {
+            let minLen = min(previousPromptTokens.count, tokens_list.count)
+            while commonPrefix < minLen && previousPromptTokens[commonPrefix] == tokens_list[commonPrefix] {
+                commonPrefix += 1
+            }
+        }
+
+        if commonPrefix == 0 && previousPromptTokens.count > 0 {
+            /// Diagnostic: show first divergent tokens to understand why prefix matching fails
+            let prevFirst = previousPromptTokens.prefix(10).map { String($0) }.joined(separator: ",")
+            let currFirst = tokens_list.prefix(10).map { String($0) }.joined(separator: ",")
+            llamaLogger.warning("KV_CACHE_DEBUG: Prefix mismatch at token 0! prev_count=\(previousPromptTokens.count) curr_count=\(tokens_list.count)")
+            llamaLogger.warning("KV_CACHE_DEBUG: prev_first10=[\(prevFirst)]")
+            llamaLogger.warning("KV_CACHE_DEBUG: curr_first10=[\(currFirst)]")
+            /// Find first difference
+            let diagLen = min(100, min(previousPromptTokens.count, tokens_list.count))
+            for i in 0..<diagLen {
+                if previousPromptTokens[i] != tokens_list[i] {
+                    llamaLogger.warning("KV_CACHE_DEBUG: First difference at token \(i)")
+                    break
+                }
+            }
+        } else if commonPrefix > 0 && commonPrefix < previousPromptTokens.count {
+            llamaLogger.info("KV_CACHE_DEBUG: Partial match \(commonPrefix)/\(previousPromptTokens.count), divergence at token \(commonPrefix)")
+        }
+
+        let memory = llama_get_memory(context)
+        if commonPrefix > 0 {
+            /// Remove only the tokens AFTER the common prefix from KV cache.
+            _ = llama_memory_seq_rm(memory, 0, Int32(commonPrefix), -1)
+            n_cur = Int32(commonPrefix)
+            llamaLogger.info("KV_CACHE_REUSE: Reusing \(commonPrefix) of \(previousPromptTokens.count) cached tokens, processing \(tokens_list.count - commonPrefix) new tokens")
+        } else {
+            /// No common prefix - clear entire cache.
+            _ = llama_memory_seq_rm(memory, 0, -1, -1)
+            n_cur = 0
+            llamaLogger.info("KV_CACHE_REUSE: No prefix match, processing all \(tokens_list.count) tokens")
+        }
+
+        /// Store tokens for next call's prefix comparison.
+        previousPromptTokens = tokens_list
+
+        /// Skip ahead to only process new tokens.
+        let startIndex = Int(n_cur)
+
         let n_ctx = llama_n_ctx(context)
         let estimatedOutputTokens = min(maxTokensLimit, 4096) /// Estimate for KV cache planning
         let n_kv_req = tokens_list.count + estimatedOutputTokens
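The prefix-matching step above is a longest-common-prefix scan over token IDs; everything from `commonPrefix` onward is evicted with `llama_memory_seq_rm` and re-decoded, while the matched prefix stays in the KV cache. A standalone version over plain `Int32` arrays (hybrid/recurrent models skip this entirely and clear the whole cache, as the comments note):

```swift
/// Standalone version of the prefix-matching step, expressed over plain
/// Int32 token IDs so it can be tested without llama.cpp.
func commonPrefixLength(_ previous: [Int32], _ current: [Int32]) -> Int {
    var n = 0
    let limit = min(previous.count, current.count)
    while n < limit && previous[n] == current[n] { n += 1 }
    return n
}

// Illustrative token IDs: the second request shares the system-prompt/tool
// tokens with the first, then diverges.
let previous: [Int32] = [1, 5, 9, 42, 7, 7, 300]
let current:  [Int32] = [1, 5, 9, 42, 7, 8, 301, 400]
let reuse = commonPrefixLength(previous, current)
print("reuse \(reuse) cached tokens, decode \(current.count - reuse) new ones")
// -> reuse 5 cached tokens, decode 3 new ones
```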
@@ -470,12 +536,12 @@ actor LlamaContext {
         }

         /// Process prompt in batches if it exceeds batch size batch was initialized with batchSize (2048), but prompt might be larger Need to decode in chunks to avoid overflow.
-        llamaLogger.info("Processing prompt with \(tokens_list.count) tokens, batch_size=\(batchSize)")
+        llamaLogger.info("Processing prompt with \(tokens_list.count - startIndex) new tokens (of \(tokens_list.count) total), batch_size=\(batchSize)")

         /// Adaptive batch sizing based on prompt length and context capacity The crash occurs when KV cache runs out of memory slots Root cause: Processing large prompts (8611 tokens) exhausts KV cache after 2 batches Solution: Reduce batch size dynamically when approaching context limits.

         let maxContextTokens = Int32(contextSize)
-        var tokenIndex = 0
+        var tokenIndex = startIndex
         var consecutiveDecodeFailures = 0
         let maxConsecutiveFailures = 3
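With `tokenIndex` starting at `startIndex`, only the tokens after the reused prefix go through the batch loop. Below is a simplified sketch of just that chunking arithmetic; the real loop in LibLlama.swift also handles decode failures and adaptive batch sizing, which are omitted here, and the 512-token batch size is illustrative:

```swift
/// Chunk only the new portion of the prompt into batch-sized ranges.
func promptChunks(totalTokens: Int, startIndex: Int, batchSize: Int) -> [Range<Int>] {
    var chunks: [Range<Int>] = []
    var i = startIndex
    while i < totalTokens {
        let end = min(i + batchSize, totalTokens)
        chunks.append(i..<end)
        i = end
    }
    return chunks
}

// 9,000-token prompt, 7,800 tokens already cached, 512-token batches:
print(promptChunks(totalTokens: 9000, startIndex: 7800, batchSize: 512))
// three chunks: 7800..<8312, 8312..<8824, 8824..<9000
```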
@@ -689,9 +755,26 @@ actor LlamaContext {
         maxTokensLimit = limit
     }

+    /// Explicitly free all Metal/llama resources. Call before app exit to avoid
+    /// static destructor crashes in ggml_metal_device_free.
+    func destroy() {
+        llamaLogger.info("LlamaContext destroy: Explicitly freeing all resources")
+        previousPromptTokens = []
+        llama_sampler_free(sampling)
+        llama_batch_free(batch)
+        llama_free(context)
+        llama_model_free(model)
+        /// Nil out the pointers so deinit doesn't double-free.
+        /// (These are let properties in the actor, so we use a flag instead.)
+        isDestroyed = true
+        llamaLogger.info("LlamaContext destroy: Resources freed successfully")
+    }
+
     func clear() {
+        guard !isDestroyed else { return }
         tokens_list.removeAll()
         temporary_invalid_cchars.removeAll()
+        previousPromptTokens.removeAll()

         /// Reset position tracking - critical for conversation switching.
         n_cur = 0

Sources/APIFramework/LlamaProvider.swift

Lines changed: 8 additions & 2 deletions
@@ -402,7 +402,13 @@ public class LlamaProvider: AIProvider {
                 continuation.yield(finalChunk)

                 /// Clear context for next request.
-                await context.clear()
+                /// Preserve KV cache for prefix reuse on next message.
+                /// Only safe for non-recurrent (pure Transformer) models.
+                if await !context.hasRecurrentState {
+                    await context.resetGeneration()
+                } else {
+                    await context.clear()
+                }

                 continuation.finish()
@@ -488,7 +494,7 @@ public class LlamaProvider: AIProvider {
         await context.cancel()

         providerLogger.info("UNLOAD_MODEL: Freeing llama.cpp context and model resources")
-        await context.clear()
+        await context.destroy()

         /// OPTIMIZATION: Clear conversation caches on model unload.
         clearAllConversationCaches()
