|
1 | 1 | { |
2 | 2 | "releases": [ |
| 3 | + { |
| 4 | + "version": "20260418.1", |
| 5 | + "release_date": "April 18, 2026", |
| 6 | + "introduction": "This release overhauls local model performance with optimized Metal threading, flash attention, KV cache quantization, and fixes the crash-on-quit that affected llama.cpp models.", |
| 7 | + "improvements": [ |
| 8 | + { |
| 9 | + "id": "metal-thread-tuning", |
| 10 | + "icon": "gauge.with.dots.needle.67percent", |
| 11 | + "title": "Apple Silicon Thread Optimization", |
| 12 | + "description": "Thread counts are now tuned specifically for Apple Silicon unified memory architecture. When all model layers run on the Metal GPU, CPU threads primarily marshal data - too many threads cause contention and slow things down. Prompt processing uses performance cores; generation uses fewer threads since the GPU handles the heavy compute." |
| 13 | + }, |
| 14 | + { |
| 15 | + "id": "flash-attention-kv-quant", |
| 16 | + "icon": "memorychip", |
| 17 | + "title": "Flash Attention & KV Cache Quantization", |
| 18 | + "description": "Flash attention reduces memory usage by avoiding materializing the full attention matrix. KV cache is quantized to q8_0 (8-bit) for both key and value tensors, cutting cache memory roughly in half compared to f16 with minimal quality impact. Together these allow longer contexts in less memory." |
| 19 | + }, |
| 20 | + { |
| 21 | + "id": "stable-system-prompt", |
| 22 | + "icon": "arrow.triangle.2.circlepath", |
| 23 | + "title": "Stable System Prompt for Cache Reuse", |
| 24 | + "description": "Date and time have been moved from the system prompt into a per-message userContext block, keeping the system prompt identical across messages. For pure Transformer models (Llama, Gemma, Qwen2.5), this enables KV cache prefix reuse - subsequent messages only process new tokens instead of reprocessing the entire prompt." |
| 25 | + }, |
| 26 | + { |
| 27 | + "id": "mlx-eog-detection", |
| 28 | + "icon": "stop.circle", |
| 29 | + "title": "MLX End-of-Generation Detection", |
| 30 | + "description": "MLX streaming now detects end-of-generation patterns (like <end_of_turn> for Gemma models) and stops cleanly instead of streaming hundreds of stop tokens as visible text." |
| 31 | + } |
| 32 | + ], |
| 33 | + "bugfixes": [ |
| 34 | + { |
| 35 | + "id": "metal-crash-on-quit", |
| 36 | + "icon": "arrow.down.circle.fill", |
| 37 | + "title": "Fixed Crash on Quit with Local Models", |
| 38 | + "description": "Fixed a crash caused by llama.cpp's Metal device residency sets not being empty during static destructor cleanup. The app now exits cleanly after saving conversations." |
| 39 | + }, |
| 40 | + { |
| 41 | + "id": "folder-manager-spam", |
| 42 | + "icon": "arrow.down.circle.fill", |
| 43 | + "title": "Fixed Excessive Disk I/O from Folder Manager", |
| 44 | + "description": "The folder manager was being instantiated inside a SwiftUI view body, causing hundreds of redundant disk reads per session. Now cached as a single instance." |
| 45 | + } |
| 46 | + ] |
| 47 | + }, |
3 | 48 | { |
4 | 49 | "version": "20260411.2", |
5 | 50 | "release_date": "April 11, 2026", |
|
0 commit comments