chore: bump version to 20260331.1 and update whats-new
- Bump CFBundleShortVersionString to 20260331.1
- Add ChatML fallback for models missing chat_template in tokenizer_config.json
- Update whats-new.json with release notes for 20260331.1
Resources/whats-new.json (39 additions, 0 deletions)
@@ -1,5 +1,44 @@
{
  "releases": [
    {
      "version": "20260331.1",
      "release_date": "March 31, 2026",
      "introduction": "This release brings significant performance and memory improvements to local AI models, along with support for the latest Qwen 3.5 models.",
      …
        {
          …
          "description": "Upgraded the local inference engine with up to 35% faster token generation and optimized prompt processing that scales to your hardware. Conversations start faster, especially with longer system prompts."
        },
        {
          "id": "qwen35-support",
          "icon": "cpu.fill",
          "title": "Qwen 3.5 Model Support",
          "description": "SAM now supports Qwen 3.5 models, including the new hybrid attention architecture. Download Qwen 3.5 models directly from SAM's model manager."
        },
        {
          "id": "memory-efficiency",
          "icon": "memorychip.fill",
          "title": "Lower Memory Usage",
          "description": "Local models now release GPU memory immediately when unloaded or when switching between models. Memory estimates for large models are also more accurate, so SAM won't try to load models that don't fit."
        }
      ],
      "bugfixes": [
        {
          "id": "duplicate-system-prompt",
          "icon": "doc.on.doc.fill",
          "title": "Fixed Duplicate System Prompt",
          "description": "Fixed an issue where the system prompt could be sent twice to local models, wasting context window space and slowing down responses."
        },
        {
          "id": "missing-chat-template",
          "icon": "text.bubble.fill",
          "title": "Fixed Community Model Compatibility",
          "description": "Some community-quantized models (such as 4-bit variants) were missing configuration needed for conversation formatting. SAM now detects this and applies a compatible format automatically."
        }
      ]
      …
/// ChatML fallback template for models missing chat_template in tokenizer_config.json.
/// Covers Qwen, ChatML-family models, and most mlx-community quantizations.
// swiftlint:disable:next line_length
static let defaultChatMLTemplate = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
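For illustration, hand-tracing this template over a two-message conversation (system: "You are SAM.", user: "Hello!") with add_generation_prompt = true yields the prompt below. Without add_generation_prompt, the trailing assistant header is omitted, which is why Qwen2.5 echoes the system prompt instead of answering (this is a hand-traced expansion, not output captured from SAM):

<|im_start|>system
You are SAM.<|im_end|>
<|im_start|>user
Hello!<|im_end|>
<|im_start|>assistant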
@@ -283,8 +287,23 @@ public class AppleMLXAdapter {
/// DON'T pass tools to applyChatTemplate - it causes system prompt bleeding.
///
/// **The problem**: When tools are passed to applyChatTemplate:
/// - Some chat templates inject tool definitions directly into the prompt
/// - This causes system prompt content to leak into assistant responses
/// - The model gets confused between instructions and conversation
///
/// **The solution**: Tools go in the system message content only:
/// - MLXProvider handles tool formatting in the system message
/// - applyChatTemplate just formats the conversation structure
/// - Clean separation: system message = instructions, chat = conversation
///
/// **VERIFIED**: mlx-swift-examples Chat.swift does NOT pass tools to the template.
/// Tool definitions belong in the system message content (handled by MLXProvider).

/// Pass add_generation_prompt=true to append <|im_start|>assistant\n.
/// Without this, Qwen2.5 doesn't know to generate a response and echoes the system prompt.
/// The chat template contains:
///   {%- if add_generation_prompt %}
///   {{- '<|im_start|>assistant\n' }}
///   {%- endif %}

/// Some community-quantized models (e.g. mlx-community/*) are missing the chat_template
/// in tokenizer_config.json. Detect this and provide a ChatML fallback so generation doesn't fail.
…
    logger.warning("Model is missing chat_template in tokenizer_config.json, using ChatML fallback")
}

let inputTokens = try tokenizer.applyChatTemplate(
    messages: messages,
    chatTemplate: fallbackTemplate,
    addGenerationPrompt: true,
    truncation: false,
    maxLength: nil,
    tools: nil,
    additionalContext: additionalContext
)

logger.debug("Chat template applied with add_generation_prompt=true, tokenized to \(inputTokens.count) tokens")

/// Decode the input tokens to see what the chat template produced.
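The lines that set fallbackTemplate are elided from this excerpt (marked with … above). A minimal sketch of the detection step, assuming the decoded tokenizer_config.json is available as a dictionary — the tokenizerConfig parameter and the nil-means-use-the-model's-own-template behavior are assumptions, not SAM's actual code:

// Sketch only: choose the ChatML fallback when tokenizer_config.json has no chat_template.
func resolveFallbackTemplate(tokenizerConfig: [String: Any]) -> String? {
    guard tokenizerConfig["chat_template"] == nil else {
        return nil   // model ships its own template; nil presumably selects it
    }
    // Corresponds to the warning visible in the diff above:
    // logger.warning("Model is missing chat_template in tokenizer_config.json, using ChatML fallback")
    return AppleMLXAdapter.defaultChatMLTemplate
}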
@@ -495,8 +514,23 @@ public class AppleMLXAdapter {
/// DON'T pass tools to applyChatTemplate - it causes system prompt bleeding.
/// (See the streaming path for full documentation of this issue.)
/// Tool definitions belong in the system message content (handled by MLXProvider).
/// VERIFIED: mlx-swift-examples Chat.swift does NOT pass tools to the template.

/// Pass add_generation_prompt=true to append <|im_start|>assistant\n.
/// Without this, Qwen2.5 doesn't know to generate a response and echoes the system prompt.
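A short sketch of the separation these comments describe. Here formatToolsForSystemMessage is a hypothetical helper standing in for MLXProvider's tool formatting, and the message shape is assumed; only the applyChatTemplate signature is taken from the diff above:

// Sketch: tool definitions go into the system message content, never the template call.
let toolBlock = formatToolsForSystemMessage(tools)   // hypothetical MLXProvider helper
var messages: [[String: String]] = [
    ["role": "system", "content": systemPrompt + "\n\n" + toolBlock]
]
messages.append(contentsOf: conversationMessages)

// applyChatTemplate only formats conversation structure; `tools` stays nil,
// which keeps templates from injecting tool definitions into the prompt.
let inputTokens = try tokenizer.applyChatTemplate(
    messages: messages,
    chatTemplate: fallbackTemplate,
    addGenerationPrompt: true,
    truncation: false,
    maxLength: nil,
    tools: nil,
    additionalContext: additionalContext
)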