Skip to content

Commit 4c30269

Browse files
committed
chore: bump version to 20260331.1 and update whats-new
- Bump CFBundleShortVersionString and CFBundleVersion to 20260331.1 - Add ChatML fallback for models missing chat_template in tokenizer_config.json - Update whats-new.json with release notes for 20260331.1
1 parent 0c8fa34 commit 4c30269

3 files changed

Lines changed: 79 additions & 6 deletions

File tree

Info.plist

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@
1919
<key>CFBundlePackageType</key>
2020
<string>APPL</string>
2121
<key>CFBundleShortVersionString</key>
22-
<string>20260330.1</string>
22+
<string>20260331.1</string>
2323
<key>CFBundleVersion</key>
24-
<string>20260330.1</string>
24+
<string>20260331.1</string>
2525
<key>LSApplicationCategoryType</key>
2626
<string>public.app-category.productivity</string>
2727
<key>LSMinimumSystemVersion</key>

Resources/whats-new.json

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,44 @@
11
{
22
"releases": [
3+
{
4+
"version": "20260331.1",
5+
"release_date": "March 31, 2026",
6+
"introduction": "This release brings significant performance and memory improvements to local AI models, along with support for the latest Qwen 3.5 models.",
7+
"improvements": [
8+
{
9+
"id": "mlx-performance",
10+
"icon": "gauge.open.with.lines.needle.33percent.badge.arrow.up",
11+
"title": "Faster Local Model Inference",
12+
"description": "Upgraded the local inference engine with up to 35% faster token generation and optimized prompt processing that scales to your hardware. Conversations start faster, especially with longer system prompts."
13+
},
14+
{
15+
"id": "qwen35-support",
16+
"icon": "cpu.fill",
17+
"title": "Qwen 3.5 Model Support",
18+
"description": "SAM now supports Qwen 3.5 models, including the new hybrid attention architecture. Download Qwen 3.5 models directly from SAM's model manager."
19+
},
20+
{
21+
"id": "memory-efficiency",
22+
"icon": "memorychip.fill",
23+
"title": "Lower Memory Usage",
24+
"description": "Local models now release GPU memory immediately when unloaded or when switching between models. Memory estimates for large models are also more accurate, so SAM won't try to load models that don't fit."
25+
}
26+
],
27+
"bugfixes": [
28+
{
29+
"id": "duplicate-system-prompt",
30+
"icon": "doc.on.doc.fill",
31+
"title": "Fixed Duplicate System Prompt",
32+
"description": "Fixed an issue where the system prompt could be sent twice to local models, wasting context window space and slowing down responses."
33+
},
34+
{
35+
"id": "missing-chat-template",
36+
"icon": "text.bubble.fill",
37+
"title": "Fixed Community Model Compatibility",
38+
"description": "Some community-quantized models (such as 4-bit variants) were missing configuration needed for conversation formatting. SAM now detects this and applies a compatible format automatically."
39+
}
40+
]
41+
},
342
{
443
"version": "20260330.1",
544
"release_date": "March 30, 2026",

Sources/MLXIntegration/AppleMLXAdapter.swift

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@ public class AppleMLXAdapter {
2929
private let logger = Logger(label: "com.sam.mlx.adapter")
3030
private let typeRegistry = LLMTypeRegistry.shared
3131

32+
/// ChatML fallback template for models missing chat_template in tokenizer_config.json. Covers Qwen, ChatML-family models, and most mlx-community quantizations.
33+
// swiftlint:disable:next line_length
34+
static let defaultChatMLTemplate = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
35+
3236
/// Cache loaded models to avoid reloading.
3337
private var loadedModels: [String: any LanguageModel] = [:]
3438
private var loadedTokenizers: [String: Tokenizer] = [:]
@@ -283,8 +287,23 @@ public class AppleMLXAdapter {
283287
/// DON'T pass tools to applyChatTemplate - causes system prompt bleeding. **The problem**: When tools are passed to applyChatTemplate: - Some chat templates inject tool definitions directly into the prompt - This causes system prompt content to leak into assistant responses - The model gets confused between instructions and conversation. **The solution**: Tools go in the system message content only - MLXProvider handles tool formatting in the system message - applyChatTemplate just formats the conversation structure - Clean separation: system message = instructions, chat = conversation. **VERIFIED**: mlx-swift-examples Chat.swift does NOT pass tools to the template. Tool definitions should be in the system message content (handled by MLXProvider).
284288

285289
/// Pass add_generation_prompt=true to append <|im_start|>assistant\n. Without this, Qwen2.5 doesn't know to generate a response and echoes the system prompt. The chat template has: {%- if add_generation_prompt %} {{- '<|im_start|>assistant\n' }} {%- endif %}.
286-
let additionalContext = ["add_generation_prompt": true]
287-
let inputTokens = try tokenizer.applyChatTemplate(messages: messages, tools: nil, additionalContext: additionalContext)
290+
let additionalContext: [String: any Sendable] = ["add_generation_prompt": true]
291+
292+
/// Some community-quantized models (e.g. mlx-community/*) are missing the chat_template in tokenizer_config.json. Detect this and provide a ChatML fallback so generation doesn't fail.
293+
let fallbackTemplate: ChatTemplateArgument? = tokenizer.hasChatTemplate ? nil : .literal(Self.defaultChatMLTemplate)
294+
if fallbackTemplate != nil {
295+
logger.warning("Model is missing chat_template in tokenizer_config.json, using ChatML fallback")
296+
}
297+
298+
let inputTokens = try tokenizer.applyChatTemplate(
299+
messages: messages,
300+
chatTemplate: fallbackTemplate,
301+
addGenerationPrompt: true,
302+
truncation: false,
303+
maxLength: nil,
304+
tools: nil,
305+
additionalContext: additionalContext
306+
)
288307
logger.debug("Chat template applied with add_generation_prompt=true, tokenized to \(inputTokens.count) tokens")
289308

290309
/// Decode the input tokens to see what chat template produced.
@@ -495,8 +514,23 @@ public class AppleMLXAdapter {
495514
/// DON'T pass tools to applyChatTemplate - causes system prompt bleeding (See streaming path for full documentation of this issue) Tool definitions should be in system message content (handled by MLXProvider) VERIFIED: mlx-swift-examples Chat.swift does NOT pass tools to template.
496515

497516
/// Pass add_generation_prompt=true to append <|im_start|>assistant\n. Without this, Qwen2.5 doesn't know to generate a response and echoes the system prompt.
498-
let additionalContext = ["add_generation_prompt": true]
499-
let inputTokens = try tokenizer.applyChatTemplate(messages: messages, tools: nil, additionalContext: additionalContext)
517+
let additionalContext: [String: any Sendable] = ["add_generation_prompt": true]
518+
519+
/// Some community-quantized models are missing the chat_template in tokenizer_config.json. Detect and provide ChatML fallback.
520+
let fallbackTemplate: ChatTemplateArgument? = tokenizer.hasChatTemplate ? nil : .literal(Self.defaultChatMLTemplate)
521+
if fallbackTemplate != nil {
522+
logger.warning("Model is missing chat_template in tokenizer_config.json, using ChatML fallback")
523+
}
524+
525+
let inputTokens = try tokenizer.applyChatTemplate(
526+
messages: messages,
527+
chatTemplate: fallbackTemplate,
528+
addGenerationPrompt: true,
529+
truncation: false,
530+
maxLength: nil,
531+
tools: nil,
532+
additionalContext: additionalContext
533+
)
500534
logger.debug("Chat template applied with add_generation_prompt=true, tokenized to \(inputTokens.count) tokens")
501535

502536
/// Create input.

0 commit comments

Comments
 (0)