Skip to content

Commit 4c30269

Browse files
committed
chore: bump version to 20260331.1 and update whats-new
- Bump CFBundleShortVersionString and CFBundleVersion to 20260331.1 - Add ChatML fallback for models missing chat_template in tokenizer_config.json - Update whats-new.json with release notes for 20260331.1
1 parent 0c8fa34 commit 4c30269

3 files changed

Lines changed: 79 additions & 6 deletions

File tree

Info.plist

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@
1919
<key>CFBundlePackageType</key>
2020
<string>APPL</string>
2121
<key>CFBundleShortVersionString</key>
22-
<string>20260330.1</string>
22+
<string>20260331.1</string>
2323
<key>CFBundleVersion</key>
24-
<string>20260330.1</string>
24+
<string>20260331.1</string>
2525
<key>LSApplicationCategoryType</key>
2626
<string>public.app-category.productivity</string>
2727
<key>LSMinimumSystemVersion</key>

Resources/whats-new.json

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,44 @@
11
{
22
"releases": [
3+
{
4+
"version": "20260331.1",
5+
"release_date": "March 31, 2026",
6+
"introduction": "This release brings significant performance and memory improvements to local AI models, along with support for the latest Qwen 3.5 models.",
7+
"improvements": [
8+
{
9+
"id": "mlx-performance",
10+
"icon": "gauge.open.with.lines.needle.33percent.badge.arrow.up",
11+
"title": "Faster Local Model Inference",
12+
"description": "Upgraded the local inference engine with up to 35% faster token generation and optimized prompt processing that scales to your hardware. Conversations start faster, especially with longer system prompts."
13+
},
14+
{
15+
"id": "qwen35-support",
16+
"icon": "cpu.fill",
17+
"title": "Qwen 3.5 Model Support",
18+
"description": "SAM now supports Qwen 3.5 models, including the new hybrid attention architecture. Download Qwen 3.5 models directly from SAM's model manager."
19+
},
20+
{
21+
"id": "memory-efficiency",
22+
"icon": "memorychip.fill",
23+
"title": "Lower Memory Usage",
24+
"description": "Local models now release GPU memory immediately when unloaded or when switching between models. Memory estimates for large models are also more accurate, so SAM won't try to load models that don't fit."
25+
}
26+
],
27+
"bugfixes": [
28+
{
29+
"id": "duplicate-system-prompt",
30+
"icon": "doc.on.doc.fill",
31+
"title": "Fixed Duplicate System Prompt",
32+
"description": "Fixed an issue where the system prompt could be sent twice to local models, wasting context window space and slowing down responses."
33+
},
34+
{
35+
"id": "missing-chat-template",
36+
"icon": "text.bubble.fill",
37+
"title": "Fixed Community Model Compatibility",
38+
"description": "Some community-quantized models (such as 4-bit variants) were missing configuration needed for conversation formatting. SAM now detects this and applies a compatible format automatically."
39+
}
40+
]
41+
},
342
{
443
"version": "20260330.1",
544
"release_date": "March 30, 2026",

Sources/MLXIntegration/AppleMLXAdapter.swift

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@ public class AppleMLXAdapter {
2929
private let logger = Logger(label: "com.sam.mlx.adapter")
3030
private let typeRegistry = LLMTypeRegistry.shared
3131

32+
/// ChatML fallback template for models missing chat_template in tokenizer_config.json. Covers Qwen, ChatML-family models, and most mlx-community quantizations.
33+
// swiftlint:disable:next line_length
34+
static let defaultChatMLTemplate = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
35+
3236
/// Cache loaded models to avoid reloading.
3337
private var loadedModels: [String: any LanguageModel] = [:]
3438
private var loadedTokenizers: [String: Tokenizer] = [:]
@@ -283,8 +287,23 @@ public class AppleMLXAdapter {
283287
/// DON'T pass tools to applyChatTemplate - causes system prompt bleeding. **The problem**: When tools are passed to applyChatTemplate: - Some chat templates inject tool definitions directly into the prompt - This causes system prompt content to leak into assistant responses - The model gets confused between instructions and conversation. **The solution**: Tools go in the system message content only - MLXProvider handles tool formatting in the system message - applyChatTemplate just formats the conversation structure - Clean separation: system message = instructions, chat = conversation. **VERIFIED**: mlx-swift-examples Chat.swift does NOT pass tools to the template. Tool definitions should be in the system message content (handled by MLXProvider).
284288

285289
/// Pass add_generation_prompt=true to append <|im_start|>assistant\n. Without this, Qwen2.5 doesn't know to generate a response and echoes the system prompt. The chat template has: {%- if add_generation_prompt %} {{- '<|im_start|>assistant\n' }} {%- endif %}.
286-
let additionalContext = ["add_generation_prompt": true]
287-
let inputTokens = try tokenizer.applyChatTemplate(messages: messages, tools: nil, additionalContext: additionalContext)
290+
let additionalContext: [String: any Sendable] = ["add_generation_prompt": true]
291+
292+
/// Some community-quantized models (e.g. mlx-community/*) are missing the chat_template in tokenizer_config.json. Detect this and provide a ChatML fallback so generation doesn't fail.
293+
let fallbackTemplate: ChatTemplateArgument? = tokenizer.hasChatTemplate ? nil : .literal(Self.defaultChatMLTemplate)
294+
if fallbackTemplate != nil {
295+
logger.warning("Model is missing chat_template in tokenizer_config.json, using ChatML fallback")
296+
}
297+
298+
let inputTokens = try tokenizer.applyChatTemplate(
299+
messages: messages,
300+
chatTemplate: fallbackTemplate,
301+
addGenerationPrompt: true,
302+
truncation: false,
303+
maxLength: nil,
304+
tools: nil,
305+
additionalContext: additionalContext
306+
)
288307
logger.debug("Chat template applied with add_generation_prompt=true, tokenized to \(inputTokens.count) tokens")
289308

290309
/// Decode the input tokens to see what chat template produced.
@@ -495,8 +514,23 @@ public class AppleMLXAdapter {
495514
/// DON'T pass tools to applyChatTemplate - causes system prompt bleeding (See streaming path for full documentation of this issue) Tool definitions should be in system message content (handled by MLXProvider) VERIFIED: mlx-swift-examples Chat.swift does NOT pass tools to template.
496515

497516
/// Pass add_generation_prompt=true to append <|im_start|>assistant\n. Without this, Qwen2.5 doesn't know to generate a response and echoes the system prompt.
498-
let additionalContext = ["add_generation_prompt": true]
499-
let inputTokens = try tokenizer.applyChatTemplate(messages: messages, tools: nil, additionalContext: additionalContext)
517+
let additionalContext: [String: any Sendable] = ["add_generation_prompt": true]
518+
519+
/// Some community-quantized models are missing the chat_template in tokenizer_config.json. Detect and provide ChatML fallback.
520+
let fallbackTemplate: ChatTemplateArgument? = tokenizer.hasChatTemplate ? nil : .literal(Self.defaultChatMLTemplate)
521+
if fallbackTemplate != nil {
522+
logger.warning("Model is missing chat_template in tokenizer_config.json, using ChatML fallback")
523+
}
524+
525+
let inputTokens = try tokenizer.applyChatTemplate(
526+
messages: messages,
527+
chatTemplate: fallbackTemplate,
528+
addGenerationPrompt: true,
529+
truncation: false,
530+
maxLength: nil,
531+
tools: nil,
532+
additionalContext: additionalContext
533+
)
500534
logger.debug("Chat template applied with add_generation_prompt=true, tokenized to \(inputTokens.count) tokens")
501535

502536
/// Create input.

0 commit comments

Comments
 (0)