Skip to content

Commit 0c8fa34

Browse files
committed
perf(mlx): upgrade MLX stack and add prefill tuning
- Bump mlx-swift 0.30 -> 0.31 (Metal backend improvements)
- Update mlx-swift-lm with Qwen3.5 support and 35% decode speedup
- Bump swift-transformers 1.1 -> 1.2
- Add prefillStepSize to MLXConfiguration for per-profile tuning
- Scale prefill step size by RAM profile (256-2048 tokens)
- Pass prefillStepSize through AppleMLXAdapter to generate()
- Update llama.cpp submodule
1 parent 76dd46e commit 0c8fa34

7 files changed

Lines changed: 49 additions & 18 deletions

File tree

Package.resolved

Lines changed: 24 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Package.swift

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,14 @@ let package = Package(
4848
],
4949
dependencies: [
5050
// MLX Swift for Apple Silicon AI acceleration
51-
.package(url: "https://github.com/ml-explore/mlx-swift", from: "0.30.0"),
51+
.package(url: "https://github.com/ml-explore/mlx-swift", from: "0.31.1"),
5252

5353
// MLX Swift LM - LLMs and VLMs with MLX Swift (split from mlx-swift-examples)
54-
// Pinned to main for Qwen3.5 support (PR #120) - not yet tagged
55-
.package(url: "https://github.com/ml-explore/mlx-swift-lm", revision: "b362c8a43fec27e8a11067220091cf522c7ab19c"),
54+
// Pinned to main for Qwen3.5 support + 35% gen speed fix (CPU<->GPU sync elimination)
55+
.package(url: "https://github.com/ml-explore/mlx-swift-lm", revision: "2a296f145c3129fea4290bb6e4a0a5fb458efa06"),
5656

5757
// Transformers and tokenization support
58-
.package(url: "https://github.com/huggingface/swift-transformers", from: "1.1.0"),
58+
.package(url: "https://github.com/huggingface/swift-transformers", from: "1.2.0"),
5959

6060

6161
// Additional dependencies for HTTP requests and JSON handling

Sources/APIFramework/MLXProvider.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -550,6 +550,7 @@ public class MLXProvider: AIProvider {
550550
kvGroupSize: mlxConfig.kvGroupSize,
551551
quantizedKVStart: mlxConfig.quantizedKVStart,
552552
maxKVSize: mlxConfig.maxKVSize,
553+
prefillStepSize: mlxConfig.prefillStepSize,
553554
modelId: requestToProcess.model,
554555
hideThinking: hideThinking
555556
)

Sources/ConfigurationSystem/EndpointConfigurationModels.swift

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ public struct MLXConfiguration: Codable, Equatable {
100100
/// Maximum tokens to generate per response.
101101
public var maxTokens: Int
102102

103+
/// Prompt prefill step size (larger = faster prefill, more memory). Default: 512.
104+
public var prefillStepSize: Int
105+
103106
public init(
104107
kvBits: Int? = nil,
105108
kvGroupSize: Int = 64,
@@ -110,7 +113,8 @@ public struct MLXConfiguration: Codable, Equatable {
110113
repetitionPenalty: Double? = 1.1,
111114
repetitionContextSize: Int = 20,
112115
contextLength: Int = 8192,
113-
maxTokens: Int = 2048
116+
maxTokens: Int = 2048,
117+
prefillStepSize: Int = 512
114118
) {
115119
self.kvBits = kvBits
116120
self.kvGroupSize = kvGroupSize
@@ -122,6 +126,7 @@ public struct MLXConfiguration: Codable, Equatable {
122126
self.repetitionContextSize = repetitionContextSize
123127
self.contextLength = contextLength
124128
self.maxTokens = maxTokens
129+
self.prefillStepSize = prefillStepSize
125130
}
126131

127132
/// Optimized configuration for memory-constrained systems (8GB RAM). Uses 4-bit KV cache quantization to reduce memory usage by ~75%.

Sources/ConfigurationSystem/SystemCapabilities.swift

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,8 @@ public enum RAMProfile: String, Codable, CaseIterable {
138138
repetitionPenalty: 1.1,
139139
repetitionContextSize: 20,
140140
contextLength: 4096,
141-
maxTokens: 1024
141+
maxTokens: 1024,
142+
prefillStepSize: 256
142143
)
143144

144145
case .moderate:
@@ -153,7 +154,8 @@ public enum RAMProfile: String, Codable, CaseIterable {
153154
repetitionPenalty: 1.1,
154155
repetitionContextSize: 20,
155156
contextLength: 8192,
156-
maxTokens: 1024
157+
maxTokens: 1024,
158+
prefillStepSize: 512
157159
)
158160

159161
case .balanced:
@@ -168,7 +170,8 @@ public enum RAMProfile: String, Codable, CaseIterable {
168170
repetitionPenalty: 1.1,
169171
repetitionContextSize: 20,
170172
contextLength: 16384,
171-
maxTokens: 2048
173+
maxTokens: 2048,
174+
prefillStepSize: 1024
172175
)
173176

174177
case .aggressive:
@@ -183,7 +186,8 @@ public enum RAMProfile: String, Codable, CaseIterable {
183186
repetitionPenalty: 1.1,
184187
repetitionContextSize: 20,
185188
contextLength: 32768,
186-
maxTokens: 4096
189+
maxTokens: 4096,
190+
prefillStepSize: 1024
187191
)
188192

189193
case .maximum:
@@ -198,7 +202,8 @@ public enum RAMProfile: String, Codable, CaseIterable {
198202
repetitionPenalty: 1.1,
199203
repetitionContextSize: 20,
200204
contextLength: 65536,
201-
maxTokens: 8192
205+
maxTokens: 8192,
206+
prefillStepSize: 2048
202207
)
203208
}
204209
}

Sources/MLXIntegration/AppleMLXAdapter.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,7 @@ public class AppleMLXAdapter {
270270
kvGroupSize: Int = 64,
271271
quantizedKVStart: Int = 0,
272272
maxKVSize: Int? = nil,
273+
prefillStepSize: Int = 512,
273274
modelId: String = "mlx-local",
274275
hideThinking: Bool = false
275276
) -> AsyncThrowingStream<MLXTextChunk, Error> {
@@ -312,7 +313,8 @@ public class AppleMLXAdapter {
312313
temperature: temperature,
313314
topP: topP,
314315
repetitionPenalty: repetitionPenalty,
315-
repetitionContextSize: repetitionContextSize
316+
repetitionContextSize: repetitionContextSize,
317+
prefillStepSize: prefillStepSize
316318
)
317319

318320
/// Accumulate full response for debugging.

external/llama.cpp

Submodule llama.cpp updated 613 files

0 commit comments

Comments (0)