Skip to content

Commit 0c8fa34

Browse files
committed
perf(mlx): upgrade MLX stack and add prefill tuning
- Bump mlx-swift 0.30 -> 0.31 (Metal backend improvements)
- Update mlx-swift-lm with Qwen3.5 support and 35% decode speedup
- Bump swift-transformers 1.1 -> 1.2
- Add prefillStepSize to MLXConfiguration for per-profile tuning
- Scale prefill step size by RAM profile (256-2048 tokens)
- Pass prefillStepSize through AppleMLXAdapter to generate()
- Update llama.cpp submodule
1 parent 76dd46e commit 0c8fa34

7 files changed

Lines changed: 49 additions & 18 deletions

File tree

Package.resolved

Lines changed: 24 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Package.swift

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,14 @@ let package = Package(
4848
],
4949
dependencies: [
5050
// MLX Swift for Apple Silicon AI acceleration
51-
.package(url: "https://github.com/ml-explore/mlx-swift", from: "0.30.0"),
51+
.package(url: "https://github.com/ml-explore/mlx-swift", from: "0.31.1"),
5252

5353
// MLX Swift LM - LLMs and VLMs with MLX Swift (split from mlx-swift-examples)
54-
// Pinned to main for Qwen3.5 support (PR #120) - not yet tagged
55-
.package(url: "https://github.com/ml-explore/mlx-swift-lm", revision: "b362c8a43fec27e8a11067220091cf522c7ab19c"),
54+
// Pinned to main for Qwen3.5 support + 35% gen speed fix (CPU<->GPU sync elimination)
55+
.package(url: "https://github.com/ml-explore/mlx-swift-lm", revision: "2a296f145c3129fea4290bb6e4a0a5fb458efa06"),
5656

5757
// Transformers and tokenization support
58-
.package(url: "https://github.com/huggingface/swift-transformers", from: "1.1.0"),
58+
.package(url: "https://github.com/huggingface/swift-transformers", from: "1.2.0"),
5959

6060

6161
// Additional dependencies for HTTP requests and JSON handling

Sources/APIFramework/MLXProvider.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -550,6 +550,7 @@ public class MLXProvider: AIProvider {
550550
kvGroupSize: mlxConfig.kvGroupSize,
551551
quantizedKVStart: mlxConfig.quantizedKVStart,
552552
maxKVSize: mlxConfig.maxKVSize,
553+
prefillStepSize: mlxConfig.prefillStepSize,
553554
modelId: requestToProcess.model,
554555
hideThinking: hideThinking
555556
)

Sources/ConfigurationSystem/EndpointConfigurationModels.swift

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ public struct MLXConfiguration: Codable, Equatable {
100100
/// Maximum tokens to generate per response.
101101
public var maxTokens: Int
102102

103+
/// Prompt prefill step size (larger = faster prefill, more memory). Default: 512.
104+
public var prefillStepSize: Int
105+
103106
public init(
104107
kvBits: Int? = nil,
105108
kvGroupSize: Int = 64,
@@ -110,7 +113,8 @@ public struct MLXConfiguration: Codable, Equatable {
110113
repetitionPenalty: Double? = 1.1,
111114
repetitionContextSize: Int = 20,
112115
contextLength: Int = 8192,
113-
maxTokens: Int = 2048
116+
maxTokens: Int = 2048,
117+
prefillStepSize: Int = 512
114118
) {
115119
self.kvBits = kvBits
116120
self.kvGroupSize = kvGroupSize
@@ -122,6 +126,7 @@ public struct MLXConfiguration: Codable, Equatable {
122126
self.repetitionContextSize = repetitionContextSize
123127
self.contextLength = contextLength
124128
self.maxTokens = maxTokens
129+
self.prefillStepSize = prefillStepSize
125130
}
126131

127132
/// Optimized configuration for memory-constrained systems (8GB RAM). Uses 4-bit KV cache quantization to reduce memory usage by ~75%.

Sources/ConfigurationSystem/SystemCapabilities.swift

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,8 @@ public enum RAMProfile: String, Codable, CaseIterable {
138138
repetitionPenalty: 1.1,
139139
repetitionContextSize: 20,
140140
contextLength: 4096,
141-
maxTokens: 1024
141+
maxTokens: 1024,
142+
prefillStepSize: 256
142143
)
143144

144145
case .moderate:
@@ -153,7 +154,8 @@ public enum RAMProfile: String, Codable, CaseIterable {
153154
repetitionPenalty: 1.1,
154155
repetitionContextSize: 20,
155156
contextLength: 8192,
156-
maxTokens: 1024
157+
maxTokens: 1024,
158+
prefillStepSize: 512
157159
)
158160

159161
case .balanced:
@@ -168,7 +170,8 @@ public enum RAMProfile: String, Codable, CaseIterable {
168170
repetitionPenalty: 1.1,
169171
repetitionContextSize: 20,
170172
contextLength: 16384,
171-
maxTokens: 2048
173+
maxTokens: 2048,
174+
prefillStepSize: 1024
172175
)
173176

174177
case .aggressive:
@@ -183,7 +186,8 @@ public enum RAMProfile: String, Codable, CaseIterable {
183186
repetitionPenalty: 1.1,
184187
repetitionContextSize: 20,
185188
contextLength: 32768,
186-
maxTokens: 4096
189+
maxTokens: 4096,
190+
prefillStepSize: 1024
187191
)
188192

189193
case .maximum:
@@ -198,7 +202,8 @@ public enum RAMProfile: String, Codable, CaseIterable {
198202
repetitionPenalty: 1.1,
199203
repetitionContextSize: 20,
200204
contextLength: 65536,
201-
maxTokens: 8192
205+
maxTokens: 8192,
206+
prefillStepSize: 2048
202207
)
203208
}
204209
}

Sources/MLXIntegration/AppleMLXAdapter.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,7 @@ public class AppleMLXAdapter {
270270
kvGroupSize: Int = 64,
271271
quantizedKVStart: Int = 0,
272272
maxKVSize: Int? = nil,
273+
prefillStepSize: Int = 512,
273274
modelId: String = "mlx-local",
274275
hideThinking: Bool = false
275276
) -> AsyncThrowingStream<MLXTextChunk, Error> {
@@ -312,7 +313,8 @@ public class AppleMLXAdapter {
312313
temperature: temperature,
313314
topP: topP,
314315
repetitionPenalty: repetitionPenalty,
315-
repetitionContextSize: repetitionContextSize
316+
repetitionContextSize: repetitionContextSize,
317+
prefillStepSize: prefillStepSize
316318
)
317319

318320
/// Accumulate full response for debugging.

external/llama.cpp

Submodule llama.cpp updated 613 files

0 commit comments

Comments (0)