Skip to content

Commit 24a1710

Browse files
committed
fix: update context size precedence to prioritize backend configuration over model configuration
1 parent: 9a742ad · commit: 24a1710

7 files changed

Lines changed: 88 additions & 29 deletions

File tree

pkg/inference/backends/llamacpp/llamacpp_config.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,16 +95,16 @@ func (c *Config) GetArgs(bundle types.ModelBundle, socket string, mode inference
9595
}
9696

9797
func GetContextSize(modelCfg types.ModelConfig, backendCfg *inference.BackendConfiguration) *int32 {
98-
// Model config takes precedence
98+
// Backend config takes precedence (runtime configuration via docker model configure / Ollama API num_ctx)
99+
if backendCfg != nil && backendCfg.ContextSize != nil && (*backendCfg.ContextSize == UnlimitedContextSize || *backendCfg.ContextSize > 0) {
100+
return backendCfg.ContextSize
101+
}
102+
// Fallback to model config (set at packaging time via docker model package --context-size)
99103
if modelCfg != nil {
100104
if ctxSize := modelCfg.GetContextSize(); ctxSize != nil && (*ctxSize == UnlimitedContextSize || *ctxSize > 0) {
101105
return ctxSize
102106
}
103107
}
104-
// Fallback to backend config
105-
if backendCfg != nil && backendCfg.ContextSize != nil && (*backendCfg.ContextSize == UnlimitedContextSize || *backendCfg.ContextSize > 0) {
106-
return backendCfg.ContextSize
107-
}
108108
return nil
109109
}
110110

pkg/inference/backends/llamacpp/llamacpp_config_test.go

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ func TestGetArgs(t *testing.T) {
191191
),
192192
},
193193
{
194-
name: "context size from model config",
194+
name: "backend config takes precedence over model config",
195195
mode: inference.BackendModeEmbedding,
196196
bundle: &fakeBundle{
197197
ggufPath: modelPath,
@@ -206,7 +206,25 @@ func TestGetArgs(t *testing.T) {
206206
"--model", modelPath,
207207
"--host", socket,
208208
"--embeddings",
209-
"--ctx-size", "2096", // model config takes precedence
209+
"--ctx-size", "1234", // backend config takes precedence
210+
"--jinja",
211+
),
212+
},
213+
{
214+
name: "model config used when no backend config",
215+
mode: inference.BackendModeEmbedding,
216+
bundle: &fakeBundle{
217+
ggufPath: modelPath,
218+
config: &types.Config{
219+
ContextSize: int32ptr(2096),
220+
},
221+
},
222+
config: nil,
223+
expected: append(slices.Clone(baseArgs),
224+
"--model", modelPath,
225+
"--host", socket,
226+
"--embeddings",
227+
"--ctx-size", "2096", // model config used as fallback
210228
"--jinja",
211229
),
212230
},

pkg/inference/backends/mlx/mlx_config.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,8 @@ func (c *Config) GetArgs(bundle types.ModelBundle, socket string, mode inference
6161
return args, nil
6262
}
6363

64-
// GetMaxTokens returns the max tokens (context size) from model config or backend config.
65-
// Model config takes precedence over backend config.
64+
// GetMaxTokens returns the max tokens (context size) from backend config or model config.
65+
// Backend config takes precedence over model config (runtime configuration).
6666
// Returns nil if neither is specified (MLX will use model defaults).
6767
func GetMaxTokens(modelCfg types.ModelConfig, backendCfg *inference.BackendConfiguration) *uint64 {
6868
return nil

pkg/inference/backends/sglang/sglang_config.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,18 +63,18 @@ func (c *Config) GetArgs(bundle types.ModelBundle, socket string, mode inference
6363
return args, nil
6464
}
6565

66-
// GetContextLength returns the context length (context size) from model config or backend config.
67-
// Model config takes precedence over backend config.
66+
// GetContextLength returns the context length (context size) from backend config or model config.
67+
// Backend config takes precedence over model config (runtime configuration).
6868
// Returns nil if neither is specified (SGLang will auto-derive from model).
6969
func GetContextLength(modelCfg types.ModelConfig, backendCfg *inference.BackendConfiguration) *int32 {
70-
// Model config takes precedence
71-
if cs := modelCfg.GetContextSize(); cs != nil && *cs > 0 {
72-
return cs
73-
}
74-
// Fallback to backend config
70+
// Backend config takes precedence (runtime configuration via docker model configure / Ollama API num_ctx)
7571
if backendCfg != nil && backendCfg.ContextSize != nil && *backendCfg.ContextSize > 0 {
7672
return backendCfg.ContextSize
7773
}
74+
// Fallback to model config (set at packaging time via docker model package --context-size)
75+
if cs := modelCfg.GetContextSize(); cs != nil && *cs > 0 {
76+
return cs
77+
}
7878
// Return nil to let SGLang auto-derive from model config
7979
return nil
8080
}

pkg/inference/backends/sglang/sglang_config_test.go

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ func TestGetArgs(t *testing.T) {
103103
},
104104
},
105105
{
106-
name: "with model context size (takes precedence)",
106+
name: "backend config takes precedence over model config",
107107
bundle: &mockModelBundle{
108108
safetensorsPath: "/path/to/model/model.safetensors",
109109
runtimeConfig: &types.Config{
@@ -114,6 +114,29 @@ func TestGetArgs(t *testing.T) {
114114
config: &inference.BackendConfiguration{
115115
ContextSize: int32ptr(8192),
116116
},
117+
expected: []string{
118+
"-m",
119+
"sglang.launch_server",
120+
"--model-path",
121+
"/path/to/model",
122+
"--host",
123+
"127.0.0.1",
124+
"--port",
125+
"30000",
126+
"--context-length",
127+
"8192",
128+
},
129+
},
130+
{
131+
name: "model config used when no backend config",
132+
bundle: &mockModelBundle{
133+
safetensorsPath: "/path/to/model/model.safetensors",
134+
runtimeConfig: &types.Config{
135+
ContextSize: int32ptr(16384),
136+
},
137+
},
138+
mode: inference.BackendModeCompletion,
139+
config: nil,
117140
expected: []string{
118141
"-m",
119142
"sglang.launch_server",
@@ -225,14 +248,14 @@ func TestGetContextLength(t *testing.T) {
225248
expectedValue: int32ptr(8192),
226249
},
227250
{
228-
name: "model config takes precedence",
251+
name: "backend config takes precedence",
229252
modelCfg: &types.Config{
230253
ContextSize: int32ptr(16384),
231254
},
232255
backendCfg: &inference.BackendConfiguration{
233256
ContextSize: int32ptr(4096),
234257
},
235-
expectedValue: int32ptr(16384),
258+
expectedValue: int32ptr(4096),
236259
},
237260
{
238261
name: "zero context size in backend config returns nil",

pkg/inference/backends/vllm/vllm_config.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -87,20 +87,20 @@ func (c *Config) GetArgs(bundle types.ModelBundle, socket string, mode inference
8787
return args, nil
8888
}
8989

90-
// GetMaxModelLen returns the max model length (context size) from model config or backend config.
91-
// Model config takes precedence over backend config.
90+
// GetMaxModelLen returns the max model length (context size) from backend config or model config.
91+
// Backend config takes precedence over model config (runtime configuration).
9292
// Returns nil if neither is specified (vLLM will auto-derive from model).
9393
func GetMaxModelLen(modelCfg types.ModelConfig, backendCfg *inference.BackendConfiguration) *int32 {
94-
// Model config takes precedence
94+
// Backend config takes precedence (runtime configuration via docker model configure / Ollama API num_ctx)
95+
if backendCfg != nil && backendCfg.ContextSize != nil && *backendCfg.ContextSize > 0 {
96+
return backendCfg.ContextSize
97+
}
98+
// Fallback to model config (set at packaging time via docker model package --context-size)
9599
if modelCfg != nil {
96100
if ctxSize := modelCfg.GetContextSize(); ctxSize != nil {
97101
return ctxSize
98102
}
99103
}
100-
// Fallback to backend config
101-
if backendCfg != nil && backendCfg.ContextSize != nil && *backendCfg.ContextSize > 0 {
102-
return backendCfg.ContextSize
103-
}
104104
// Return nil to let vLLM auto-derive from model config
105105
return nil
106106
}

pkg/inference/backends/vllm/vllm_config_test.go

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ func TestGetArgs(t *testing.T) {
109109
},
110110
},
111111
{
112-
name: "with model context size (takes precedence)",
112+
name: "backend config takes precedence over model config",
113113
bundle: &mockModelBundle{
114114
safetensorsPath: "/path/to/model",
115115
runtimeConfig: &types.Config{
@@ -119,6 +119,24 @@ func TestGetArgs(t *testing.T) {
119119
config: &inference.BackendConfiguration{
120120
ContextSize: int32ptr(8192),
121121
},
122+
expected: []string{
123+
"serve",
124+
"/path/to",
125+
"--uds",
126+
"/tmp/socket",
127+
"--max-model-len",
128+
"8192",
129+
},
130+
},
131+
{
132+
name: "model config used when no backend config",
133+
bundle: &mockModelBundle{
134+
safetensorsPath: "/path/to/model",
135+
runtimeConfig: &types.Config{
136+
ContextSize: int32ptr(16384),
137+
},
138+
},
139+
config: nil,
122140
expected: []string{
123141
"serve",
124142
"/path/to",
@@ -458,14 +476,14 @@ func TestGetMaxModelLen(t *testing.T) {
458476
expectedValue: int32ptr(8192),
459477
},
460478
{
461-
name: "model config takes precedence",
479+
name: "backend config takes precedence",
462480
modelCfg: &types.Config{
463481
ContextSize: int32ptr(16384),
464482
},
465483
backendCfg: &inference.BackendConfiguration{
466484
ContextSize: int32ptr(4096),
467485
},
468-
expectedValue: int32ptr(16384),
486+
expectedValue: int32ptr(4096),
469487
},
470488
}
471489

0 commit comments

Comments (0)