Skip to content

Commit 670661f

Browse files
authored
feat: Allow passing any arguments to vllm and sglang engines (NVIDIA#368)
Put the arguments in a JSON file: ``` { "dtype": "half", "trust_remote_code": true } ``` Pass it like this: ``` dynamo-run out=sglang ~/llm_models/Llama-3.2-3B-Instruct --extra-engine-args sglang_extra.json ``` Requested here ai-dynamo/dynamo#290 (`dtype`) and here ai-dynamo/dynamo#360 (`trust_remote_code`).
1 parent a03dd47 commit 670661f

15 files changed

Lines changed: 280 additions & 113 deletions

File tree

docs/guides/dynamo_run.md

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -165,14 +165,16 @@ Any example above using `out=sglang` will work, but our sglang backend is also m
165165

166166
Node 1:
167167
```
168-
dynamo-run in=http out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 0 --dist-init-addr 10.217.98.122:9876
168+
dynamo-run in=http out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 0 --leader-addr 10.217.98.122:9876
169169
```
170170

171171
Node 2:
172172
```
173-
dynamo-run in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 1 --dist-init-addr 10.217.98.122:9876
173+
dynamo-run in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 1 --leader-addr 10.217.98.122:9876
174174
```
175175

176+
To pass extra arguments to the sglang engine, see *Extra engine arguments* below.
177+
176178
## llama_cpp
177179

178180
- `cargo build --features llamacpp,cuda`
@@ -225,6 +227,8 @@ Node 2:
225227
dynamo-run in=none out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 1
226228
```
227229

230+
To pass extra arguments to the vllm engine, see *Extra engine arguments* below.
231+
228232
## Python bring-your-own-engine
229233

230234
You can provide your own engine in a Python file. The file must provide a generator with this signature:
@@ -434,3 +438,20 @@ The output looks like this:
434438

435439
The input defaults to `in=text`. The output defaults to the `mistralrs` engine; if that is not available, it falls back to whichever engine you have compiled in (depending on `--features`).
436440

441+
## Extra engine arguments
442+
443+
The vllm and sglang backends support passing any argument the engine accepts.
444+
445+
Put the arguments in a JSON file:
446+
```
447+
{
448+
"dtype": "half",
449+
"trust_remote_code": true
450+
}
451+
```
452+
453+
Pass it like this:
454+
```
455+
dynamo-run out=sglang ~/llm_models/Llama-3.2-3B-Instruct --extra-engine-args sglang_extra.json
456+
```
457+

launch/dynamo-run/src/flags.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
// See the License for the specific language governing permissions and
1414
// limitations under the License.
1515

16+
use std::collections::HashMap;
1617
use std::path::PathBuf;
1718
use std::str::FromStr;
1819

@@ -106,6 +107,11 @@ pub struct Flags {
106107
#[arg(long, hide = true, value_parser = parse_sglang_flags)]
107108
pub internal_sglang_process: Option<SgLangFlags>,
108109

110+
/// Additional engine-specific arguments from a JSON file.
111+
/// Contains a mapping of parameter names to values.
112+
#[arg(long)]
113+
pub extra_engine_args: Option<PathBuf>,
114+
109115
/// Everything after a `--`.
110116
/// These are the command line arguments to the python engine when using `pystr` or `pytok`.
111117
#[arg(index = 2, last = true, hide = true, allow_hyphen_values = true)]
@@ -146,9 +152,27 @@ impl Flags {
146152
out.push("--leader-addr".to_string());
147153
out.push(leader.to_string());
148154
}
155+
if let Some(extra_engine_args) = self.extra_engine_args.as_ref() {
156+
out.push("--extra-engine-args".to_string());
157+
out.push(extra_engine_args.display().to_string());
158+
}
149159
out.extend(self.last.clone());
150160
out
151161
}
162+
163+
/// Load extra engine arguments from a JSON file
164+
/// Returns a HashMap of parameter names to values
165+
pub fn load_extra_engine_args(
166+
&self,
167+
) -> anyhow::Result<Option<HashMap<String, serde_json::Value>>> {
168+
if let Some(path) = &self.extra_engine_args {
169+
let file_content = std::fs::read_to_string(path)?;
170+
let args: HashMap<String, serde_json::Value> = serde_json::from_str(&file_content)?;
171+
Ok(Some(args))
172+
} else {
173+
Ok(None)
174+
}
175+
}
152176
}
153177

154178
#[derive(Debug, Clone, Copy)]

launch/dynamo-run/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,7 @@ pub async fn run(
257257
node_conf,
258258
flags.tensor_parallel_size,
259259
flags.base_gpu_id,
260+
flags.extra_engine_args,
260261
)
261262
.await?;
262263
extra = Some(Box::pin(async move {
@@ -310,6 +311,7 @@ pub async fn run(
310311
&sock_prefix,
311312
node_conf,
312313
flags.tensor_parallel_size,
314+
flags.extra_engine_args,
313315
)
314316
.await?;
315317
extra = Some(Box::pin(async move {

launch/dynamo-run/src/main.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ Example:
3232

3333
const ZMQ_SOCKET_PREFIX: &str = "dyn";
3434

35-
const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>|none] out=[mistralrs|sglang|llamacpp|vllm|trtllm|echo_full|echo_core|pystr:<engine.py>|pytok:<engine.py>] [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0]";
35+
const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>|none] out=[mistralrs|sglang|llamacpp|vllm|trtllm|echo_full|echo_core|pystr:<engine.py>|pytok:<engine.py>] [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json]";
3636

3737
fn main() -> anyhow::Result<()> {
3838
logging::init();
@@ -68,6 +68,7 @@ fn main() -> anyhow::Result<()> {
6868
sglang_flags.pipe_fd as std::os::fd::RawFd,
6969
node_config,
7070
gpu_config,
71+
flags.extra_engine_args,
7172
);
7273
}
7374
} else {
@@ -94,6 +95,7 @@ fn main() -> anyhow::Result<()> {
9495
&model_path,
9596
node_config,
9697
flags.tensor_parallel_size,
98+
flags.extra_engine_args,
9799
);
98100
}
99101
} else {

lib/llm/src/engines/sglang.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
// See the License for the specific language governing permissions and
1414
// limitations under the License.
1515

16-
use std::path::Path;
16+
use std::path::{Path, PathBuf};
1717
use std::sync::Arc;
1818

1919
use crate::backend::ExecutionContext;
@@ -40,6 +40,8 @@ pub async fn make_engine(
4040
tensor_parallel_size: u32,
4141
// The base GPU ID to start allocating GPUs from
4242
base_gpu_id: u32,
43+
// Extra arguments to pass directly as sglang ServerArgs
44+
extra_engine_args: Option<PathBuf>,
4345
) -> pipeline_error::Result<(ExecutionContext, tokio::task::JoinHandle<()>)> {
4446
let mut engine = SgLangEngine::new(
4547
cancel_token,
@@ -48,6 +50,7 @@ pub async fn make_engine(
4850
node_conf,
4951
tensor_parallel_size,
5052
base_gpu_id,
53+
extra_engine_args,
5154
)
5255
.await?;
5356
let sglang_process = engine.take_sglang_worker_handle();

lib/llm/src/engines/sglang/engine.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
// See the License for the specific language governing permissions and
1414
// limitations under the License.
1515

16-
use std::path::Path;
16+
use std::path::{Path, PathBuf};
1717

1818
use async_stream::stream;
1919
use async_trait::async_trait;
@@ -39,6 +39,7 @@ impl SgLangEngine {
3939
node_conf: MultiNodeConfig,
4040
tensor_parallel_size: u32,
4141
base_gpu_id: u32,
42+
extra_engine_args: Option<PathBuf>,
4243
) -> anyhow::Result<Self> {
4344
let w = super::worker::start(
4445
cancel_token.clone(),
@@ -47,6 +48,7 @@ impl SgLangEngine {
4748
node_conf,
4849
tensor_parallel_size,
4950
base_gpu_id,
51+
extra_engine_args,
5052
)
5153
.await?;
5254
let engine = SgLangEngine {
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
17+
#
18+
# This file is included as a string in subprocess.rs. Most work should be done in the Rust caller.
19+
#
20+
21+
import json
22+
import logging
23+
import tempfile
24+
from multiprocessing.connection import Connection
25+
26+
from sglang.srt.entrypoints.engine import _set_envs_and_config
27+
from sglang.srt.managers.scheduler import run_scheduler_process
28+
from sglang.srt.server_args import PortArgs, ServerArgs
29+
30+
logging.basicConfig(
31+
level="DEBUG",
32+
force=True,
33+
datefmt="%Y-%m-%d %H:%M:%S",
34+
format="[%(asctime)s] %(message)s",
35+
)
36+
37+
# These can all be overridden by --extra-engine-args json file
38+
arg_map = {
39+
"model_path": f"{model_path}",
40+
"enable_metrics": False,
41+
"log_level": "debug",
42+
"log_requests": True,
43+
"tp_size": int(tp_size_str),
44+
# Multi-node
45+
"dist_init_addr": dist_init_addr if dist_init_addr != "" else None,
46+
"nnodes": int(nnodes_str),
47+
"node_rank": int(node_rank_str),
48+
}
49+
json_map = {}
50+
if extra_engine_args != "":
51+
# extra_engine_args is a filename
52+
try:
53+
with open(extra_engine_args) as f:
54+
json_map = json.load(f)
55+
except FileNotFoundError:
56+
logging.debug(f"File {extra_engine_args} not found.")
57+
except json.JSONDecodeError as e:
58+
logging.debug(f"Invalid JSON in {extra_engine_args}: {e}")
59+
logging.debug(f"Adding extra engine arguments: {json_map}")
60+
arg_map = {**arg_map, **json_map} # json_map gets precedence
61+
62+
server_args = ServerArgs(**arg_map)
63+
_set_envs_and_config(server_args)
64+
logging.debug(server_args)
65+
66+
ipc_path = f"ipc:///tmp/{socket_id}"
67+
# These must match worker.rs zmq_sockets, which is the other side
68+
port_args = PortArgs(
69+
# we don't use this one so use anything
70+
tokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
71+
# Us -> sglang
72+
scheduler_input_ipc_name=f"{ipc_path}_input_socket",
73+
# sglang -> us
74+
detokenizer_ipc_name=f"{ipc_path}_output_socket",
75+
# The port for nccl initialization (torch.dist), which we don't use
76+
nccl_port=9876,
77+
)
78+
79+
# Rank must be globally unique across nodes
80+
tp_rank = int(tp_rank_str)
81+
82+
# See nvidia-smi for GPU IDs, they run 0,1,2,etc.
83+
# In a single-node setup this is the same as rank
84+
gpu_id = int(gpu_id_str)
85+
86+
pipe_fd_int = int(pipe_fd)
87+
writer = Connection(handle=pipe_fd_int, readable=False, writable=True)
88+
89+
run_scheduler_process(server_args, port_args, gpu_id, tp_rank, None, writer)

lib/llm/src/engines/sglang/subprocess.rs

Lines changed: 14 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -14,68 +14,16 @@
1414
// limitations under the License.
1515

1616
use pyo3::{types::IntoPyDict, Python};
17-
use std::{env, os::fd::RawFd, path::Path};
17+
use std::{
18+
env,
19+
ffi::CString,
20+
os::fd::RawFd,
21+
path::{Path, PathBuf},
22+
};
1823

1924
use crate::engines::MultiNodeConfig;
2025

21-
const PY_START_ENGINE: &std::ffi::CStr = cr#"
22-
from multiprocessing.connection import Connection
23-
import signal
24-
import tempfile
25-
import logging
26-
27-
from sglang.srt.server_args import ServerArgs, PortArgs
28-
import sglang as sgl
29-
from sglang.srt.managers.scheduler import run_scheduler_process
30-
from sglang.srt.entrypoints.engine import _set_envs_and_config
31-
32-
33-
server_args = ServerArgs(
34-
model_path=f"{model_path}",
35-
enable_metrics = False,
36-
log_level = "debug",
37-
log_requests = True,
38-
tp_size = int(tp_size_str),
39-
# Multi-node
40-
dist_init_addr = dist_init_addr if dist_init_addr != "" else None,
41-
nnodes = int(nnodes_str),
42-
node_rank = int(node_rank_str),
43-
)
44-
logging.basicConfig(
45-
level="DEBUG",
46-
force=True,
47-
datefmt="%Y-%m-%d %H:%M:%S",
48-
format=f"[%(asctime)s] %(message)s",
49-
)
50-
_set_envs_and_config(server_args)
51-
52-
logging.debug(server_args)
53-
54-
ipc_path = f"ipc:///tmp/{socket_id}";
55-
# These must match worker.rs zmq_sockets, which is the other side
56-
port_args = PortArgs(
57-
# we don't use this one so use anything
58-
tokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
59-
# Us -> sglang
60-
scheduler_input_ipc_name=f"{ipc_path}_input_socket",
61-
# sglang -> us
62-
detokenizer_ipc_name=f"{ipc_path}_output_socket",
63-
# The port for nccl initialization (torch.dist), which we don't use
64-
nccl_port=9876,
65-
)
66-
67-
# Rank must be globally unique across nodes
68-
tp_rank = int(tp_rank_str)
69-
70-
# See nvidia-smi for GPU IDs, they run 0,1,2,etc.
71-
# In a single-node setup this is the same as rank
72-
gpu_id = int(gpu_id_str)
73-
74-
pipe_fd_int = int(pipe_fd)
75-
writer = Connection(handle=pipe_fd_int, readable=False, writable=True)
76-
77-
run_scheduler_process(server_args, port_args, gpu_id, tp_rank, None, writer)
78-
"#;
26+
const PY_START_ENGINE: &str = include_str!("sglang_inc.py");
7927

8028
/// Start the Python sglang engine that listens on zmq socket
8129
/// This is called by running `nio --internal-sglang-process
@@ -91,12 +39,17 @@ pub fn run_subprocess(
9139
node_config: MultiNodeConfig,
9240
// Multi GPU. Usually Default::default
9341
gpu_config: super::MultiGPUConfig,
42+
// Allow passing any arguments to sglang
43+
extra_engine_args: Option<PathBuf>,
9444
) -> anyhow::Result<()> {
9545
pyo3::prepare_freethreaded_python(); // or enable feature "auto-initialize"
9646
if let Ok(venv) = env::var("VIRTUAL_ENV") {
9747
let _ = Python::with_gil(|py| crate::engines::fix_venv(venv, py));
9848
}
9949
let dir = model_path.display().to_string();
50+
let extra_engine_args_str = &extra_engine_args
51+
.map(|p| p.display().to_string())
52+
.unwrap_or_default();
10053
Python::with_gil(|py| {
10154
let locals = [
10255
("socket_id", socket_id),
@@ -109,10 +62,11 @@ pub fn run_subprocess(
10962
("nnodes_str", &node_config.num_nodes.to_string()),
11063
("node_rank_str", &node_config.node_rank.to_string()),
11164
("dist_init_addr", &node_config.leader_addr),
65+
("extra_engine_args", extra_engine_args_str),
11266
]
11367
.into_py_dict(py)
11468
.unwrap();
115-
if let Err(err) = py.run(PY_START_ENGINE, None, Some(&locals)) {
69+
if let Err(err) = py.run(CString::new(PY_START_ENGINE)?.as_ref(), None, Some(&locals)) {
11670
anyhow::bail!("sglang engine run error: {err}");
11771
}
11872
tracing::info!("sglang subprocess exit");

0 commit comments

Comments
 (0)