Skip to content

Commit 670661f

Browse files
authored
feat: Allow passing any arguments to vllm and sglang engines (NVIDIA#368)
Put the arguments in a JSON file: ``` { "dtype": "half", "trust_remote_code": true } ``` Pass it like this: ``` dynamo-run out=sglang ~/llm_models/Llama-3.2-3B-Instruct --extra-engine-args sglang_extra.json ``` Requested here ai-dynamo/dynamo#290 (`dtype`) and here ai-dynamo/dynamo#360 (`trust_remote_code`).
1 parent a03dd47 commit 670661f

15 files changed

Lines changed: 280 additions & 113 deletions

File tree

docs/guides/dynamo_run.md

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -165,14 +165,16 @@ Any example above using `out=sglang` will work, but our sglang backend is also m
165165

166166
Node 1:
167167
```
168-
dynamo-run in=http out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 0 --dist-init-addr 10.217.98.122:9876
168+
dynamo-run in=http out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 0 --leader-addr 10.217.98.122:9876
169169
```
170170

171171
Node 2:
172172
```
173-
dynamo-run in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 1 --dist-init-addr 10.217.98.122:9876
173+
dynamo-run in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llama-70B/ --tensor-parallel-size 8 --num-nodes 2 --node-rank 1 --leader-addr 10.217.98.122:9876
174174
```
175175

176+
To pass extra arguments to the sglang engine, see *Extra engine arguments* below.
177+
176178
## llama_cpp
177179

178180
- `cargo build --features llamacpp,cuda`
@@ -225,6 +227,8 @@ Node 2:
225227
dynamo-run in=none out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 1
226228
```
227229

230+
To pass extra arguments to the vllm engine, see *Extra engine arguments* below.
231+
228232
## Python bring-your-own-engine
229233

230234
You can provide your own engine in a Python file. The file must provide a generator with this signature:
@@ -434,3 +438,20 @@ The output looks like this:
434438

435439
The input defaults to `in=text`. The output defaults to the `mistralrs` engine; if that is not available, it falls back to whichever engine you have compiled in (depending on `--features`).
436440

441+
## Extra engine arguments
442+
443+
The vllm and sglang backends support passing any argument the engine accepts.
444+
445+
Put the arguments in a JSON file:
446+
```
447+
{
448+
"dtype": "half",
449+
"trust_remote_code": true
450+
}
451+
```
452+
453+
Pass it like this:
454+
```
455+
dynamo-run out=sglang ~/llm_models/Llama-3.2-3B-Instruct --extra-engine-args sglang_extra.json
456+
```
457+

launch/dynamo-run/src/flags.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
// See the License for the specific language governing permissions and
1414
// limitations under the License.
1515

16+
use std::collections::HashMap;
1617
use std::path::PathBuf;
1718
use std::str::FromStr;
1819

@@ -106,6 +107,11 @@ pub struct Flags {
106107
#[arg(long, hide = true, value_parser = parse_sglang_flags)]
107108
pub internal_sglang_process: Option<SgLangFlags>,
108109

110+
/// Additional engine-specific arguments from a JSON file.
111+
/// Contains a mapping of parameter names to values.
112+
#[arg(long)]
113+
pub extra_engine_args: Option<PathBuf>,
114+
109115
/// Everything after a `--`.
110116
/// These are the command line arguments to the python engine when using `pystr` or `pytok`.
111117
#[arg(index = 2, last = true, hide = true, allow_hyphen_values = true)]
@@ -146,9 +152,27 @@ impl Flags {
146152
out.push("--leader-addr".to_string());
147153
out.push(leader.to_string());
148154
}
155+
if let Some(extra_engine_args) = self.extra_engine_args.as_ref() {
156+
out.push("--extra-engine-args".to_string());
157+
out.push(extra_engine_args.display().to_string());
158+
}
149159
out.extend(self.last.clone());
150160
out
151161
}
162+
163+
/// Load extra engine arguments from a JSON file
164+
/// Returns a HashMap of parameter names to values
165+
pub fn load_extra_engine_args(
166+
&self,
167+
) -> anyhow::Result<Option<HashMap<String, serde_json::Value>>> {
168+
if let Some(path) = &self.extra_engine_args {
169+
let file_content = std::fs::read_to_string(path)?;
170+
let args: HashMap<String, serde_json::Value> = serde_json::from_str(&file_content)?;
171+
Ok(Some(args))
172+
} else {
173+
Ok(None)
174+
}
175+
}
152176
}
153177

154178
#[derive(Debug, Clone, Copy)]

launch/dynamo-run/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,7 @@ pub async fn run(
257257
node_conf,
258258
flags.tensor_parallel_size,
259259
flags.base_gpu_id,
260+
flags.extra_engine_args,
260261
)
261262
.await?;
262263
extra = Some(Box::pin(async move {
@@ -310,6 +311,7 @@ pub async fn run(
310311
&sock_prefix,
311312
node_conf,
312313
flags.tensor_parallel_size,
314+
flags.extra_engine_args,
313315
)
314316
.await?;
315317
extra = Some(Box::pin(async move {

launch/dynamo-run/src/main.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ Example:
3232

3333
const ZMQ_SOCKET_PREFIX: &str = "dyn";
3434

35-
const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>|none] out=[mistralrs|sglang|llamacpp|vllm|trtllm|echo_full|echo_core|pystr:<engine.py>|pytok:<engine.py>] [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0]";
35+
const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>|none] out=[mistralrs|sglang|llamacpp|vllm|trtllm|echo_full|echo_core|pystr:<engine.py>|pytok:<engine.py>] [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json]";
3636

3737
fn main() -> anyhow::Result<()> {
3838
logging::init();
@@ -68,6 +68,7 @@ fn main() -> anyhow::Result<()> {
6868
sglang_flags.pipe_fd as std::os::fd::RawFd,
6969
node_config,
7070
gpu_config,
71+
flags.extra_engine_args,
7172
);
7273
}
7374
} else {
@@ -94,6 +95,7 @@ fn main() -> anyhow::Result<()> {
9495
&model_path,
9596
node_config,
9697
flags.tensor_parallel_size,
98+
flags.extra_engine_args,
9799
);
98100
}
99101
} else {

lib/llm/src/engines/sglang.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
// See the License for the specific language governing permissions and
1414
// limitations under the License.
1515

16-
use std::path::Path;
16+
use std::path::{Path, PathBuf};
1717
use std::sync::Arc;
1818

1919
use crate::backend::ExecutionContext;
@@ -40,6 +40,8 @@ pub async fn make_engine(
4040
tensor_parallel_size: u32,
4141
// The base GPU ID to start allocating GPUs from
4242
base_gpu_id: u32,
43+
// Extra arguments to pass directly as sglang ServerArgs
44+
extra_engine_args: Option<PathBuf>,
4345
) -> pipeline_error::Result<(ExecutionContext, tokio::task::JoinHandle<()>)> {
4446
let mut engine = SgLangEngine::new(
4547
cancel_token,
@@ -48,6 +50,7 @@ pub async fn make_engine(
4850
node_conf,
4951
tensor_parallel_size,
5052
base_gpu_id,
53+
extra_engine_args,
5154
)
5255
.await?;
5356
let sglang_process = engine.take_sglang_worker_handle();

lib/llm/src/engines/sglang/engine.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
// See the License for the specific language governing permissions and
1414
// limitations under the License.
1515

16-
use std::path::Path;
16+
use std::path::{Path, PathBuf};
1717

1818
use async_stream::stream;
1919
use async_trait::async_trait;
@@ -39,6 +39,7 @@ impl SgLangEngine {
3939
node_conf: MultiNodeConfig,
4040
tensor_parallel_size: u32,
4141
base_gpu_id: u32,
42+
extra_engine_args: Option<PathBuf>,
4243
) -> anyhow::Result<Self> {
4344
let w = super::worker::start(
4445
cancel_token.clone(),
@@ -47,6 +48,7 @@ impl SgLangEngine {
4748
node_conf,
4849
tensor_parallel_size,
4950
base_gpu_id,
51+
extra_engine_args,
5052
)
5153
.await?;
5254
let engine = SgLangEngine {
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
17+
#
18+
# This file is included as a string in subprocess.rs. Most work should be done in the Rust caller.
19+
#
20+
21+
import json
22+
import logging
23+
import tempfile
24+
from multiprocessing.connection import Connection
25+
26+
from sglang.srt.entrypoints.engine import _set_envs_and_config
27+
from sglang.srt.managers.scheduler import run_scheduler_process
28+
from sglang.srt.server_args import PortArgs, ServerArgs
29+
30+
logging.basicConfig(
31+
level="DEBUG",
32+
force=True,
33+
datefmt="%Y-%m-%d %H:%M:%S",
34+
format="[%(asctime)s] %(message)s",
35+
)
36+
37+
# These can all be overridden by --extra-engine-args json file
38+
arg_map = {
39+
"model_path": f"{model_path}",
40+
"enable_metrics": False,
41+
"log_level": "debug",
42+
"log_requests": True,
43+
"tp_size": int(tp_size_str),
44+
# Multi-node
45+
"dist_init_addr": dist_init_addr if dist_init_addr != "" else None,
46+
"nnodes": int(nnodes_str),
47+
"node_rank": int(node_rank_str),
48+
}
49+
json_map = {}
50+
if extra_engine_args != "":
51+
# extra_engine_args is a filename
52+
try:
53+
with open(extra_engine_args) as f:
54+
json_map = json.load(f)
55+
except FileNotFoundError:
56+
logging.debug(f"File {extra_engine_args} not found.")
57+
except json.JSONDecodeError as e:
58+
logging.debug(f"Invalid JSON in {extra_engine_args}: {e}")
59+
logging.debug(f"Adding extra engine arguments: {json_map}")
60+
arg_map = {**arg_map, **json_map} # json_map gets precedence
61+
62+
server_args = ServerArgs(**arg_map)
63+
_set_envs_and_config(server_args)
64+
logging.debug(server_args)
65+
66+
ipc_path = f"ipc:///tmp/{socket_id}"
67+
# These must match worker.rs zmq_sockets, which is the other side
68+
port_args = PortArgs(
69+
# we don't use this one so use anything
70+
tokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
71+
# Us -> sglang
72+
scheduler_input_ipc_name=f"{ipc_path}_input_socket",
73+
# sglang -> us
74+
detokenizer_ipc_name=f"{ipc_path}_output_socket",
75+
# The port for nccl initialization (torch.dist), which we don't use
76+
nccl_port=9876,
77+
)
78+
79+
# Rank must be globally unique across nodes
80+
tp_rank = int(tp_rank_str)
81+
82+
# See nvidia-smi for GPU IDs, they run 0,1,2,etc.
83+
# In a single-node setup this is the same as rank
84+
gpu_id = int(gpu_id_str)
85+
86+
pipe_fd_int = int(pipe_fd)
87+
writer = Connection(handle=pipe_fd_int, readable=False, writable=True)
88+
89+
run_scheduler_process(server_args, port_args, gpu_id, tp_rank, None, writer)

lib/llm/src/engines/sglang/subprocess.rs

Lines changed: 14 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -14,68 +14,16 @@
1414
// limitations under the License.
1515

1616
use pyo3::{types::IntoPyDict, Python};
17-
use std::{env, os::fd::RawFd, path::Path};
17+
use std::{
18+
env,
19+
ffi::CString,
20+
os::fd::RawFd,
21+
path::{Path, PathBuf},
22+
};
1823

1924
use crate::engines::MultiNodeConfig;
2025

21-
const PY_START_ENGINE: &std::ffi::CStr = cr#"
22-
from multiprocessing.connection import Connection
23-
import signal
24-
import tempfile
25-
import logging
26-
27-
from sglang.srt.server_args import ServerArgs, PortArgs
28-
import sglang as sgl
29-
from sglang.srt.managers.scheduler import run_scheduler_process
30-
from sglang.srt.entrypoints.engine import _set_envs_and_config
31-
32-
33-
server_args = ServerArgs(
34-
model_path=f"{model_path}",
35-
enable_metrics = False,
36-
log_level = "debug",
37-
log_requests = True,
38-
tp_size = int(tp_size_str),
39-
# Multi-node
40-
dist_init_addr = dist_init_addr if dist_init_addr != "" else None,
41-
nnodes = int(nnodes_str),
42-
node_rank = int(node_rank_str),
43-
)
44-
logging.basicConfig(
45-
level="DEBUG",
46-
force=True,
47-
datefmt="%Y-%m-%d %H:%M:%S",
48-
format=f"[%(asctime)s] %(message)s",
49-
)
50-
_set_envs_and_config(server_args)
51-
52-
logging.debug(server_args)
53-
54-
ipc_path = f"ipc:///tmp/{socket_id}";
55-
# These must match worker.rs zmq_sockets, which is the other side
56-
port_args = PortArgs(
57-
# we don't use this one so use anything
58-
tokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
59-
# Us -> sglang
60-
scheduler_input_ipc_name=f"{ipc_path}_input_socket",
61-
# sglang -> us
62-
detokenizer_ipc_name=f"{ipc_path}_output_socket",
63-
# The port for nccl initialization (torch.dist), which we don't use
64-
nccl_port=9876,
65-
)
66-
67-
# Rank must be globally unique across nodes
68-
tp_rank = int(tp_rank_str)
69-
70-
# See nvidia-smi for GPU IDs, they run 0,1,2,etc.
71-
# In a single-node setup this is the same as rank
72-
gpu_id = int(gpu_id_str)
73-
74-
pipe_fd_int = int(pipe_fd)
75-
writer = Connection(handle=pipe_fd_int, readable=False, writable=True)
76-
77-
run_scheduler_process(server_args, port_args, gpu_id, tp_rank, None, writer)
78-
"#;
26+
const PY_START_ENGINE: &str = include_str!("sglang_inc.py");
7927

8028
/// Start the Python sglang engine that listens on zmq socket
8129
/// This is called by running `nio --internal-sglang-process
@@ -91,12 +39,17 @@ pub fn run_subprocess(
9139
node_config: MultiNodeConfig,
9240
// Multi GPU. Usually Default::default
9341
gpu_config: super::MultiGPUConfig,
42+
// Allow passing any arguments to sglang
43+
extra_engine_args: Option<PathBuf>,
9444
) -> anyhow::Result<()> {
9545
pyo3::prepare_freethreaded_python(); // or enable feature "auto-initialize"
9646
if let Ok(venv) = env::var("VIRTUAL_ENV") {
9747
let _ = Python::with_gil(|py| crate::engines::fix_venv(venv, py));
9848
}
9949
let dir = model_path.display().to_string();
50+
let extra_engine_args_str = &extra_engine_args
51+
.map(|p| p.display().to_string())
52+
.unwrap_or_default();
10053
Python::with_gil(|py| {
10154
let locals = [
10255
("socket_id", socket_id),
@@ -109,10 +62,11 @@ pub fn run_subprocess(
10962
("nnodes_str", &node_config.num_nodes.to_string()),
11063
("node_rank_str", &node_config.node_rank.to_string()),
11164
("dist_init_addr", &node_config.leader_addr),
65+
("extra_engine_args", extra_engine_args_str),
11266
]
11367
.into_py_dict(py)
11468
.unwrap();
115-
if let Err(err) = py.run(PY_START_ENGINE, None, Some(&locals)) {
69+
if let Err(err) = py.run(CString::new(PY_START_ENGINE)?.as_ref(), None, Some(&locals)) {
11670
anyhow::bail!("sglang engine run error: {err}");
11771
}
11872
tracing::info!("sglang subprocess exit");

0 commit comments

Comments
 (0)