Skip to content

Commit c7067fc

Browse files
authored
feat: Build pre-processor from GGUF (NVIDIA#344)
This lets us do: ``` dynamo-run out=llamacpp <gguf_file> ``` Previously a `--model-config <hf-repo>` was also required, to configure our tokenizer.
1 parent d29f7fc commit c7067fc

25 files changed

Lines changed: 2321 additions & 186 deletions

File tree

Cargo.lock

Lines changed: 353 additions & 36 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/guides/dynamo_run.md

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -177,12 +177,7 @@ dynamo-run in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llam
177177

178178
- `cargo build --features llamacpp,cuda`
179179

180-
- `dynamo-run out=llama_cpp --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/`
181-
182-
The extra `--model-config` flag is because:
183-
- llama_cpp only runs GGUF
184-
- We send it tokens, meaning we do the tokenization ourselves, so we need a tokenizer
185-
- We don't yet read it out of the GGUF (TODO), so we need an HF repo with `tokenizer.json` et al
180+
- `dynamo-run out=llama_cpp ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf`
186181

187182
If the build step also builds llama_cpp libraries into the same folder as the binary ("libllama.so", "libggml.so", "libggml-base.so", "libggml-cpu.so", "libggml-cuda.so"), then `dynamo-run` will need to find those at runtime. Set `LD_LIBRARY_PATH`, and be sure to deploy them alongside the `dynamo-run` binary.
188183

@@ -215,7 +210,7 @@ Run (still inside that virtualenv) - HF repo:
215210

216211
Run (still inside that virtualenv) - GGUF:
217212
```
218-
./dynamo-run in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/
213+
./dynamo-run in=http out=vllm ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf
219214
```
220215

221216
+ Multi-node:

launch/dynamo-run/src/input/common.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,9 @@ pub async fn prepare_engine(
7474
let preprocessor = OpenAIPreprocessor::new(*card.clone())
7575
.await?
7676
.into_operator();
77-
let backend = Backend::from_mdc(*card.clone()).await?.into_operator();
77+
let backend = Backend::from_tokenizer(card.tokenizer_hf()?)
78+
.await?
79+
.into_operator();
7880
let engine = ServiceBackend::from_engine(inner_engine);
7981

8082
let pipeline = frontend

launch/dynamo-run/src/lib.rs

Lines changed: 28 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -126,46 +126,49 @@ pub async fn run(
126126
// Load the model deployment card, if any
127127
// Only used by some engines, so without those feature flags it's unused.
128128
#[allow(unused_variables)]
129-
let (maybe_card_path, maybe_card) = match (&model_path, &flags.model_config) {
129+
let maybe_card = match (&model_path, &flags.model_config) {
130130
// --model-config takes precedence
131131
(_, Some(model_config)) => {
132-
let card =
133-
match ModelDeploymentCard::from_local_path(model_config, model_name.as_deref())
134-
.await
135-
{
136-
Ok(card) => Some(card),
137-
Err(e) => {
138-
tracing::error!(
139-
"Failed to load model card from config path {}: {}",
140-
model_config.display(),
141-
e
142-
);
143-
None
144-
}
145-
};
146-
(Some(model_config.clone()), card)
132+
match ModelDeploymentCard::from_local_path(model_config, model_name.as_deref()).await {
133+
Ok(card) => Some(card),
134+
Err(e) => {
135+
tracing::error!(
136+
"Failed to load model card from --model-config path {}: {e}",
137+
model_config.display(),
138+
);
139+
None
140+
}
141+
}
147142
}
148143
// If --model-path is an HF repo use that
149144
(Some(model_path), _) if model_path.is_dir() => {
150-
let card = match ModelDeploymentCard::from_local_path(model_path, model_name.as_deref())
151-
.await
152-
{
145+
match ModelDeploymentCard::from_local_path(model_path, model_name.as_deref()).await {
153146
Ok(card) => Some(card),
154147
Err(e) => {
155148
tracing::error!(
156-
"Failed to load model card from model path {}: {}",
149+
"Failed to load model card from --model-path {}: {e}",
157150
model_path.display(),
158-
e
159151
);
160152
None
161153
}
162-
};
163-
(Some(model_path.clone()), card)
154+
}
155+
}
156+
(Some(model_path), _) if model_path.is_file() => {
157+
match ModelDeploymentCard::from_gguf(model_path, model_name.as_deref()).await {
158+
Ok(card) => Some(card),
159+
Err(e) => {
160+
tracing::error!(
161+
"Failed to load model card from GGUF {}: {e}",
162+
model_path.display(),
163+
);
164+
None
165+
}
166+
}
164167
}
165168
// Otherwise we don't have one, but we only need it if we're tokenizing
166169
_ => {
167170
tracing::debug!("No model card path provided (neither --model-config nor a directory in --model-path)");
168-
(None, None)
171+
None
169172
}
170173
};
171174

@@ -276,17 +279,9 @@ pub async fn run(
276279
"out=vllm requires flag --model-path=<full-path-to-hf-repo-or-model-gguf>"
277280
);
278281
};
279-
let Some(card_path) = maybe_card_path else {
280-
// If we have a gguf we also need a model card because we don't currently parse
281-
// tokenizer et al out of gguf.
282-
anyhow::bail!(
283-
"Running GGUF files also requires a `--model-config` for the tokenizer et al."
284-
);
285-
};
286282
let Some(card) = maybe_card.clone() else {
287283
anyhow::bail!(
288-
"Failed to load model card: either unsupported HuggingFace repo format \
289-
or for GGUF files --model-config is missing."
284+
"Unable to build tokenizer. out=vllm requires --model-path to be an HF repo with fast tokenizer (tokenizer.json) or a GGUF file"
290285
);
291286
};
292287
let Some(sock_prefix) = zmq_socket_prefix else {
@@ -311,7 +306,6 @@ pub async fn run(
311306
// vllm multi-node only the leader runs vllm
312307
let (engine, vllm_future) = vllm::make_leader_engine(
313308
cancel_token.clone(),
314-
&card_path,
315309
&model_path,
316310
&sock_prefix,
317311
node_conf,

launch/dynamo-run/src/main.rs

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -80,12 +80,6 @@ fn main() -> anyhow::Result<()> {
8080
let Some(model_path) = flags.model_path_flag else {
8181
anyhow::bail!("vllm subprocess requires --model-path flag");
8282
};
83-
let Some(model_config) = flags.model_config else {
84-
anyhow::bail!("vllm subprocess requires --model-config");
85-
};
86-
if !model_config.is_dir() {
87-
anyhow::bail!("vllm subprocess requires model config path to be a directory containing tokenizer.json, config.json, etc");
88-
}
8983
if cfg!(feature = "vllm") {
9084
#[cfg(feature = "vllm")]
9185
{
@@ -97,7 +91,6 @@ fn main() -> anyhow::Result<()> {
9791
};
9892
return vllm::run_subprocess(
9993
ZMQ_SOCKET_PREFIX,
100-
&model_config,
10194
&model_path,
10295
node_config,
10396
flags.tensor_parallel_size,

0 commit comments

Comments
 (0)