Skip to content

Commit c7067fc

Browse files
authored
feat: Build pre-processor from GGUF (NVIDIA#344)
This lets us do: ``` dynamo-run out=llamacpp <gguf_file> ``` Previously a `--model-config <hf-repo>` was also required, to configure our tokenizer.
1 parent d29f7fc commit c7067fc

25 files changed

Lines changed: 2321 additions & 186 deletions

File tree

Cargo.lock

Lines changed: 353 additions & 36 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/guides/dynamo_run.md

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -177,12 +177,7 @@ dynamo-run in=none out=sglang --model-path ~/llm_models/DeepSeek-R1-Distill-Llam
177177

178178
- `cargo build --features llamacpp,cuda`
179179

180-
- `dynamo-run out=llama_cpp --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/`
181-
182-
The extra `--model-config` flag is because:
183-
- llama_cpp only runs GGUF
184-
- We send it tokens, meaning we do the tokenization ourselves, so we need a tokenizer
185-
- We don't yet read it out of the GGUF (TODO), so we need an HF repo with `tokenizer.json` et al
180+
- `dynamo-run out=llama_cpp ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf`
186181

187182
If the build step also builds llama_cpp libraries into the same folder as the binary ("libllama.so", "libggml.so", "libggml-base.so", "libggml-cpu.so", "libggml-cuda.so"), then `dynamo-run` will need to find those at runtime. Set `LD_LIBRARY_PATH`, and be sure to deploy them alongside the `dynamo-run` binary.
188183

@@ -215,7 +210,7 @@ Run (still inside that virtualenv) - HF repo:
215210

216211
Run (still inside that virtualenv) - GGUF:
217212
```
218-
./dynamo-run in=http out=vllm --model-path ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf --model-config ~/llm_models/Llama-3.2-3B-Instruct/
213+
./dynamo-run in=http out=vllm ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf
219214
```
220215

221216
+ Multi-node:

launch/dynamo-run/src/input/common.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,9 @@ pub async fn prepare_engine(
7474
let preprocessor = OpenAIPreprocessor::new(*card.clone())
7575
.await?
7676
.into_operator();
77-
let backend = Backend::from_mdc(*card.clone()).await?.into_operator();
77+
let backend = Backend::from_tokenizer(card.tokenizer_hf()?)
78+
.await?
79+
.into_operator();
7880
let engine = ServiceBackend::from_engine(inner_engine);
7981

8082
let pipeline = frontend

launch/dynamo-run/src/lib.rs

Lines changed: 28 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -126,46 +126,49 @@ pub async fn run(
126126
// Load the model deployment card, if any
127127
// Only used by some engines, so without those feature flags it's unused.
128128
#[allow(unused_variables)]
129-
let (maybe_card_path, maybe_card) = match (&model_path, &flags.model_config) {
129+
let maybe_card = match (&model_path, &flags.model_config) {
130130
// --model-config takes precedence
131131
(_, Some(model_config)) => {
132-
let card =
133-
match ModelDeploymentCard::from_local_path(model_config, model_name.as_deref())
134-
.await
135-
{
136-
Ok(card) => Some(card),
137-
Err(e) => {
138-
tracing::error!(
139-
"Failed to load model card from config path {}: {}",
140-
model_config.display(),
141-
e
142-
);
143-
None
144-
}
145-
};
146-
(Some(model_config.clone()), card)
132+
match ModelDeploymentCard::from_local_path(model_config, model_name.as_deref()).await {
133+
Ok(card) => Some(card),
134+
Err(e) => {
135+
tracing::error!(
136+
"Failed to load model card from --model-config path {}: {e}",
137+
model_config.display(),
138+
);
139+
None
140+
}
141+
}
147142
}
148143
// If --model-path is an HF repo use that
149144
(Some(model_path), _) if model_path.is_dir() => {
150-
let card = match ModelDeploymentCard::from_local_path(model_path, model_name.as_deref())
151-
.await
152-
{
145+
match ModelDeploymentCard::from_local_path(model_path, model_name.as_deref()).await {
153146
Ok(card) => Some(card),
154147
Err(e) => {
155148
tracing::error!(
156-
"Failed to load model card from model path {}: {}",
149+
"Failed to load model card from --model-path {}: {e}",
157150
model_path.display(),
158-
e
159151
);
160152
None
161153
}
162-
};
163-
(Some(model_path.clone()), card)
154+
}
155+
}
156+
(Some(model_path), _) if model_path.is_file() => {
157+
match ModelDeploymentCard::from_gguf(model_path, model_name.as_deref()).await {
158+
Ok(card) => Some(card),
159+
Err(e) => {
160+
tracing::error!(
161+
"Failed to load model card from GGUF {}: {e}",
162+
model_path.display(),
163+
);
164+
None
165+
}
166+
}
164167
}
165168
// Otherwise we don't have one, but we only need it if we're tokenizing
166169
_ => {
167170
tracing::debug!("No model card path provided (neither --model-config nor a directory in --model-path)");
168-
(None, None)
171+
None
169172
}
170173
};
171174

@@ -276,17 +279,9 @@ pub async fn run(
276279
"out=vllm requires flag --model-path=<full-path-to-hf-repo-or-model-gguf>"
277280
);
278281
};
279-
let Some(card_path) = maybe_card_path else {
280-
// If we have a gguf we also need a model card because we don't currently parse
281-
// tokenizer et al out of gguf.
282-
anyhow::bail!(
283-
"Running GGUF files also requires a `--model-config` for the tokenizer et al."
284-
);
285-
};
286282
let Some(card) = maybe_card.clone() else {
287283
anyhow::bail!(
288-
"Failed to load model card: either unsupported HuggingFace repo format \
289-
or for GGUF files --model-config is missing."
284+
"Unable to build tokenizer. out=vllm requires --model-path to be an HF repo with fast tokenizer (tokenizer.json) or a GGUF file"
290285
);
291286
};
292287
let Some(sock_prefix) = zmq_socket_prefix else {
@@ -311,7 +306,6 @@ pub async fn run(
311306
// vllm multi-node only the leader runs vllm
312307
let (engine, vllm_future) = vllm::make_leader_engine(
313308
cancel_token.clone(),
314-
&card_path,
315309
&model_path,
316310
&sock_prefix,
317311
node_conf,

launch/dynamo-run/src/main.rs

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -80,12 +80,6 @@ fn main() -> anyhow::Result<()> {
8080
let Some(model_path) = flags.model_path_flag else {
8181
anyhow::bail!("vllm subprocess requires --model-path flag");
8282
};
83-
let Some(model_config) = flags.model_config else {
84-
anyhow::bail!("vllm subprocess requires --model-config");
85-
};
86-
if !model_config.is_dir() {
87-
anyhow::bail!("vllm subprocess requires model config path to be a directory containing tokenizer.json, config.json, etc");
88-
}
8983
if cfg!(feature = "vllm") {
9084
#[cfg(feature = "vllm")]
9185
{
@@ -97,7 +91,6 @@ fn main() -> anyhow::Result<()> {
9791
};
9892
return vllm::run_subprocess(
9993
ZMQ_SOCKET_PREFIX,
100-
&model_config,
10194
&model_path,
10295
node_config,
10396
flags.tensor_parallel_size,

0 commit comments

Comments
 (0)