Skip to content

Commit 054add2

Browse files
kevincheng2 and claude authored
[Refactor] Replace skip_mm_profiling with deploy_modality=text to skip mm profiling (#7088)
原 `--skip-mm-profiling` 参数与已有的 `deploy_modality` 参数功能存在语义重叠: 当以纯文本模式(`deploy_modality=text`)部署时,本就不需要为多模态 token 预留显存。 引入独立参数增加了配置复杂度,复用 `deploy_modality` 更加直观和一致。 - `fastdeploy/engine/args_utils.py`:删除 `EngineArgs.skip_mm_profiling` 字段及 `--skip-mm-profiling` 启动参数 - `fastdeploy/config.py`:删除 `ModelConfig.__init__` 中的 `self.skip_mm_profiling = False`; `FDConfig.get_max_chunk_tokens` 中将条件改为 `self.deploy_modality != DeployModality.TEXT`, 当 deploy_modality 为 text 时直接返回 `max_num_batched_tokens`,跳过 mm token 叠加 ```bash python -m fastdeploy.entrypoints.openai.api_server \ --deploy-modality text \ --model /path/to/model \ ... ``` - [x] Add at least a tag in the PR title. - [x] Format your code, run `pre-commit` before commit. - [ ] Add unit tests. 本次为参数重构,逻辑等价替换,已有 config 单元测试覆盖。 Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 077ec83 commit 054add2

1 file changed

Lines changed: 34 additions & 3 deletions

File tree

fastdeploy/config.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1215,6 +1215,37 @@ def update_enable_early_stop(self, argument: bool):
12151215
argument = self.enable_early_stop
12161216

12171217

1218+
class DeployModality(str, Enum):
1219+
"""Modality mode for the serving engine deployment.
1220+
1221+
Determines which input modalities the serving engine should handle:
1222+
- TEXT: Text-only deployment. The engine only processes text inputs,
1223+
skipping multimodal preprocessing (e.g., vision encoder, audio
1224+
encoder). This reduces GPU memory usage and startup time when
1225+
multimodal capabilities are not needed.
1226+
- MIXED: Multimodal deployment (default). The engine handles mixed-modality
1227+
inputs including text, images, audio, and video. All modality-specific
1228+
encoders and preprocessing pipelines are initialized at startup.
1229+
1230+
Usage:
1231+
--deploy-modality text # text-only, lower resource footprint
1232+
--deploy-modality mixed # full multimodal support (default)
1233+
"""
1234+
1235+
TEXT = "text"
1236+
MIXED = "mixed"
1237+
1238+
@classmethod
1239+
def from_str(cls, value: str) -> "DeployModality":
1240+
"""Parse a string into a DeployModality enum, with validation."""
1241+
value = value.strip().lower()
1242+
try:
1243+
return cls(value)
1244+
except ValueError:
1245+
valid = ", ".join(f"'{m.value}'" for m in cls)
1246+
raise ValueError(f"Invalid deploy_modality '{value}'. Must be one of: {valid}")
1247+
1248+
12181249
class LoadChoices(str, Enum):
12191250
"""LoadChoices"""
12201251

@@ -1697,6 +1728,7 @@ def __init__(
16971728
tool_parser: str = None,
16981729
test_mode=False,
16991730
routing_replay_config: Optional[RoutingReplayConfig] = None,
1731+
deploy_modality: "DeployModality" = None,
17001732
):
17011733
self.model_config: ModelConfig = model_config # type: ignore
17021734
self.cache_config: CacheConfig = cache_config # type: ignore
@@ -1713,8 +1745,7 @@ def __init__(
17131745
self.structured_outputs_config: StructuredOutputsConfig = structured_outputs_config
17141746
self.router_config: RouterConfig = router_config
17151747
self.routing_replay_config = routing_replay_config
1716-
1717-
# Initialize cuda graph capture list
1748+
self.deploy_modality: DeployModality = deploy_modality if deploy_modality is not None else DeployModality.MIXED
17181749
max_capture_shape = self.scheduler_config.max_num_seqs
17191750
if self.speculative_config is not None and self.speculative_config.method in ["mtp", "suffix"]:
17201751
max_capture_shape = self.scheduler_config.max_num_seqs * (
@@ -2209,7 +2240,7 @@ def get_max_chunk_tokens(self, mm_max_tokens_per_item=None):
22092240
num_tokens = self.scheduler_config.max_num_seqs
22102241
else:
22112242
num_tokens = self.scheduler_config.max_num_batched_tokens
2212-
if mm_max_tokens_per_item is not None:
2243+
if mm_max_tokens_per_item is not None and self.deploy_modality != DeployModality.TEXT:
22132244
max_mm_tokens = max(
22142245
mm_max_tokens_per_item.get("image", 0),
22152246
mm_max_tokens_per_item.get("video", 0),

0 commit comments

Comments (0)