
Commit 9a50f1f

bump version to v0.12.2 (#4378)
* bump version to v0.12.2
* rename
* fix typo
* fix
* fix typo in llm_compressor.md
* use logger.exception
* logger.debug num_outputs
* warning to info
* remove role checker
* update doc
1 parent a30b976 commit 9a50f1f

16 files changed

Lines changed: 19 additions & 17 deletions


File renamed without changes.

README.md

Lines changed: 2 additions & 2 deletions
````diff
@@ -176,7 +176,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
 <li>Qwen2-VL (2B, 7B, 72B)</li>
 <li>Qwen2.5-VL (3B, 7B, 72B)</li>
 <li>Qwen3-VL (2B - 235B)</li>
-<li>Qwen3.5</li>
+<li>Qwen3.5 (0.8B - 397B)</li>
 <li>DeepSeek-VL (7B)</li>
 <li>DeepSeek-VL2 (3B, 16B, 27B)</li>
 <li>InternVL-Chat (v1.1-v1.5)</li>
@@ -228,7 +228,7 @@ The default prebuilt package is compiled on **CUDA 12** since v0.3.0.
 For the GeForce RTX 50 series, please install the LMDeploy prebuilt package complied with **CUDA 12.8**
 
 ```shell
-export LMDEPLOY_VERSION=0.12.1
+export LMDEPLOY_VERSION=0.12.2
 export PYTHON_VERSION=310
 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu128-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu128
 ```
````
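After the bump, a quick sanity check confirms the wheel actually installed the new release. A minimal sketch, assuming the prebuilt wheel above was installed into the current environment and that the version string is exposed as `lmdeploy.__version__`:

```shell
# Confirm the installed LMDeploy release matches the bumped version
python -c "import lmdeploy; print(lmdeploy.__version__)"

# Optional: fail loudly if the environment still holds an older wheel
EXPECTED=0.12.2
INSTALLED=$(python -c "import lmdeploy; print(lmdeploy.__version__)")
if [ "$INSTALLED" != "$EXPECTED" ]; then
  echo "Expected lmdeploy $EXPECTED but found $INSTALLED" >&2
fi
```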

README_ja.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -155,7 +155,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
 <li>Qwen2-VL (2B, 7B, 72B)</li>
 <li>Qwen2.5-VL (3B, 7B, 72B)</li>
 <li>Qwen3-VL (2B - 235B)</li>
-<li>Qwen3.5</li>
+<li>Qwen3.5 (0.8B - 397B)</li>
 <li>DeepSeek-VL (7B)</li>
 <li>DeepSeek-VL2 (3B, 16B, 27B)</li>
 <li>InternVL-Chat (v1.1-v1.5)</li>
````

README_zh-CN.md

Lines changed: 2 additions & 2 deletions
````diff
@@ -178,7 +178,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
 <li>Qwen2-VL (2B, 7B, 72B)</li>
 <li>Qwen2.5-VL (3B, 7B, 72B)</li>
 <li>Qwen3-VL (2B - 235B)</li>
-<li>Qwen3.5</li>
+<li>Qwen3.5 (0.8B - 397B)</li>
 <li>DeepSeek-VL (7B)</li>
 <li>DeepSeek-VL2 (3B, 16B, 27B)</li>
 <li>InternVL-Chat (v1.1-v1.5)</li>
@@ -230,7 +230,7 @@ pip install lmdeploy
 若使用 GeForce RTX 50 系列显卡,请安装基于 **CUDA 12.8** 编译的 LMDeploy 预编译包。
 
 ```shell
-export LMDEPLOY_VERSION=0.12.1
+export LMDEPLOY_VERSION=0.12.2
 export PYTHON_VERSION=310
 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu128-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu128
 ```
````

docs/en/get_started/installation.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -23,7 +23,7 @@ pip install lmdeploy
 The default prebuilt package is compiled on **CUDA 12**. If CUDA 11+ (>=11.3) is required, you can install lmdeploy by:
 
 ```shell
-export LMDEPLOY_VERSION=0.12.1
+export LMDEPLOY_VERSION=0.12.2
 export PYTHON_VERSION=310
 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 ```
````
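The cu118 wheel only helps if the local CUDA stack matches it. A minimal before/after check, assuming `nvidia-smi` is on the PATH and that PyTorch was pulled from the cu118 extra index above:

```shell
# Driver version reported by the NVIDIA driver (must support CUDA >= 11.3 for this wheel)
nvidia-smi --query-gpu=driver_version --format=csv,noheader

# After installation, the CUDA build PyTorch was compiled against should report 11.8
python -c "import torch; print(torch.version.cuda)"
```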

docs/en/quantization/llm_compressor.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -44,7 +44,7 @@ conda create -n lmdeploy python=3.10 -y
 conda activate lmdeploy
 
 # Install llm-compressor
-pip install llm-compressor
+pip install llmcompressor
 
 # Clone lmdeploy source code and run the quantization example
 git clone https://github.com/InternLM/lmdeploy
````
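The corrected command uses the package's actual PyPI distribution name, `llmcompressor`. A minimal sketch to verify the install; the `__version__` attribute is an assumption, hence the guarded lookup:

```shell
# Install the correctly named distribution and confirm it imports;
# the __version__ attribute is assumed and guarded with a fallback.
pip install llmcompressor
python -c "import llmcompressor; print(getattr(llmcompressor, '__version__', 'import ok'))"
```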

docs/en/supported_models/supported_models.md

Lines changed: 3 additions & 1 deletion
````diff
@@ -25,6 +25,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 | Qwen2-MoE | 57BA14B | LLM | Yes | Yes | Yes | Yes |
 | Qwen2.5<sup>\[2\]</sup> | 0.5B - 72B | LLM | Yes | Yes\* | Yes\* | Yes |
 | Qwen3 | 0.6B-235B | LLM | Yes | Yes | Yes\* | Yes\* |
+| Qwen3.5<sup>\[3\]</sup> | 0.8B-397B | MLLM | Yes | Yes | No | Yes |
 | Mistral<sup>\[1\]</sup> | 7B | LLM | Yes | Yes | Yes | No |
 | Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes |
 | DeepSeek-V2 | 16B, 236B | LLM | Yes | Yes | Yes | No |
@@ -54,6 +55,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 ```{note}
 * [1] The TurboMind engine doesn't support window attention. Therefore, for models that have applied window attention and have the corresponding switch "use_sliding_window" enabled, such as Mistral, Qwen1.5 and etc., please choose the PyTorch engine for inference.
 * [2] When the head_dim of a model is not 128, such as llama3.2-1B, qwen2-0.5B and internvl2-1B, turbomind doesn't support its kv cache 4/8 bit quantization and inference
+* [3] TurboMind does not currently support the vision encoder for the Qwen3.5 series.
 ```
 
 ## PyTorchEngine on CUDA Platform
@@ -89,7 +91,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 | QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | Yes |
 | QWen2.5-VL | 3B - 72B | MLLM | Yes | No | No | No | No |
 | QWen3-VL | 2B - 235B | MLLM | Yes | No | No | No | No |
-| QWen3.5 | 27B-397B | MLLM | Yes | No | No | No | No |
+| QWen3.5 | 0.8B-397B | MLLM | Yes | No | No | No | No |
 | DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No |
 | DeepSeek-V2 | 16B, 236B | LLM | Yes | No | No | No | No |
 | DeepSeek-V2.5 | 236B | LLM | Yes | No | No | No | No |
````
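With Qwen3.5 added to the PyTorchEngine table (and note [3] marking its vision encoder as unsupported in TurboMind), one way to try such a checkpoint is the PyTorch backend. A minimal launch sketch; the model repository path is a placeholder, not one confirmed by this commit:

```shell
# Serve a Qwen3.5 checkpoint with the PyTorch engine.
# "Qwen/Qwen3.5-Instruct" is a hypothetical placeholder; substitute a real repository or local path.
lmdeploy serve api_server Qwen/Qwen3.5-Instruct --backend pytorch --server-port 23333
```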

docs/zh_cn/get_started/installation.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -23,7 +23,7 @@ pip install lmdeploy
 默认的预构建包是在 **CUDA 12** 上编译的。如果需要 CUDA 11+ (>=11.3),你可以使用以下命令安装 lmdeploy:
 
 ```shell
-export LMDEPLOY_VERSION=0.12.1
+export LMDEPLOY_VERSION=0.12.2
 export PYTHON_VERSION=310
 pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 ```
````

0 commit comments
