Skip to content

Commit 6c38cf9

Browse files
committed
Add tests for hybrid prefix cache config plumbing
1 parent 0397f35 commit 6c38cf9

2 files changed

Lines changed: 132 additions & 0 deletions

File tree

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
import sys
2+
from types import ModuleType, SimpleNamespace
3+
4+
from lmdeploy.cli import serve as serve_module
5+
from lmdeploy.messages import TurbomindEngineConfig
6+
7+
8+
def _make_api_server_args(**overrides):
9+
args = dict(model_path='QuantTrio/Qwen3.5-27B-AWQ',
10+
model_name='qwen35-awq',
11+
backend='turbomind',
12+
dtype='auto',
13+
tp=2,
14+
dp=1,
15+
ep=1,
16+
cp=1,
17+
nnodes=1,
18+
node_rank=0,
19+
dist_init_addr=None,
20+
max_batch_size=None,
21+
session_len=4096,
22+
model_format='awq',
23+
quant_policy=8,
24+
rope_scaling_factor=0.0,
25+
cache_max_entry_count=0.8,
26+
cache_block_seq_len=64,
27+
enable_prefix_caching=True,
28+
linear_prefix_cache_interval_blocks=4,
29+
max_prefill_token_num=8192,
30+
num_tokens_per_iter=0,
31+
max_prefill_iters=1,
32+
async_=1,
33+
communicator='nccl',
34+
disable_metrics=False,
35+
adapters=None,
36+
device='cuda',
37+
eager_mode=False,
38+
disable_vision_encoder=False,
39+
logprobs_mode='raw_logits',
40+
dllm_block_length=64,
41+
dllm_unmasking_strategy='low_confidence_dynamic',
42+
dllm_denoising_steps=0,
43+
dllm_confidence_threshold=0.0,
44+
enable_return_routed_experts=False,
45+
distributed_executor_backend=None,
46+
chat_template=None,
47+
vision_max_batch_size=1,
48+
server_name='127.0.0.1',
49+
server_port=23333,
50+
allow_origins=['*'],
51+
allow_credentials=False,
52+
allow_methods=['*'],
53+
allow_headers=['*'],
54+
allow_terminate_by_client=False,
55+
enable_abort_handling=False,
56+
log_level='info',
57+
api_keys=None,
58+
ssl=None,
59+
proxy_url=None,
60+
max_log_len=None,
61+
disable_fastapi_docs=False,
62+
max_concurrent_requests=None,
63+
reasoning_parser='qwen-qwq',
64+
tool_call_parser='qwen3coder',
65+
hf_overrides=None)
66+
args.update(overrides)
67+
return SimpleNamespace(**args)
68+
69+
70+
def test_api_server_turbomind_forwards_hybrid_prefix_cache_options(monkeypatch):
    """The serve CLI must forward the hybrid prefix-cache flags into the
    TurbomindEngineConfig it hands to the OpenAI api_server entry point."""
    recorded = {}

    # Stub out the real api_server module so api_server() hits our recorder
    # instead of actually binding a port and launching the engine.
    stub_module = ModuleType('lmdeploy.serve.openai.api_server')

    def record_serve(model_path, **kwargs):
        recorded['model_path'] = model_path
        recorded.update(kwargs)

    stub_module.serve = record_serve
    monkeypatch.setitem(sys.modules, 'lmdeploy.serve.openai.api_server', stub_module)

    # Pin backend autodetection and environment-dependent helpers.
    monkeypatch.setattr('lmdeploy.archs.autoget_backend', lambda _: 'turbomind')
    monkeypatch.setattr(serve_module, 'get_max_batch_size', lambda device: 13)
    monkeypatch.setattr(serve_module, 'get_chat_template', lambda *_: None)
    monkeypatch.setattr(serve_module, 'get_speculative_config', lambda _: None)

    serve_module.SubCliServe.api_server(_make_api_server_args())

    backend_config = recorded['backend_config']
    assert recorded['backend'] == 'turbomind'
    assert recorded['model_path'] == 'QuantTrio/Qwen3.5-27B-AWQ'
    assert isinstance(backend_config, TurbomindEngineConfig)
    assert backend_config.enable_prefix_caching is True
    assert backend_config.linear_prefix_cache_interval_blocks == 4
93+
94+
95+
def test_api_server_turbomind_uses_default_cuda_batch_size(monkeypatch):
    """When --max-batch-size is omitted, the CLI should fall back to the
    device-derived default reported by get_max_batch_size()."""
    recorded = {}

    # Replace the real api_server module with a recorder so no server starts.
    stub_module = ModuleType('lmdeploy.serve.openai.api_server')

    def record_serve(model_path, **kwargs):
        recorded['model_path'] = model_path
        recorded.update(kwargs)

    stub_module.serve = record_serve
    monkeypatch.setitem(sys.modules, 'lmdeploy.serve.openai.api_server', stub_module)

    # Pin backend autodetection; the device default batch size is forced to 7.
    monkeypatch.setattr('lmdeploy.archs.autoget_backend', lambda _: 'turbomind')
    monkeypatch.setattr(serve_module, 'get_max_batch_size', lambda device: 7)
    monkeypatch.setattr(serve_module, 'get_chat_template', lambda *_: None)
    monkeypatch.setattr(serve_module, 'get_speculative_config', lambda _: None)

    serve_module.SubCliServe.api_server(_make_api_server_args(max_batch_size=None))

    assert recorded['backend_config'].max_batch_size == 7
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import pytest
2+
from pydantic import ValidationError
3+
4+
from lmdeploy import TurbomindEngineConfig
5+
6+
7+
def test_linear_prefix_cache_interval_blocks_default():
    """With prefix caching enabled, the linear interval defaults to 2 blocks."""
    cfg = TurbomindEngineConfig(enable_prefix_caching=True)
    assert cfg.linear_prefix_cache_interval_blocks == 2
10+
11+
12+
def test_linear_prefix_cache_interval_blocks_validation():
    """A zero interval is rejected with a descriptive validation error."""
    with pytest.raises(ValidationError) as excinfo:
        TurbomindEngineConfig(linear_prefix_cache_interval_blocks=0)
    # match= uses re.search; the plain substring check is equivalent here.
    assert 'invalid linear_prefix_cache_interval_blocks' in str(excinfo.value)
15+
16+
17+
def test_linear_prefix_cache_interval_blocks_override():
    """An explicitly supplied interval overrides the default of 2."""
    cfg = TurbomindEngineConfig(enable_prefix_caching=True,
                                linear_prefix_cache_interval_blocks=4)
    assert cfg.linear_prefix_cache_interval_blocks == 4

0 commit comments

Comments
 (0)