import sys
from types import ModuleType, SimpleNamespace

from lmdeploy.cli import serve as serve_module
from lmdeploy.messages import TurbomindEngineConfig


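# These tests drive SubCliServe.api_server with a fully populated argparse-style
# namespace and a stubbed `lmdeploy.serve.openai.api_server` module, so the
# arguments forwarded to `serve()` (including the TurbomindEngineConfig) can be
# inspected without starting a real server.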
def _make_api_server_args(**overrides):
    """Build a SimpleNamespace mimicking the parsed `lmdeploy serve api_server`
    CLI arguments, with any keyword overrides applied on top of the defaults."""
    args = dict(model_path='QuantTrio/Qwen3.5-27B-AWQ',
                model_name='qwen35-awq',
                backend='turbomind',
                dtype='auto',
                tp=2,
                dp=1,
                ep=1,
                cp=1,
                nnodes=1,
                node_rank=0,
                dist_init_addr=None,
                max_batch_size=None,
                session_len=4096,
                model_format='awq',
                quant_policy=8,
                rope_scaling_factor=0.0,
                cache_max_entry_count=0.8,
                cache_block_seq_len=64,
                enable_prefix_caching=True,
                linear_prefix_cache_interval_blocks=4,
                max_prefill_token_num=8192,
                num_tokens_per_iter=0,
                max_prefill_iters=1,
                async_=1,
                communicator='nccl',
                disable_metrics=False,
                adapters=None,
                device='cuda',
                eager_mode=False,
                disable_vision_encoder=False,
                logprobs_mode='raw_logits',
                dllm_block_length=64,
                dllm_unmasking_strategy='low_confidence_dynamic',
                dllm_denoising_steps=0,
                dllm_confidence_threshold=0.0,
                enable_return_routed_experts=False,
                distributed_executor_backend=None,
                chat_template=None,
                vision_max_batch_size=1,
                server_name='127.0.0.1',
                server_port=23333,
                allow_origins=['*'],
                allow_credentials=False,
                allow_methods=['*'],
                allow_headers=['*'],
                allow_terminate_by_client=False,
                enable_abort_handling=False,
                log_level='info',
                api_keys=None,
                ssl=None,
                proxy_url=None,
                max_log_len=None,
                disable_fastapi_docs=False,
                max_concurrent_requests=None,
                reasoning_parser='qwen-qwq',
                tool_call_parser='qwen3coder',
                hf_overrides=None)
    args.update(overrides)
    return SimpleNamespace(**args)


def test_api_server_turbomind_forwards_hybrid_prefix_cache_options(monkeypatch):
    """The CLI should forward the prefix-cache flags into the
    TurbomindEngineConfig handed to `api_server.serve`."""
    captured = {}
    fake_api_server = ModuleType('lmdeploy.serve.openai.api_server')

    def fake_serve(model_path, **kwargs):
        captured['model_path'] = model_path
        captured.update(kwargs)

    fake_api_server.serve = fake_serve

    monkeypatch.setitem(sys.modules, 'lmdeploy.serve.openai.api_server', fake_api_server)
    monkeypatch.setattr('lmdeploy.archs.autoget_backend', lambda _: 'turbomind')
    monkeypatch.setattr(serve_module, 'get_max_batch_size', lambda device: 13)
    monkeypatch.setattr(serve_module, 'get_chat_template', lambda *_: None)
    monkeypatch.setattr(serve_module, 'get_speculative_config', lambda _: None)

    serve_module.SubCliServe.api_server(_make_api_server_args())

    assert captured['backend'] == 'turbomind'
    assert captured['model_path'] == 'QuantTrio/Qwen3.5-27B-AWQ'
    assert isinstance(captured['backend_config'], TurbomindEngineConfig)
    assert captured['backend_config'].enable_prefix_caching is True
    assert captured['backend_config'].linear_prefix_cache_interval_blocks == 4


def test_api_server_turbomind_uses_default_cuda_batch_size(monkeypatch):
    """With `--max-batch-size` left unset, the CLI should fall back to the
    device default reported by `get_max_batch_size`."""
    captured = {}
    fake_api_server = ModuleType('lmdeploy.serve.openai.api_server')

    def fake_serve(model_path, **kwargs):
        captured['model_path'] = model_path
        captured.update(kwargs)

    fake_api_server.serve = fake_serve

    monkeypatch.setitem(sys.modules, 'lmdeploy.serve.openai.api_server', fake_api_server)
    monkeypatch.setattr('lmdeploy.archs.autoget_backend', lambda _: 'turbomind')
    monkeypatch.setattr(serve_module, 'get_max_batch_size', lambda device: 7)
    monkeypatch.setattr(serve_module, 'get_chat_template', lambda *_: None)
    monkeypatch.setattr(serve_module, 'get_speculative_config', lambda _: None)

    serve_module.SubCliServe.api_server(_make_api_server_args(max_batch_size=None))

    assert captured['backend_config'].max_batch_size == 7
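

# A possible follow-up (sketch only, not part of this change): both tests repeat
# the same stub-and-patch setup, which a shared pytest fixture could deduplicate.
# The fixture name `stub_api_server` is hypothetical.
import pytest


@pytest.fixture
def stub_api_server(monkeypatch):
    """Install a fake `lmdeploy.serve.openai.api_server.serve` and return the
    dict it fills with the arguments forwarded by the CLI."""
    captured = {}
    fake = ModuleType('lmdeploy.serve.openai.api_server')

    def fake_serve(model_path, **kwargs):
        captured['model_path'] = model_path
        captured.update(kwargs)

    fake.serve = fake_serve
    monkeypatch.setitem(sys.modules, 'lmdeploy.serve.openai.api_server', fake)
    monkeypatch.setattr('lmdeploy.archs.autoget_backend', lambda _: 'turbomind')
    monkeypatch.setattr(serve_module, 'get_chat_template', lambda *_: None)
    monkeypatch.setattr(serve_module, 'get_speculative_config', lambda _: None)
    return captured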