
Commit 7f9a9b3

K11OntheBoat authored
Support limit thinking lengths (#4070)
Co-authored-by: K11OntheBoat <ruianmaidanglao@163.com>
1 parent b41988f commit 7f9a9b3

8 files changed: 184 additions & 26 deletions

fastdeploy/config.py

Lines changed: 1 addition & 0 deletions
@@ -130,6 +130,7 @@ def __init__(
         self.quantization = None
         self.pad_token_id: int = -1
         self.eos_tokens_lens: int = 2
+        self.think_end_id = None
         self.lm_head_fp32: bool = False
         self.model_format = "auto"
         self.partial_rotary_factor: float = 1.0
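
A note on the new field: think_end_id is initialized to None as a sentinel meaning "no </think> token known for this model". It is only populated later by update_think_end_id_for_ernie() in worker_process.py (below), and every consumer must treat None as "thinking unavailable". A minimal, hypothetical sketch of that contract (only the field name comes from this diff; everything else is illustrative):

    # Hypothetical sketch of the None-sentinel contract; only think_end_id
    # is real, the rest of ModelConfig is omitted.
    class ModelConfigSketch:
        def __init__(self) -> None:
            self.think_end_id = None  # None => vocabulary has no </think> token

    def can_do_thinking(cfg: ModelConfigSketch) -> bool:
        # Mirrors the guard added in post_process_normal() below: thinking
        # post-processing must be skipped when no </think> id is available.
        return cfg.think_end_id is not None

    cfg = ModelConfigSketch()
    assert can_do_thinking(cfg) is False
    cfg.think_end_id = 100280  # illustrative id; the real one comes from the vocab
    assert can_do_thinking(cfg) is True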

fastdeploy/entrypoints/engine_client.py

Lines changed: 0 additions & 2 deletions
@@ -177,8 +177,6 @@ async def add_requests(self, task):
         task["prompt_token_ids_len"] = len(task["prompt_token_ids"])
         input_ids_len = task["prompt_token_ids_len"]
         task["max_tokens"] = min(self.max_model_len - input_ids_len, task.get("max_tokens"))
-        if task.get("reasoning_max_tokens", None) is None:
-            task["reasoning_max_tokens"] = max(int(task["max_tokens"] * 0.8), 1)
         min_tokens = task.get("min_tokens", 1)
         if "messages" in task:
             del task["messages"]
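
The two deleted lines removed the engine client's blanket default of reasoning_max_tokens = 80% of max_tokens. That default now lives in the ERNIE 4.5 VL processor (next file), which keeps the client model-agnostic and lets requests without a reasoning budget disable thinking in the runner. A rough before/after sketch, with hypothetical helper names standing in for the real call sites:

    # Rough sketch (hypothetical helper names) of where the default moved.
    def engine_client_add_request(task: dict, max_model_len: int) -> None:
        # max_tokens is assumed to be present by this point, as in the source.
        input_ids_len = len(task["prompt_token_ids"])
        task["max_tokens"] = min(max_model_len - input_ids_len, task["max_tokens"])
        # No reasoning_max_tokens default here anymore.

    def vl_processor_defaults(request: dict) -> None:
        # Applied downstream, and only by processors that opt in.
        if request.get("reasoning_max_tokens") is None:
            request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)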

fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py

Lines changed: 4 additions & 0 deletions
@@ -255,6 +255,10 @@ def process_request_dict(self, request, max_model_len=None):
             request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
         if request.get("max_tokens") is None:
             request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"]))
+        else:
+            request["max_tokens"] = min(max_model_len - len(request["prompt_token_ids"]), request["max_tokens"])
+        if request.get("reasoning_max_tokens") is None:
+            request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
         data_processor_logger.info(f"Processed request {request}")

         return request
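
Worked example of the budgeting above, assuming max_model_len = 1024 and a 224-token prompt: an explicit max_tokens=2000 is capped to 1024 - 224 = 800, and an absent reasoning_max_tokens then defaults to max(int(800 * 0.8), 1) = 640. A self-contained sketch of just this arithmetic:

    # Self-contained sketch of the capping/defaulting arithmetic added above.
    def apply_token_budgets(request: dict, max_model_len: int) -> dict:
        remaining = max_model_len - len(request["prompt_token_ids"])
        if request.get("max_tokens") is None:
            request["max_tokens"] = max(1, remaining)  # default: all that fits
        else:
            request["max_tokens"] = min(remaining, request["max_tokens"])  # cap
        if request.get("reasoning_max_tokens") is None:
            # Thinking budget defaults to 80% of the final answer budget, >= 1.
            request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
        return request

    req = {"prompt_token_ids": list(range(224)), "max_tokens": 2000}
    req = apply_token_budgets(req, max_model_len=1024)
    assert req["max_tokens"] == 800 and req["reasoning_max_tokens"] == 640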

fastdeploy/model_executor/pre_and_post_process.py

Lines changed: 1 addition & 1 deletion
@@ -166,7 +166,7 @@ def post_process_normal(
 ) -> ModelRunnerOutput:
     """Post-processing steps after completing a single token generation."""
     # handle vl:
-    if model_output.enable_thinking:
+    if model_output.enable_thinking and model_output.think_end_id is not None:
         exists_think_end = sampler_output.sampled_token_ids == model_output.think_end_id
         paddle.assign(
             paddle.where(
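
The added think_end_id is not None guard matters because enable_thinking is now a buffer that exists for every model (see gpu_model_runner.py below), while think_end_id stays None when the vocabulary lacks a </think> token; without the guard, the equality test would compare sampled token ids against None. A minimal paddle sketch of the detection pattern (tensor contents are illustrative, not from the source):

    import paddle

    # Illustrative shapes: batch of 4 sequences, one sampled token each.
    THINK_END_ID = 7  # stand-in for model_config.think_end_id
    sampled_token_ids = paddle.to_tensor([[7], [3], [7], [7]], dtype="int64")
    need_think_end = paddle.to_tensor([[1], [1], [0], [1]], dtype="int32")

    # Which running requests just emitted </think> while still in thinking mode?
    exists_think_end = (sampled_token_ids == THINK_END_ID) & (need_think_end > 0)
    print(exists_think_end.numpy().flatten())  # [ True False False  True]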

fastdeploy/worker/gpu_model_runner.py

Lines changed: 35 additions & 23 deletions
@@ -265,15 +265,21 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
             else:
                 position_ids = None

-            enable_thinking = request.get("enable_thinking", True)
-            enable_thinking = enable_thinking if enable_thinking is not None else True
-            self.share_inputs["enable_thinking"][:] = enable_thinking
-            self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0
-            self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048)
             self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
                 position_ids, request.get("max_tokens", 2048)
             )

+            if request.get("enable_thinking", False) and request.get("reasoning_max_tokens") is not None:
+                # Enable thinking
+                self.share_inputs["enable_thinking"][:] = True
+                self.share_inputs["need_think_end"][idx : idx + 1, :] = 1
+                self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens")
+            else:
+                # Disable thinking
+                self.share_inputs["enable_thinking"][:] = False
+                self.share_inputs["need_think_end"][idx : idx + 1, :] = 0
+                self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0
+
             if isinstance(request.prompt_token_ids, np.ndarray):
                 prompt_token_ids = request.prompt_token_ids.tolist()
             else:
@@ -495,16 +501,22 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests:
             self.share_inputs["prompt_lens"][idx : idx + 1] = length

             if self.enable_mm:
-                enable_thinking = request.get("enable_thinking", True)
-                enable_thinking = enable_thinking if enable_thinking is not None else True
-                self.share_inputs["enable_thinking"][:] = enable_thinking
-                self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0
-                self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048)
                 self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
                     position_ids, request.get("max_tokens", 2048)
                 )
                 self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0

+            if request.get("enable_thinking", False) and request.get("reasoning_max_tokens") is not None:
+                # Enable thinking
+                self.share_inputs["enable_thinking"][:] = True
+                self.share_inputs["need_think_end"][idx : idx + 1, :] = 1
+                self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens")
+            else:
+                # Disable thinking
+                self.share_inputs["enable_thinking"][:] = False
+                self.share_inputs["need_think_end"][idx : idx + 1, :] = 0
+                self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0
+
         def get_attr_from_request(request, attr, default_value=None):
             res = request.get(attr, default_value)
             if res is not None:
@@ -735,6 +747,11 @@ def _init_share_inputs(self, max_num_seqs: int):
         # Initialize rotary position embedding
        tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1))

+        # Initialize thinking related buffers
+        self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")
+        self.share_inputs["enable_thinking"] = paddle.full(shape=[1], fill_value=False, dtype="bool")
+        self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")
+
         # TODO(gongshaotian): move to models
         if not self.enable_mm:
             self.share_inputs["rope_emb"] = get_rope(
@@ -827,11 +844,6 @@ def _init_share_inputs(self, max_num_seqs: int):
                 dtype="float32",
             )
             self.share_inputs["image_features"] = None
-            self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")
-            self.share_inputs["enable_thinking"] = paddle.full(
-                shape=[1], fill_value=("ernie" in self.model_config.model_type), dtype="bool"
-            )
-            self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")

     def _prepare_inputs(self) -> None:
         """Prepare the model inputs"""
@@ -1220,10 +1232,10 @@ def _dummy_run(
             ),
             accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
             accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
-            enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None),
-            think_end_id=(getattr(self.model_config, "think_end_id", -1) if self.enable_mm else -1),
-            need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None),
-            reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None),
+            enable_thinking=self.share_inputs["enable_thinking"],
+            think_end_id=self.model_config.think_end_id,
+            need_think_end=self.share_inputs["need_think_end"],
+            reasoning_index=self.share_inputs["reasoning_index"],
             stop_token_ids=self.share_inputs["stop_seqs"],
             stop_seqs_len=self.share_inputs["stop_seqs_len"],
         )
@@ -1515,10 +1527,10 @@ class at the server level, which is too granular for ModelRunner.
             ),
             accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
             accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
-            enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None),
-            think_end_id=(getattr(self.model_config, "think_end_id", -1) if self.enable_mm else -1),
-            need_think_end=(self.share_inputs["need_think_end"][:num_running_requests] if self.enable_mm else None),
-            reasoning_index=(self.share_inputs["reasoning_index"][:num_running_requests] if self.enable_mm else None),
+            enable_thinking=self.share_inputs["enable_thinking"],
+            think_end_id=self.model_config.think_end_id,
+            need_think_end=self.share_inputs["need_think_end"][:num_running_requests],
+            reasoning_index=self.share_inputs["reasoning_index"][:num_running_requests],
             stop_token_ids=self.share_inputs["stop_seqs"],
             stop_seqs_len=self.share_inputs["stop_seqs_len"],
         )
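
Net effect of the runner changes: thinking is opt-in per request and requires both the enable_thinking flag and an explicit-or-defaulted reasoning_max_tokens, instead of defaulting to on (with a 2048-token budget) for multimodal models. A small sketch of the gating condition shared by both insert paths:

    # Sketch of the per-request gating added in insert_tasks_v1 /
    # insert_prefill_inputs; returns the values written into
    # (enable_thinking, need_think_end, reasoning_index).
    def thinking_buffers(request: dict) -> tuple[bool, int, int]:
        if request.get("enable_thinking", False) and request.get("reasoning_max_tokens") is not None:
            return True, 1, request["reasoning_max_tokens"]
        return False, 0, 0

    assert thinking_buffers({"enable_thinking": True, "reasoning_max_tokens": 512}) == (True, 1, 512)
    assert thinking_buffers({"enable_thinking": True}) == (False, 0, 0)     # no budget => off
    assert thinking_buffers({"reasoning_max_tokens": 512}) == (False, 0, 0)  # not requested => off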

fastdeploy/worker/worker_process.py

Lines changed: 23 additions & 0 deletions
@@ -129,6 +129,28 @@ def update_fd_config_for_mm(fd_config: FDConfig) -> None:
     fd_config.model_config.sequence_parallel = fd_config.parallel_config.sequence_parallel


+def update_think_end_id_for_ernie(fd_config: FDConfig) -> None:
+    """
+    Updates the think_end_id in the model config. Uses the ID of '</think>'
+    if it exists, otherwise defaults to None.
+    """
+    is_ernie = ErnieArchitectures.contains_ernie_arch(fd_config.model_config.architectures)
+    if current_platform.is_cuda() and is_ernie:
+        tokenizer = Ernie4_5Tokenizer.from_pretrained(
+            fd_config.model_config.model,
+            model_max_length=fd_config.parallel_config.max_model_len,
+            padding_side="right",
+            use_fast=False,
+        )
+
+        vocab = tokenizer.get_vocab()
+        fd_config.model_config.think_end_id = vocab.get("</think>", None)
+        if fd_config.model_config.think_end_id is not None:
+            logger.info(f"Get think_end_id {fd_config.model_config.think_end_id} from vocab.")
+        else:
+            logger.info("No </think> token found in vocabulary, the model can not do reasoning.")
+
+
 class PaddleDisWorkerProc:
     """
     Paddle Distributed wrapper for fastdeploy.worker.Worker,
@@ -771,6 +793,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
         plas_attention_config=plas_attention_config,
     )
     update_fd_config_for_mm(fd_config)
+    update_think_end_id_for_ernie(fd_config)

     return fd_config
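
The lookup itself is a plain vocabulary query. A standalone sketch of the equivalent logic, assuming only that the tokenizer exposes get_vocab() -> dict[str, int], as Ernie4_5Tokenizer does above:

    from typing import Optional

    # Standalone sketch of the </think> lookup; any tokenizer exposing
    # get_vocab() -> dict[str, int] would behave the same way.
    def find_think_end_id(tokenizer) -> Optional[int]:
        vocab = tokenizer.get_vocab()
        think_end_id = vocab.get("</think>")  # None when the token is absent
        if think_end_id is None:
            print("No </think> token in vocabulary; thinking stays disabled.")
        return think_end_id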

tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py

Lines changed: 60 additions & 0 deletions
@@ -580,3 +580,63 @@ def test_profile_reset_block_num():
         f"Reset total_block_num {actual_value} and baseline {baseline} must differ by no more than 5%. "
         f"Allowed range: [{lower_bound:.1f}, {upper_bound:.1f}]"
     )
+
+
+def test_thinking_logic_flag(openai_client, capsys):
+    """
+    Test the interaction between token calculation logic and conditional thinking.
+    This test covers:
+    1. Default max_tokens calculation when not provided.
+    2. Capping of max_tokens when it exceeds model limits.
+    3. Default reasoning_max_tokens calculation when not provided.
+    4. Activation of thinking based on the final state of reasoning_max_tokens.
+    """
+
+    response_case_1 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity briefly."}],
+        temperature=1,
+        stream=False,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+        },
+    )
+    assert response_case_1.choices[0].message.reasoning_content is not None
+
+    response_case_2 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+            "reasoning_max_tokens": 5,
+        },
+    )
+    assert response_case_2.choices[0].message.reasoning_content is not None
+
+    response_case_3 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+            "reasoning_max_tokens": None,
+        },
+    )
+    assert response_case_3.choices[0].message.reasoning_content is not None
+
+    response_case_4 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": False},
+        },
+    )
+    assert response_case_4.choices[0].message.reasoning_content is None
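
Reading the four cases together: cases 1-3 expect reasoning_content because thinking was requested and a budget is always in place by run time (explicit in case 2, the processor's 80% default in cases 1 and 3, even when the client sends reasoning_max_tokens: None); case 4 expects None because thinking was never requested. An informal restatement of that expectation (not server code):

    # Informal restatement of what the four cases assert about the server.
    def expects_reasoning_content(enable_thinking: bool, reasoning_max_tokens=None) -> bool:
        # reasoning_max_tokens deliberately does not change the outcome: the
        # processor defaults it to 80% of max_tokens whenever it is None, so
        # only the enable_thinking flag decides whether reasoning happens.
        del reasoning_max_tokens
        return enable_thinking

    assert expects_reasoning_content(True, None) is True   # case 3
    assert expects_reasoning_content(True, 5) is True      # case 2
    assert expects_reasoning_content(False) is False       # case 4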

tests/e2e/test_EB_VL_Lite_serving.py

Lines changed: 60 additions & 0 deletions
@@ -592,3 +592,63 @@ def test_profile_reset_block_num():
         f"Reset total_block_num {actual_value} and baseline {baseline} must differ by no more than 5%. "
         f"Allowed range: [{lower_bound:.1f}, {upper_bound:.1f}]"
     )
+
+
+def test_thinking_logic_flag(openai_client, capsys):
+    """
+    Test the interaction between token calculation logic and conditional thinking.
+    This test covers:
+    1. Default max_tokens calculation when not provided.
+    2. Capping of max_tokens when it exceeds model limits.
+    3. Default reasoning_max_tokens calculation when not provided.
+    4. Activation of thinking based on the final state of reasoning_max_tokens.
+    """
+
+    response_case_1 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity briefly."}],
+        temperature=1,
+        stream=False,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+        },
+    )
+    assert response_case_1.choices[0].message.reasoning_content is not None
+
+    response_case_2 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+            "reasoning_max_tokens": 5,
+        },
+    )
+    assert response_case_2.choices[0].message.reasoning_content is not None
+
+    response_case_3 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+            "reasoning_max_tokens": None,
+        },
+    )
+    assert response_case_3.choices[0].message.reasoning_content is not None
+
+    response_case_4 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": False},
+        },
+    )
+    assert response_case_4.choices[0].message.reasoning_content is None
