Skip to content

Commit de4feff

Browse files
ltd0924, yuanlehome, gongshaotian
authored
[Feature]CP support data clear (#4214)
* Update serving_chat.py * Update serving_completion.py * Update serving_completion.py * mv connection_manager init * [BugFix] fix kv cache * fix format * [Feature] support clear data --------- Co-authored-by: Yuanle Liu <yuanlehome@163.com> Co-authored-by: RAM <gstian5555@outlook.com>
1 parent f38b174 commit de4feff

10 files changed

Lines changed: 65 additions & 0 deletions

File tree

fastdeploy/engine/common_engine.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,18 @@ def start_cache_service(self, device_ids, ipc_signal_suffix, create_cache_tensor
801801
def check_and_free_block_tables(self):
802802
self.resource_manager.check_and_free_block_tables()
803803

def clear_data(self):
    """Best-effort reset of all engine-side request state.

    Drains the token processor, the engine worker queue, and the ZMQ
    server's request dictionary in turn.

    Returns:
        bool: True when every step completed, False if any step raised
        (the error is logged and never propagated to the caller).
    """
    try:
        llm_logger.info("Clear Data: Start")
        # Clear each subsystem that may still hold per-request state.
        self.token_processor.clear_data()
        self.engine_worker_queue.clear_data()
        self.zmq_server.req_dict.clear()
        llm_logger.info("Clear Data: Successfully")
    except Exception as e:
        llm_logger.error(f"Clear data error: {e}")
        return False
    return True
804816
def _exit_sub_services(self):
805817
"""
806818
exit sub services

fastdeploy/engine/sched/resource_manager_v1.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,10 @@ def _free_blocks(self, request: Request):
512512
def finish_requests_async(self, request_ids: Union[str, Iterable[str]]):
513513
return self.finish_execution_pool.submit(self.finish_requests, request_ids)
514514

def clear_data(self):
    """Reset scheduler bookkeeping: drop every waiting request and the set
    of requests pending reschedule.
    """
    # Rebind fresh containers (the previous objects are not mutated in place).
    self.to_be_rescheduled_request_id_set = set()
    self.waiting: deque[Request] = deque()
515519
def finish_requests(self, request_ids: Union[str, Iterable[str]]):
516520
llm_logger.info(f"recycle resources for requests: {request_ids}")
517521
try:

fastdeploy/entrypoints/engine_client.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,9 @@ def create_zmq_client(self, model, mode):
141141
self.zmq_client = ZmqIpcClient(model, mode)
142142
self.zmq_client.connect()
143143

def check_model_weight_status(self):
    """Return True when the model-weight status signal is negative.

    A negative status value means the engine is clearing (or has cleared)
    its model weights, so new requests must not proceed.
    """
    status = self.model_weights_status_signal.value[0]
    return status < 0
144147
async def format_and_add_data(self, prompts: dict):
145148
"""
146149
Format the request data and send the request to the server.

fastdeploy/entrypoints/openai/api_server.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,7 @@ def reset_scheduler():
480480

481481
if llm_engine is None:
482482
return Response("Engine not loaded", status_code=500)
483+
llm_engine.engine.clear_data()
483484
llm_engine.engine.scheduler.reset()
484485
return Response("Scheduler Reset Successfully", status_code=200)
485486

fastdeploy/entrypoints/openai/serving_chat.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,8 @@ async def chat_completion_stream_generator(
210210
decoder_base_url=self.tokenizer_base_url,
211211
)
212212
while num_choices > 0:
213+
if self.engine_client.check_model_weight_status():
214+
raise ValueError("Engine is clearing model weight")
213215
try:
214216
response = await asyncio.wait_for(response_queue.get(), timeout=10)
215217
current_waiting_time = 0
@@ -425,6 +427,8 @@ async def chat_completion_full_generator(
425427
decoder_base_url=self.tokenizer_base_url,
426428
)
427429
while True:
430+
if self.engine_client.check_model_weight_status():
431+
raise ValueError("Engine is clearing model weight")
428432
try:
429433
response = await asyncio.wait_for(response_queue.get(), timeout=10)
430434
current_waiting_time = 0

fastdeploy/entrypoints/openai/serving_completion.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,8 @@ async def completion_full_generator(
216216
completion_batched_token_ids = [[] for _ in range(num_choices)]
217217
current_waiting_time = 0
218218
while num_choices > 0:
219+
if self.engine_client.check_model_weight_status():
220+
raise ValueError("Engine is clearing model weight")
219221
try:
220222
response = await asyncio.wait_for(response_queue.get(), timeout=10)
221223
current_waiting_time = 0
@@ -333,6 +335,8 @@ async def completion_stream_generator(
333335
)
334336
current_waiting_time = 0
335337
while num_choices > 0:
338+
if self.engine_client.check_model_weight_status():
339+
raise ValueError("Engine is clearing model weight")
336340
try:
337341
response = await asyncio.wait_for(response_queue.get(), timeout=10)
338342
current_waiting_time = 0

fastdeploy/inter_communicator/engine_worker_queue.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,13 @@ def get_disaggregated_tasks(self):
392392
llm_logger.debug("get tasks from queue success")
393393
return item
394394

def clear_data(self):
    """Empty the shared task list and reset every client's read flag to 1.

    The shared-state mutation happens under ``self.lock``; using the lock
    as a context manager guarantees it is released even if one of the
    slice assignments raises (a bare acquire()/release() pair would leak
    the lock on error).
    """
    with self.lock:
        self.tasks[:] = list()
        self.client_read_flag[:] = [1] * self.num_client
    llm_logger.info("clear data for engine worker queue")
395402
def cleanup(self):
396403
"""
397404
Exit the worker queue gracefully.

fastdeploy/output/token_processor.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,31 @@ def _record_completion_metrics(self, task, current_time):
464464
main_process_metrics.request_inference_time.observe(current_time - task.inference_start_time)
465465
main_process_metrics.request_generation_tokens.observe(self.tokens_counter[task.request_id])
466466

def clear_data(self):
    """Abort every in-flight request and recycle its resources.

    For each occupied slot (``stop_flags[i]`` is False) a finished
    ``RequestOutput`` carrying the task's EOS token ids is synthesized and
    handed to ``_recycle_resources`` so downstream consumers see the
    request as completed.  With the v1 KV-cache scheduler enabled, the
    resource manager's own queues are cleared first.
    """
    if envs.ENABLE_V1_KVCACHE_SCHEDULER:
        # v1 scheduler keeps extra queues (waiting / reschedule set).
        self.resource_manager.clear_data()
    for i in range(self.cfg.max_num_seqs):
        if self.resource_manager.stop_flags[i]:
            # Slot already stopped/free — nothing to recycle.
            continue
        task = self.resource_manager.tasks_list[i]
        # Synthesize a terminal output: EOS tokens, finished=True, so the
        # request is closed out through the normal completion path.
        result = RequestOutput(
            request_id=task.request_id,
            outputs=CompletionOutput(
                index=i,
                send_idx=self.tokens_counter[task.request_id],
                token_ids=task.eos_token_ids,
                draft_token_ids=[],
            ),
            finished=True,
            metrics=RequestMetrics(
                arrival_time=time.time(),
                request_start_time=task.arrival_time,
            ),
        )
        # NOTE(review): presumably disaggregate_info["role"] distinguishes
        # prefill/decode instances in disaggregated serving — confirm.
        is_prefill = task.disaggregate_info is not None and task.disaggregate_info["role"] == "prefill"
        self._recycle_resources(task.request_id, i, task, result, is_prefill)
        llm_logger.warning(f"clear data for task {task.request_id}")
467492
def _record_speculative_decoding_mertics(self, accept_num):
468493
"""Record metrics of speculative decoding"""
469494
if not hasattr(main_process_metrics, "spec_decode_draft_acceptance_rate"):

fastdeploy/rl/dynamic_weight_manager.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,7 @@ def check_model_weights_status(model_weights_status, model_runner, pid):
228228
logger.info("finished loading new checkpoint")
229229
elif model_weights_status.value[0] == ModelWeightsStatus.CLEARING:
230230
logger.info("infer engine stopped! start to clear checkpoint...")
231+
model_runner.clear_requests()
231232
model_runner.clear_parameters(pid)
232233
while model_weights_status.value[0] != ModelWeightsStatus.CLEARED:
233234
time.sleep(0.01)

fastdeploy/worker/gpu_model_runner.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1704,6 +1704,10 @@ def clear_cache(self):
17041704
self.forward_meta.clear_caches()
17051705
paddle.device.cuda.empty_cache()
17061706

def clear_requests(self):
    """Mark every in-flight request slot as stopped.

    Called by the RL dynamic-weight loader before clearing model
    parameters, so no request keeps running against weights that are
    about to be released.
    """
    self.share_inputs["stop_flags"][:] = True
17071711
def clear_parameters(self, pid):
17081712
"""Dynamic model loader use to clear parameters use for RL"""
17091713
# Clear CUDAGraph

0 commit comments

Comments
 (0)