
Commit 6d0d404

[Cherry-Pick][Feature] Added the /v1/abort_requests endpoint(#6992) (#7093)
* abort requests
* add finish_reason
1 parent 474174b commit 6d0d404

13 files changed

Lines changed: 498 additions & 6 deletions


docs/online_serving/README.md

Lines changed: 1 addition & 0 deletions
@@ -577,3 +577,4 @@ DeltaFunctionCall:
 - `/v1/pause` - Pause generation (causes denial of service). Inflight requests are aborted and cache is reset.
 - `/v1/resume` - Resume generation.
 - `/v1/is_paused` - Check if generation is paused.
+- `/v1/abort_requests` - Abort inference requests to release GPU memory (KV Cache blocks) and compute resources. Accepts `req_ids` (list of request IDs) or `abort_all=true` (abort all requests). Returns the list of aborted requests with their generated token counts.
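
As a quick illustration of the endpoint documented above, here is a minimal client-side sketch. The server address and request IDs are placeholders, and the response fields follow the `aborted` / `not_found` info introduced by this change (the exact envelope depends on the control response wrapper):

# Minimal sketch of calling /v1/abort_requests; the base URL and request IDs
# below are placeholders, not values from this change.
import requests

BASE_URL = "http://localhost:8000"

# Abort two specific in-flight requests by ID.
resp = requests.post(
    f"{BASE_URL}/v1/abort_requests",
    json={"req_ids": ["request-id-1", "request-id-2"]},
)
print(resp.json())  # carries the aborted / not_found information

# Or abort everything that is currently running or waiting.
resp = requests.post(f"{BASE_URL}/v1/abort_requests", json={"abort_all": True})
print(resp.json())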

docs/online_serving/router.md

Lines changed: 1 addition & 0 deletions
@@ -151,6 +151,7 @@ The Router exposes a set of HTTP services to provide unified request scheduling,
 |----------|------|------|
 | POST | `/v1/chat/completions` | Provide scheduling services for inference requests based on the Chat Completions API |
 | POST | `/v1/completions` | Provide scheduling services for general text completion inference requests |
+| POST | `/v1/abort_requests` | Abort inference requests to release GPU memory and compute resources. Accepts `req_ids` or `abort_all=true`. Returns aborted requests with their generated token counts |
 | POST | `/register` | Allow inference instances to register their metadata with the Router for scheduling |
 | GET | `/registered` | Query the list of currently registered inference instances |
 | GET | `/registered_number` | Query the number of currently registered inference instances |

docs/zh/online_serving/README.md

Lines changed: 1 addition & 0 deletions
@@ -563,3 +563,4 @@ DeltaFunctionCall:
 /v1/pause - Pause generation (the service will reject inference requests). Inflight requests are aborted and the cache is reset.
 /v1/resume - Resume generation.
 /v1/is_paused - Check whether generation is paused.
+/v1/abort_requests - Abort inference requests to release GPU memory (KV Cache blocks) and compute resources. Accepts `req_ids` (a list of request IDs) or `abort_all=true` (abort all requests). Returns the list of aborted requests with their generated token counts.

docs/zh/online_serving/router.md

Lines changed: 1 addition & 0 deletions
@@ -153,6 +153,7 @@ The Router provides unified scheduling services over its HTTP interface and also supports running
 |----------|------|------|
 | POST | `/v1/chat/completions` | Provides scheduling services for inference requests based on the Chat API |
 | POST | `/v1/completions` | Provides scheduling services for general text completion requests |
+| POST | `/v1/abort_requests` | Abort inference requests to release GPU memory and compute resources. Accepts `req_ids` or `abort_all=true`; returns the list of aborted requests with their generated token counts |
 | POST | `/register` | Inference instances register their own metadata with the Router to participate in scheduling |
 | GET | `/registered` | Query the list of currently registered inference instances |
 | GET | `/registered_number` | Query the number of currently registered inference instances |

fastdeploy/engine/common_engine.py

Lines changed: 135 additions & 0 deletions
@@ -43,9 +43,11 @@
 from fastdeploy.cache_manager.cache_data import CacheStatus
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import (
+    CompletionOutput,
     ControlRequest,
     ControlResponse,
     Request,
+    RequestMetrics,
     RequestOutput,
     RequestStatus,
     RequestType,
@@ -1413,6 +1415,139 @@ def _control_update_weights(self, control_request: ControlRequest) -> Optional[d
             raise Exception(error_msg)
         return self._call_worker(control_request, 60)

+    def _control_abort_requests(self, control_req: ControlRequest):
+        if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
+            raise Exception("abort_requests only supported in ENABLE_V1_KVCACHE_SCHEDULER")
+        args = control_req.get_args()
+        abort_all = args.get("abort_all", False)
+        req_ids = args.get("req_ids", [])
+        matched_input_ids = set()
+        now_reqs = list(set(self.resource_manager.requests.keys()) | set(self.scheduler.requests.keys()))
+
+        # Step 1: Determine the target request list
+        if abort_all:
+            # all requests currently running or waiting
+            target_req_ids = now_reqs
+        else:
+            # keep only the requested IDs that actually exist
+            target_req_ids = []
+            for rid in req_ids:
+                if rid in now_reqs:
+                    target_req_ids.append(rid)
+                    matched_input_ids.add(rid)
+                elif f"{rid}_0" in now_reqs:
+                    target_req_ids.append(f"{rid}_0")
+                    matched_input_ids.add(rid)
+
+        if not target_req_ids:
+            return {"aborted": [], "not_found": req_ids if not abort_all else []}
+
+        # Step 2: Collect partial results
+        aborted_info = []
+        results = []
+        for req_id in target_req_ids:
+            request = self.resource_manager.requests.get(req_id)
+            if request is None:
+                scheduled_req = self.scheduler.requests.get(req_id)
+                if scheduled_req is None:
+                    continue
+                request = scheduled_req.raw
+
+            partial_token_ids = list(request.output_token_ids)
+
+            # Construct a finished response carrying the partial results
+            now = time.time()
+            abort_metrics = RequestMetrics(
+                arrival_time=request.metrics.arrival_time if request.metrics else now,
+                inference_start_time=request.metrics.inference_start_time if request.metrics else now,
+                engine_recv_latest_token_time=now,
+                engine_recv_first_token_time=request.metrics.engine_recv_first_token_time if request.metrics else now,
+                request_start_time=request.metrics.arrival_time if request.metrics else now,
+            )
+            result = RequestOutput(
+                request_id=req_id,
+                finished=True,
+                outputs=CompletionOutput(
+                    index=0,
+                    send_idx=len(partial_token_ids),
+                    token_ids=[self.data_processor.eos_token_ids[0]],
+                ),
+                metrics=abort_metrics,
+                error_code=200,
+                error_msg="Aborted",
+            )
+            results.append(result)
+            aborted_info.append(
+                {
+                    "request_id": req_id,
+                    "output_token_count": len(partial_token_ids),
+                }
+            )
+
+        # Step 3: Execute the abort: add all requests to waiting_abort_req_id_set
+        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+            for req_id in target_req_ids:
+                self.resource_manager.add_abort_req_ids(req_id)
+                time.sleep(0.0001)
+            if self.cfg.scheduler_config.splitwise_role != "prefill":
+                self._wait_abort_complete(target_req_ids)
+
+        # Add the results to the scheduler; a separate engine thread calls get_results,
+        # cleans up, and calls send_response to deliver them to the client.
+        # If the client has disconnected, send_response silently ignores the result.
+        if self.cfg.scheduler_config.splitwise_role != "prefill":
+            try:
+                # self.send_response_server.send_response(req_id, [result])
+                self.scheduler.put_results(results)
+            except Exception:
+                pass  # client may have disconnected
+
+        not_found = [rid for rid in req_ids if rid not in matched_input_ids] if not abort_all else []
+
+        return {"aborted": aborted_info, "not_found": not_found}
+
+    def _wait_abort_complete(self, target_req_ids, stall_timeout=1):
+        """
+        Wait for all abort requests to complete.
+        - Keep monitoring as long as `remaining` is non-empty, i.e. cleanup has not finished yet.
+        - If no progress is made within `stall_timeout` seconds, force-clean the requests stuck in
+          to_be_aborted_req_id_set, reset the progress state if anything was cleaned, then continue monitoring.
+        """
+        target_set = set(target_req_ids)
+        prev_remaining_count = len(target_set)
+        last_progress_time = time.time()
+        remaining = target_set & self.resource_manager.get_reqs_in_aborting()
+        while remaining:
+            remaining = target_set & self.resource_manager.get_reqs_in_aborting()
+            if not remaining:
+                self.llm_logger.info(f"all {len(target_set)} abort reqs cleaned")
+                return
+
+            current_count = len(remaining)
+            if current_count < prev_remaining_count:
+                # progress made: recycle_abort_task was called
+                self.llm_logger.info(f"abort progress: {prev_remaining_count} -> {current_count}")
+                last_progress_time = time.time()
+                prev_remaining_count = current_count
+
+            if time.time() - last_progress_time > stall_timeout:
+                # no progress within the timeout: only clean up requests stuck in to_be_aborted
+                # (the worker has not returned -9 for them yet)
+                stuck = remaining & self.resource_manager.to_be_aborted_req_id_set
+                if stuck:
+                    self.llm_logger.warning(
+                        f"no abort progress for {stall_timeout}s, "
+                        f"force cleanup {len(stuck)} stuck requests (in to_be_aborted)"
+                    )
+                    for req_id in list(stuck):
+                        self.llm_logger.warning(f"force cleanup stuck req_id:{req_id}")
+                        self.resource_manager.recycle_abort_task(req_id)
+                    # reset progress state
+                    last_progress_time = time.time()
+                    prev_remaining_count = current_count - len(stuck)
+                # else: remaining are all in waiting_abort_req_id_set, waiting for the natural flow
+
+            time.sleep(0.005)
+
     def _parse_tags(self, control_request: ControlRequest):
         """
         Parse tags from control request.
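
For reference, the dict returned by `_control_abort_requests` above has the shape sketched below; the request IDs and token counts are illustrative only:

# Illustrative only: shape of the value returned by _control_abort_requests,
# based on the code above. The IDs and counts are made up.
example_result = {
    "aborted": [
        {"request_id": "request-id-1", "output_token_count": 42},
        {"request_id": "request-id-2_0", "output_token_count": 7},
    ],
    "not_found": ["request-id-3"],  # only populated when abort_all is false
}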

fastdeploy/engine/sched/resource_manager_v1.py

Lines changed: 4 additions & 0 deletions
@@ -279,6 +279,7 @@ def recycle_abort_task(self, request_id):
         del self.requests[request_id]
         del self.req_dict[request_id]
         self.to_be_aborted_req_id_set.remove(request_id)
+        self.update_metrics()

     def _trigger_abort(self, request_id, scheduled_reqs):
         if request_id in self.requests:
@@ -1120,6 +1121,9 @@ def download_bos_features(bos_client, features_urls):
                     return None
                 inputs["audio_features"] = result

+    def get_reqs_in_aborting(self):
+        return self.waiting_abort_req_id_set | self.to_be_aborted_req_id_set
+
     def get_available_position(self) -> int:
         position = 0
         while position < self.max_num_seqs:

fastdeploy/entrypoints/openai/api_server.py

Lines changed: 19 additions & 0 deletions
@@ -473,6 +473,25 @@ async def update_weights(request: Request) -> Response:
     return control_response.to_api_json_response()


+@app.post("/v1/abort_requests")
+async def abort_requests(request: Request):
+    body = await request.json()
+    abort_all = body.get("abort_all", False)
+    req_ids = body.get("req_ids", None)
+
+    # Validate parameters
+    if not abort_all and not req_ids:
+        return JSONResponse(status_code=400, content={"error": "must provide abort_all=true or req_ids"})
+
+    control_request = ControlRequest(
+        request_id=f"control-{uuid.uuid4()}",
+        method="abort_requests",
+        args={"abort_all": abort_all, "req_ids": req_ids or []},
+    )
+    control_response = await app.state.engine_client.run_control_method(control_request)
+    return control_response.to_api_json_response()
+
+
 def wrap_streaming_generator(original_generator: AsyncGenerator):
     """
     Wrap an async generator to release the connection semaphore when the generator is finished.
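
The handler above rejects a request that carries neither parameter. A small sketch of exercising that validation path (the server address is a placeholder):

# Sketch of the validation path: an empty body yields HTTP 400.
# The server address is a placeholder.
import requests

resp = requests.post("http://localhost:8000/v1/abort_requests", json={})
assert resp.status_code == 400
print(resp.json())  # {"error": "must provide abort_all=true or req_ids"}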

fastdeploy/entrypoints/openai/protocol.py

Lines changed: 4 additions & 4 deletions
@@ -268,7 +268,7 @@ class ChatCompletionResponseChoice(BaseModel):
     logprobs: Optional[LogProbs] = None
     draft_logprobs: Optional[LogProbs] = None
     prompt_logprobs: Optional[PromptLogprobs] = None
-    finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop"]]
+    finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop", "abort"]]
     speculate_metrics: Optional[SpeculateMetrics] = None


@@ -333,7 +333,7 @@ class ChatCompletionResponseStreamChoice(BaseModel):
     logprobs: Optional[LogProbs] = None
     draft_logprobs: Optional[LogProbs] = None
     prompt_logprobs: Optional[PromptLogprobs] = None
-    finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop"]] = None
+    finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop", "abort"]] = None
     arrival_time: Optional[float] = None
     speculate_metrics: Optional[SpeculateMetrics] = None

@@ -369,7 +369,7 @@ class CompletionResponseChoice(BaseModel):
     draft_logprobs: Optional[CompletionLogprobs] = None
     prompt_logprobs: Optional[PromptLogprobs] = None
     reasoning_content: Optional[str] = None
-    finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop"]] = None
+    finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop", "abort"]] = None
     tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
     speculate_metrics: Optional[SpeculateMetrics] = None

@@ -415,7 +415,7 @@ class CompletionResponseStreamChoice(BaseModel):
     prompt_tokens: Optional[str] = None
     completion_tokens: Optional[str] = None
     reasoning_content: Optional[str] = None
-    finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop"]] = None
+    finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop", "abort"]] = None
     tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
     speculate_metrics: Optional[SpeculateMetrics] = None
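
With "abort" added to the allowed `finish_reason` values, an OpenAI-compatible client can detect server-side aborts in a stream. A minimal sketch using the openai Python client; the base URL, API key, and model name are placeholders:

# Minimal sketch: detecting an aborted request in a streaming chat completion.
# base_url, api_key, and model are placeholders for a FastDeploy deployment.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="my-model",
    messages=[{"role": "user", "content": "Write a long story."}],
    stream=True,
)

for chunk in stream:
    if not chunk.choices:
        continue
    choice = chunk.choices[0]
    if choice.delta.content:
        print(choice.delta.content, end="", flush=True)
    if choice.finish_reason == "abort":
        # The server aborted this request via /v1/abort_requests.
        print("\n[generation aborted by the server]")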

fastdeploy/entrypoints/openai/serving_chat.py

Lines changed: 5 additions & 0 deletions
@@ -463,6 +463,9 @@ async def chat_completion_stream_generator(
                     if res.get("error_msg") is not None and "Recover" in res["error_msg"]:
                         choice.finish_reason = "recover_stop"

+                    if res.get("error_msg") is not None and "Aborted" in res["error_msg"]:
+                        choice.finish_reason = "abort"
+
                     inference_start_time[idx] = 0

                     if request.collect_metrics:
@@ -795,6 +798,8 @@ async def _create_chat_completion_choice(
         if data.get("error_msg", None) is not None and "Recover" in data["error_msg"]:
             finish_reason = "recover_stop"

+        if data.get("error_msg", None) is not None and "Aborted" in data["error_msg"]:
+            finish_reason = "abort"
         return ChatCompletionResponseChoice(
             index=idx,
             message=message,

fastdeploy/entrypoints/openai/serving_completion.py

Lines changed: 4 additions & 0 deletions
@@ -582,6 +582,8 @@ async def completion_stream_generator(
                             output,
                             tool_called[idx],
                         )
+                        if res.get("error_msg") is not None and "Aborted" in res["error_msg"]:
+                            choices[-1].finish_reason = "abort"
                         inference_start_time[idx] = 0

                         send_idx = output.get("send_idx")
@@ -724,6 +726,8 @@ def request_output_to_completion_response(
                 output,
                 False,
             )
+            if final_res.get("error_msg", None) is not None and "Aborted" in final_res["error_msg"]:
+                finish_reason = "abort"

            choice_data = CompletionResponseChoice(
                token_ids=token_ids,
