import time
from typing import Dict, Iterator, Union

from openai.types.completion_usage import CompletionUsage
from openai.types.completion_choice import CompletionChoice
from openai.types.completion import Completion
from openai.types.chat.chat_completion_chunk import (
    ChoiceDelta,
    ChatCompletionChunk,
    Choice as ChatCompletionChoice,
)

from server.http_server.api import Req, chat_completion_generator

def format_openai_message_completions(req: Req, result: Dict) -> str:
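    """Serialize a single generation result as an OpenAI text-completion JSON string."""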
    choice_data = CompletionChoice(
        index=0,
        text=result["token"],
        finish_reason=result.get("finish_reason", "stop"),
    )
    chunk = Completion(
        id=req.req_id,
        choices=[choice_data],
        model=req.model,
        created=int(time.time()),
        object="text_completion",
        usage=CompletionUsage(
            completion_tokens=result["usage"]["completion_tokens"],
            prompt_tokens=result["usage"]["prompt_tokens"],
            total_tokens=result["usage"]["prompt_tokens"] + result["usage"]["completion_tokens"],
        ),
    )
    return chunk.model_dump_json(exclude_unset=True)


def format_openai_message_chat_completions(req: Req, result: Dict) -> str:
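    """Serialize a single generation result as an OpenAI chat.completion.chunk JSON string."""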
    choice_data = ChatCompletionChoice(
        index=0,
        delta=ChoiceDelta(
            content=result["token"],
            role="assistant",
        ),
        finish_reason=result.get("finish_reason", "stop"),
    )
    chunk = ChatCompletionChunk(
        id=req.req_id,
        choices=[choice_data],
        model=req.model,
        created=int(time.time()),
        object="chat.completion.chunk",
        usage=CompletionUsage(
            completion_tokens=result["usage"]["completion_tokens"],
            prompt_tokens=result["usage"]["prompt_tokens"],
            total_tokens=result["usage"]["prompt_tokens"] + result["usage"]["completion_tokens"],
        ),
    )
    return chunk.model_dump_json(exclude_unset=True)


def openai_chat_completion_generator(infer_grpc_url: str, req: Req, chat_interface: bool) -> Iterator[str]:
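    """Yield OpenAI-style server-sent events for a streaming request, with a
    final "[DONE]" sentinel after the last content chunk."""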
    def _openai_format_resp(resp_dict):
        return f"data: {resp_dict}\n\n"

    for resp in chat_completion_generator(infer_grpc_url, req, yield_json=False):
        is_end = resp.get("is_end") == 1
        if chat_interface:
            yield _openai_format_resp(format_openai_message_chat_completions(req, resp))
        else:
            yield _openai_format_resp(format_openai_message_completions(req, resp))
        # The "[DONE]" sentinel must be the last event in the stream, so emit
        # it only after the final content chunk has been sent.
        if is_end:
            yield _openai_format_resp("[DONE]")
            return


def openai_chat_completion_result(infer_grpc_url: str, req: Req, chat_interface: bool) -> Union[Dict, str]:
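    """Aggregate a streamed response into a single OpenAI-formatted payload,
    or return the backend's error dict unchanged if any chunk reported an error."""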
    result = ""
    error_resp = None
    usage = None
    for resp in chat_completion_generator(infer_grpc_url, req, yield_json=False):
        if resp.get("error_msg") or resp.get("error_code"):
            error_resp = resp
            error_resp["result"] = ""
        else:
            result += resp.get("token", "")
            usage = resp.get("usage", None)

    if error_resp:
        return error_resp
    response = {"token": result, "error_msg": "", "error_code": 0, "usage": usage}

    if chat_interface:
        return format_openai_message_chat_completions(req, response)
    else:
        return format_openai_message_completions(req, response)
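

# Minimal consumption sketch for local testing, not part of the server wiring.
# Assumptions: a backend gRPC endpoint is reachable at the URL below, and Req
# accepts these keyword arguments (req_id/model/text are hypothetical fields).
if __name__ == "__main__":
    demo_req = Req(req_id="demo-1", model="demo-model", text="Hello")  # hypothetical Req fields
    for event in openai_chat_completion_generator("localhost:8001", demo_req, chat_interface=True):
        print(event, end="")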