
Commit 30c8cdc

Merge pull request #2525 from kevincheng2/develop
[LLM] support openai client
2 parents 798229c + 6ee776a commit 30c8cdc

10 files changed: 275 additions & 33 deletions


llm/dockerfiles/Dockerfile_serving_cuda118_cudnn8

Lines changed: 4 additions & 6 deletions
@@ -4,7 +4,7 @@ WORKDIR /opt/output/
 COPY ./server/ /opt/output/Serving/
 COPY ./client/ /opt/output/client/

-ENV LD_LIBRARY_PATH "/usr/local/cuda-11.8/compat/:$LD_LIBRARY_PATH"
+ENV LD_LIBRARY_PATH="/usr/local/cuda-11.8/compat/:$LD_LIBRARY_PATH"

 RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
 RUN python3 -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu118/ \
@@ -15,7 +15,7 @@ RUN git clone https://gitee.com/paddlepaddle/PaddleNLP.git && cd PaddleNLP/csrc
     && python3 setup_cuda.py build && python3 setup_cuda.py install --user \
     && cp -r /opt/output/PaddleNLP/paddlenlp /usr/local/lib/python3.10/dist-packages/ \
     && cp -r /root/.local/lib/python3.10/site-packages/* /usr/local/lib/python3.10/dist-packages/ \
-    && rm -rf PaddleNLP
+    && rm -rf /opt/output/PaddleNLP

 RUN cd /opt/output/client && pip install -r requirements.txt && pip install .

@@ -30,7 +30,5 @@ RUN cd /opt/output/Serving/ \
     && cp scripts/start_server.sh . && cp scripts/stop_server.sh . \
     && rm -rf scripts

-RUN python3 -m pip install protobuf==3.20.0
-
-ENV http_proxy ""
-ENV https_proxy ""
+ENV http_proxy=""
+ENV https_proxy=""

llm/dockerfiles/Dockerfile_serving_cuda123_cudnn9

Lines changed: 4 additions & 6 deletions
@@ -4,7 +4,7 @@ WORKDIR /opt/output/
 COPY ./server/ /opt/output/Serving/
 COPY ./client/ /opt/output/client/

-ENV LD_LIBRARY_PATH "/usr/local/cuda-12.3/compat/:$LD_LIBRARY_PATH"
+ENV LD_LIBRARY_PATH="/usr/local/cuda-12.3/compat/:$LD_LIBRARY_PATH"

 RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
 RUN python3 -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu123/ \
@@ -15,7 +15,7 @@ RUN git clone https://gitee.com/paddlepaddle/PaddleNLP.git && cd PaddleNLP/csrc
     && python3 setup_cuda.py build && python3 setup_cuda.py install --user \
     && cp -r /opt/output/PaddleNLP/paddlenlp /usr/local/lib/python3.10/dist-packages/ \
     && cp -r /root/.local/lib/python3.10/site-packages/* /usr/local/lib/python3.10/dist-packages/ \
-    && rm -rf PaddleNLP
+    && rm -rf /opt/output/PaddleNLP

 RUN cd /opt/output/client && pip install -r requirements.txt && pip install .

@@ -30,7 +30,5 @@ RUN cd /opt/output/Serving/ \
     && cp scripts/start_server.sh . && cp scripts/stop_server.sh . \
     && rm -rf scripts

-RUN python3 -m pip install protobuf==3.20.0
-
-ENV http_proxy ""
-ENV https_proxy ""
+ENV http_proxy=""
+ENV https_proxy=""

llm/docs/FastDeploy_usage_tutorial.md

Lines changed: 72 additions & 1 deletion
@@ -66,7 +66,7 @@ ls /fastdeploy/models/
 git clone https://github.com/PaddlePaddle/FastDeploy.git
 cd FastDeploy/llm

-docker build -f ./dockerfiles/Dockerfile_serving_cuda123_cudnn9 -t llm-serving-cu123-self .
+docker build --network=host -f ./dockerfiles/Dockerfile_serving_cuda123_cudnn9 -t llm-serving-cu123-self .
 ```

 After building your own image, you can [create a container](#创建容器) based on it
@@ -196,6 +196,77 @@ for line in res.iter_lines():
 If an error occurs, the response is {'error_msg': xxx, 'error_code': xxx}, where the error_msg field is non-empty and the error_code field is non-zero
 ```

+### OpenAI Client
+
+We provide support for the OpenAI client; usage is as follows:
+
+Note: using the OpenAI client requires `PUSH_MODE_HTTP_PORT` to be configured
+
+```
+import openai
+
+client = openai.Client(base_url="http://127.0.0.1:{PUSH_MODE_HTTP_PORT}/v1/chat/completions", api_key="EMPTY_API_KEY")
+
+# Non-streaming response
+response = client.completions.create(
+    model="default",
+    prompt="Hello, how are you?",
+    max_tokens=50,
+    stream=False,
+)
+
+print(response)
+print("\n")
+
+# Streaming response
+response = client.completions.create(
+    model="default",
+    prompt="Hello, how are you?",
+    max_tokens=100,
+    stream=True,
+)
+
+for chunk in response:
+    if chunk.choices[0] is not None:
+        print(chunk.choices[0].text, end='')
+print("\n")
+
+# Chat completion
+# Non-streaming response
+response = client.chat.completions.create(
+    model="default",
+    messages=[
+        {"role": "user", "content": "Hello, who are you"},
+        {"role": "system", "content": "I'm a helpful AI assistant."},
+        {"role": "user", "content": "List 3 countries and their capitals."},
+    ],
+    temperature=0,
+    max_tokens=64,
+    stream=False,
+)
+
+print(response)
+print("\n")
+
+# Streaming response
+response = client.chat.completions.create(
+    model="default",
+    messages=[
+        {"role": "user", "content": "Hello, who are you"},
+        {"role": "system", "content": "I'm a helpful AI assistant."},
+        {"role": "user", "content": "List 3 countries and their capitals."},
+    ],
+    temperature=0,
+    max_tokens=64,
+    stream=True,
+)
+
+for chunk in response:
+    if chunk.choices[0].delta is not None:
+        print(chunk.choices[0].delta.content, end='')
+print("\n")
+```
+
 ## Model configuration parameters

 | Field name | Field type | Description | Required | Default | Notes |
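
Note on the tutorial snippet added above: `{PUSH_MODE_HTTP_PORT}` in `base_url` is a placeholder, not Python string formatting, so it must be replaced with the HTTP port the push-mode server was actually started on. A minimal sketch of filling it in from an environment variable (the variable name follows the tutorial; the rest is illustrative and not part of this commit):

```
import os
import openai

# Assumes PUSH_MODE_HTTP_PORT is exported in the environment, as the tutorial requires.
port = os.environ["PUSH_MODE_HTTP_PORT"]
client = openai.Client(
    base_url=f"http://127.0.0.1:{port}/v1/chat/completions",
    api_key="EMPTY_API_KEY",
)
```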

llm/server/requirements.txt

Lines changed: 1 addition & 3 deletions
@@ -1,5 +1,4 @@
 # model server
-paddlenlp==2.7.2
 sentencepiece
 pycryptodome
 tritonclient[all]==2.41.1
@@ -10,7 +9,7 @@ transformers
 # http server
 fastapi
 httpx
-openai==1.9.0
+openai==1.44.1
 asyncio
 uvicorn
 shortuuid
@@ -20,4 +19,3 @@ pynvml

 # paddlenlp
 tiktoken
-transformers

llm/server/scripts/start_server.sh

Lines changed: 1 addition & 2 deletions
@@ -6,8 +6,7 @@ export PYTHONIOENCODING=utf8
 export LC_ALL=C.UTF-8

 # PaddlePaddle environment variables
-export FLAGS_allocator_strategy=naive_best_fit
-export FLAGS_fraction_of_gpu_memory_to_use=0.96
+export FLAGS_allocator_strategy=auto_growth
 export FLAGS_dynamic_static_unified_comm=0
 export FLAGS_use_xqa_optim=1
 export FLAGS_gemm_use_half_precision_compute_type=0

llm/server/server/checker.py

Lines changed: 0 additions & 2 deletions
@@ -40,8 +40,6 @@ def check_basic_params(req_dict):
         error_msg.append("The `input_ids` in input parameters must be a list")
     if "messages" in req_dict:
         msg_len = len(req_dict["messages"])
-        if msg_len % 2 == 0:
-            error_msg.append(f"The number of the message {msg_len} must be odd")
         if not all("content" in item for item in req_dict["messages"]):
            error_msg.append("The item in messages must include `content`")

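This removal is what lets OpenAI-style `messages` payloads through: such conversations can legitimately contain an even number of turns, which the old parity check rejected. A minimal sketch of a request body that the updated `check_basic_params` now accepts (field values are illustrative only):

```
req_dict = {
    "messages": [
        {"role": "system", "content": "I'm a helpful AI assistant."},
        {"role": "user", "content": "List 3 countries and their capitals."},
    ]
}
# Two messages (an even count) now pass, provided every item still carries `content`.
```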

llm/server/server/data/processor.py

Lines changed: 16 additions & 10 deletions
@@ -125,8 +125,8 @@ def __init__(self):

         self.decode_status = dict()
         self.tokenizer = self._load_tokenizer()
-        data_processor_logger.info(f"tokenizer infomation: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, "+
-                                   f"eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id}, ")
+        data_processor_logger.info(f"tokenizer infomation: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, \
+                                   eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id} ")

     def process_request(self, request, max_seq_len=None):
         """
@@ -143,14 +143,19 @@ def process_request(self, request, max_seq_len=None):
         request["eos_token_ids"] = []
         request["eos_token_ids"].extend(get_eos_token_id(self.tokenizer, self.config.generation_config))

-        if "input_ids" in request:
-            input_ids = request["input_ids"]
-        else:
-            input_ids = self.text2ids(request['text'])
+        if "input_ids" not in request or \
+            (isinstance(request["input_ids"], (list, tuple)) and len(request["input_ids"]) == 0):
+            if "text" in request:
+                request["input_ids"] = self.text2ids(request["text"])
+            elif "messages" in request:
+                if self.tokenizer.chat_template is None:
+                    raise ValueError(f"This model does not support chat_template.")
+                request["input_ids"] = self.messages2ids(request["messages"])
+            else:
+                raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.")

-        if max_seq_len is not None and len(input_ids) > max_seq_len:
-            input_ids = input_ids[:max_seq_len-1]
-            request["input_ids"] = input_ids
+        if max_seq_len is not None and len(request["input_ids"]) > max_seq_len:
+            request["input_ids"] = request["input_ids"][:max_seq_len-1]
         data_processor_logger.info(f"processed request: {request}")
         return request

@@ -221,7 +226,8 @@ def messages2ids(self, messages):
         Returns:
             List[int]: ID sequences
         """
-        return
+        message_result = self.tokenizer.apply_chat_template(messages, return_tensors="pd")
+        return message_result["input_ids"][0]

     def ids2tokens(self, token_id, task_id):
         """
Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
+import time
+import json
+import queue
+
+import numpy as np
+from typing import Dict
+from datetime import datetime
+from functools import partial
+
+import tritonclient.grpc as grpcclient
+from tritonclient import utils as triton_utils
+from openai.types.completion_usage import CompletionUsage
+from openai.types.completion_choice import CompletionChoice
+from openai.types.completion import Completion
+from openai.types.chat.chat_completion_chunk import (
+    ChoiceDelta,
+    ChatCompletionChunk,
+    Choice as ChatCompletionChoice
+)
+
+from server.http_server.api import Req, chat_completion_generator
+from server.utils import http_server_logger
+
+
+def format_openai_message_completions(req: Req, result: Dict) -> Completion:
+    choice_data = CompletionChoice(
+        index=0,
+        text=result['token'],
+        finish_reason=result.get("finish_reason", "stop"),
+    )
+    chunk = Completion(
+        id=req.req_id,
+        choices=[choice_data],
+        model=req.model,
+        created=int(time.time()),
+        object="text_completion",
+        usage=CompletionUsage(
+            completion_tokens=result["usage"]["completion_tokens"],
+            prompt_tokens=result["usage"]["prompt_tokens"],
+            total_tokens=result["usage"]["prompt_tokens"] + result["usage"]["completion_tokens"],
+        ),
+    )
+    return chunk.model_dump_json(exclude_unset=True)
+
+
+def format_openai_message_chat_completions(req: Req, result: Dict) -> ChatCompletionChunk:
+    choice_data = ChatCompletionChoice(
+        index=0,
+        delta=ChoiceDelta(
+            content=result['token'],
+            role="assistant",
+        ),
+        finish_reason=result.get("finish_reason", "stop"),
+    )
+    chunk = ChatCompletionChunk(
+        id=req.req_id,
+        choices=[choice_data],
+        model=req.model,
+        created=int(time.time()),
+        object="chat.completion.chunk",
+        usage=CompletionUsage(
+            completion_tokens=result["usage"]["completion_tokens"],
+            prompt_tokens=result["usage"]["prompt_tokens"],
+            total_tokens=result["usage"]["prompt_tokens"] + result["usage"]["completion_tokens"],
+        ),
+    )
+    return chunk.model_dump_json(exclude_unset=True)
+
+
+def openai_chat_commpletion_generator(infer_grpc_url: str, req: Req, chat_interface: bool) -> Dict:
+
+    def _openai_format_resp(resp_dict):
+        return f"data: {resp_dict}\n\n"
+
+    for resp in chat_completion_generator(infer_grpc_url, req, yield_json=False):
+        if resp.get("is_end") == 1:
+            yield _openai_format_resp("[DONE]")
+
+        if chat_interface:
+            yield _openai_format_resp(format_openai_message_chat_completions(req, resp))
+        else:
+            yield _openai_format_resp(format_openai_message_completions(req, resp))
+
+
+def openai_chat_completion_result(infer_grpc_url: str, req: Req, chat_interface: bool):
+    result = ""
+    error_resp = None
+    for resp in chat_completion_generator(infer_grpc_url, req, yield_json=False):
+        if resp.get("error_msg") or resp.get("error_code"):
+            error_resp = resp
+            error_resp["result"] = ""
+        else:
+            result += resp.get("token")
+            usage = resp.get("usage", None)
+
+    if error_resp:
+        return error_resp
+    response = {'token': result, 'error_msg': '', 'error_code': 0, 'usage': usage}
+
+    if chat_interface:
+        return format_openai_message_chat_completions(req, response)
+    else:
+        return format_openai_message_completions(req, response)
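The streaming generator above frames every chunk as a server-sent event (`data: ...` followed by a blank line) and emits a literal `[DONE]` sentinel once `is_end` is set, which is the framing an OpenAI-style streaming client consumes. A rough sketch of reading that stream on the client side (the helper name and loop are illustrative, not part of this commit):

```
import json

def read_sse_stream(lines):
    """Parse `data: ...` lines such as those produced by openai_chat_commpletion_generator."""
    for line in lines:
        if not line.startswith("data: "):
            continue
        payload = line[len("data: "):].strip()
        if payload == "[DONE]":
            break
        chunk = json.loads(payload)
        # For the chat interface each payload is a serialized ChatCompletionChunk.
        print(chunk["choices"][0]["delta"]["content"], end="")
```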

llm/server/server/http_server/api.py

Lines changed: 27 additions & 2 deletions
@@ -16,6 +16,7 @@
 import queue
 import time
 import uuid
+import shortuuid
 from datetime import datetime
 from functools import partial
 from typing import Dict, List, Optional
@@ -46,6 +47,7 @@ class Req(BaseModel):
     return_usage: Optional[bool] = False
     stream: bool = False
     timeout: int = 300
+    model: str = None

     def to_dict_for_infer(self):
         """
@@ -54,14 +56,37 @@ def to_dict_for_infer(self):
         Returns:
             dict: request parameters in dict format
         """
-        self.compatible_with_OpenAI()
-
         req_dict = {}
         for key, value in self.dict().items():
             if value is not None:
                 req_dict[key] = value
         return req_dict

+    def load_openai_request(self, request_dict: dict):
+        """
+        Convert openai request to Req
+        official OpenAI API documentation: https://platform.openai.com/docs/api-reference/completions/create
+        """
+        convert_dict = {
+            "text": "prompt",
+            "frequency_score": "frequency_penalty",
+            "max_dec_len": "max_tokens",
+            "stream": "stream",
+            "return_all_tokens": "best_of",
+            "temperature": "temperature",
+            "topp": "top_p",
+            "presence_score": "presence_penalty",
+            "eos_token_ids": "stop",
+            "req_id": "id",
+            "model": "model",
+            "messages": "messages",
+        }
+
+        self.__setattr__("req_id", f"chatcmpl-{shortuuid.random()}")
+        for key, value in convert_dict.items():
+            if request_dict.get(value, None):
+                self.__setattr__(key, request_dict.get(value))
+

 def chat_completion_generator(infer_grpc_url: str, req: Req, yield_json: bool) -> Dict:
     """
