Commit 4ad61ef

unamedkr and claude committed
Add OpenAI-compatible HTTP server (/v1/chat/completions)
Pure C HTTP server with streaming SSE support, zero external deps.
Endpoints: POST /v1/chat/completions, GET /v1/models, GET /health.
KV compression configurable per-request.
Build with -DTQ_BUILD_SERVER=ON.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent dd612c4 commit 4ad61ef

4 files changed

Lines changed: 1477 additions & 0 deletions
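Since the new endpoint advertises OpenAI compatibility with SSE streaming, a request carrying "stream": true should produce the standard chat-completions event stream: one "data:" line per chunk carrying a JSON delta, closed by a literal [DONE] sentinel. A rough illustration of that wire format (placeholder values, not taken from this commit's source):

data: {"object":"chat.completion.chunk","choices":[{"index":0,"delta":{"role":"assistant","content":"Hel"},"finish_reason":null}]}

data: {"object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":"lo"},"finish_reason":null}]}

data: {"object":"chat.completion.chunk","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}

data: [DONE]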

CMakeLists.txt

Lines changed: 17 additions & 0 deletions
@@ -10,6 +10,7 @@ option(TQ_BUILD_CUDA "Build CUDA backend" OFF)
 option(TQ_BUILD_METAL "Build Metal backend" OFF)
 option(TQ_BUILD_VULKAN "Build Vulkan backend" OFF)
 option(TQ_BUILD_ROCM "Build ROCm/HIP backend" OFF)
+option(TQ_BUILD_SERVER "Build OpenAI-compatible HTTP server" OFF)
 
 # Threads (pthread)
 find_package(Threads REQUIRED)
@@ -276,3 +277,19 @@ foreach(ex_src ${EXAMPLE_C_SOURCES} ${EXAMPLE_CXX_SOURCES})
   add_executable(${ex_name} ${ex_src})
   target_link_libraries(${ex_name} turboquant)
 endforeach()
+
+# OpenAI-compatible HTTP server
+if(TQ_BUILD_SERVER)
+  add_executable(quant-server src/server/tq_server.c)
+  target_include_directories(quant-server PRIVATE
+    ${CMAKE_SOURCE_DIR}/src/server
+    ${CMAKE_SOURCE_DIR}/include)
+  target_compile_definitions(quant-server PRIVATE TQ_SERVER_MAIN)
+  target_link_libraries(quant-server turboquant Threads::Threads)
+  if(NOT MSVC)
+    target_link_libraries(quant-server m)
+  endif()
+  target_compile_options(quant-server PRIVATE
+    -Wall -Wextra -Wpedantic -Wno-unused-parameter)
+  message(STATUS "quant.cpp: HTTP server target enabled (quant-server)")
+endif()
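
With the option enabled, configuring, building, and launching the target follows the commands in the example script below; shown here as a compact sketch (the model path and all flags besides -p <port> are simply the values the script passes and are not otherwise documented in this diff):

# Configure and build with the server target enabled
cmake -B build -DCMAKE_BUILD_TYPE=Release -DTQ_BUILD_SERVER=ON
cmake --build build -j$(nproc)

# Launch, mirroring the example script's invocation (model.gguf is a placeholder path)
./build/quant-server model.gguf -p 8080 -j 4 -k uniform_4b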

examples/server_example.sh

Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
#!/usr/bin/env bash
# server_example.sh -- Start and test the quant.cpp OpenAI-compatible server
#
# Usage:
#   ./examples/server_example.sh <model.gguf>
#
# Prerequisites:
#   cmake -B build -DCMAKE_BUILD_TYPE=Release -DTQ_BUILD_SERVER=ON
#   cmake --build build -j$(nproc)

set -euo pipefail

MODEL="${1:-}"
PORT="${2:-8080}"

if [ -z "$MODEL" ]; then
    echo "Usage: $0 <model.gguf> [port]"
    echo ""
    echo "Example:"
    echo "  $0 Qwen2.5-0.5B-Instruct.gguf 8080"
    exit 1
fi

SERVER="./build/quant-server"
if [ ! -f "$SERVER" ]; then
    echo "Error: quant-server not found. Build with:"
    echo "  cmake -B build -DCMAKE_BUILD_TYPE=Release -DTQ_BUILD_SERVER=ON"
    echo "  cmake --build build -j\$(nproc)"
    exit 1
fi

echo "=== Starting quant.cpp server ==="
echo "Model: $MODEL"
echo "Port: $PORT"
echo ""

# Start server in background
$SERVER "$MODEL" -p "$PORT" -j 4 -k uniform_4b &
SERVER_PID=$!

# Wait for server to start
echo "Waiting for server to start..."
for i in $(seq 1 30); do
    if curl -s "http://localhost:$PORT/health" > /dev/null 2>&1; then
        echo "Server is ready!"
        break
    fi
    sleep 0.5
done

echo ""
echo "=== Test 1: Health check ==="
curl -s "http://localhost:$PORT/health" | python3 -m json.tool 2>/dev/null || \
    curl -s "http://localhost:$PORT/health"
echo ""

echo ""
echo "=== Test 2: List models ==="
curl -s "http://localhost:$PORT/v1/models" | python3 -m json.tool 2>/dev/null || \
    curl -s "http://localhost:$PORT/v1/models"
echo ""

echo ""
echo "=== Test 3: Chat completion (non-streaming) ==="
curl -s "http://localhost:$PORT/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{
        "model": "default",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is 2+2? Answer in one word."}
        ],
        "max_tokens": 32,
        "temperature": 0.1
    }' | python3 -m json.tool 2>/dev/null || echo "(raw output above)"
echo ""

echo ""
echo "=== Test 4: Chat completion (streaming) ==="
curl -s -N "http://localhost:$PORT/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{
        "model": "default",
        "messages": [
            {"role": "user", "content": "Say hello in 3 words."}
        ],
        "max_tokens": 32,
        "temperature": 0.7,
        "stream": true
    }'
echo ""

echo ""
echo "=== Test 5: Chat with KV compression options ==="
curl -s "http://localhost:$PORT/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{
        "model": "default",
        "messages": [
            {"role": "user", "content": "Hello!"}
        ],
        "max_tokens": 16,
        "kv_type": "turbo_kv_3b",
        "value_quant_bits": 4,
        "delta_kv": true
    }' | python3 -m json.tool 2>/dev/null || echo "(raw output above)"
echo ""

echo ""
echo "=== Test 6: OpenAI Python SDK compatibility ==="
cat <<'PYTHON'
# You can also use the official OpenAI Python SDK:
#
#   pip install openai
#
#   from openai import OpenAI
#   client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")
#
#   response = client.chat.completions.create(
#       model="default",
#       messages=[{"role": "user", "content": "Hello!"}],
#       max_tokens=64,
#       stream=True,
#   )
#   for chunk in response:
#       if chunk.choices[0].delta.content:
#           print(chunk.choices[0].delta.content, end="", flush=True)
PYTHON

# Cleanup
echo ""
echo "Stopping server (PID $SERVER_PID)..."
kill "$SERVER_PID" 2>/dev/null || true
wait "$SERVER_PID" 2>/dev/null || true
echo "Done."
