#!/usr/bin/env bash
# server_example.sh -- Start and test the quant.cpp OpenAI-compatible server
#
# Usage:
#   ./examples/server_example.sh <model.gguf> [port]
#
# Prerequisites:
#   cmake -B build -DCMAKE_BUILD_TYPE=Release -DTQ_BUILD_SERVER=ON
#   cmake --build build -j$(nproc)

set -euo pipefail

MODEL="${1:-}"
PORT="${2:-8080}"

if [ -z "$MODEL" ]; then
    echo "Usage: $0 <model.gguf> [port]"
    echo ""
    echo "Example:"
    echo "  $0 Qwen2.5-0.5B-Instruct.gguf 8080"
    exit 1
fi

SERVER="./build/quant-server"
if [ ! -f "$SERVER" ]; then
    echo "Error: quant-server not found. Build with:"
    echo "  cmake -B build -DCMAKE_BUILD_TYPE=Release -DTQ_BUILD_SERVER=ON"
    echo "  cmake --build build -j\$(nproc)"
    exit 1
fi

echo "=== Starting quant.cpp server ==="
echo "Model: $MODEL"
echo "Port: $PORT"
echo ""

# Start server in the background
"$SERVER" "$MODEL" -p "$PORT" -j 4 -k uniform_4b &
SERVER_PID=$!
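# Stop the background server even if a later test fails under `set -e`
# (the explicit cleanup at the end of the script still runs on success).
trap 'kill "$SERVER_PID" 2>/dev/null || true' EXIT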

# Wait for the server to start
echo "Waiting for server to start..."
for i in $(seq 1 30); do
    if curl -s "http://localhost:$PORT/health" > /dev/null 2>&1; then
        echo "Server is ready!"
        break
    fi
    sleep 0.5
done
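# Abort if the server never became healthy within the ~15 s loop above;
# the EXIT trap set earlier stops the background process on the way out.
if ! curl -s "http://localhost:$PORT/health" > /dev/null 2>&1; then
    echo "Error: server did not become ready on port $PORT" >&2
    exit 1
fi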

echo ""
echo "=== Test 1: Health check ==="
curl -s "http://localhost:$PORT/health" | python3 -m json.tool 2>/dev/null || \
    curl -s "http://localhost:$PORT/health"
echo ""

echo ""
echo "=== Test 2: List models ==="
curl -s "http://localhost:$PORT/v1/models" | python3 -m json.tool 2>/dev/null || \
    curl -s "http://localhost:$PORT/v1/models"
echo ""

echo ""
echo "=== Test 3: Chat completion (non-streaming) ==="
# Capture the body so the raw response can actually be shown if pretty-printing fails.
RESPONSE=$(curl -s "http://localhost:$PORT/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{
        "model": "default",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is 2+2? Answer in one word."}
        ],
        "max_tokens": 32,
        "temperature": 0.1
    }')
echo "$RESPONSE" | python3 -m json.tool 2>/dev/null || echo "$RESPONSE"
echo ""
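# Hedged sketch: pull just the assistant text out of the $RESPONSE captured
# above, assuming the standard OpenAI response shape
# (choices[0].message.content) that an OpenAI-compatible server returns.
echo "$RESPONSE" | python3 -c \
    'import json, sys; print(json.load(sys.stdin)["choices"][0]["message"]["content"])' \
    2>/dev/null || echo "(could not parse assistant text)"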

echo ""
echo "=== Test 4: Chat completion (streaming) ==="
curl -s -N "http://localhost:$PORT/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{
        "model": "default",
        "messages": [
            {"role": "user", "content": "Say hello in 3 words."}
        ],
        "max_tokens": 32,
        "temperature": 0.7,
        "stream": true
    }'
echo ""
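# Hedged sketch: the dump above is the raw server-sent event stream. To print
# only the generated text, parse each `data: {...}` line and read
# choices[0].delta.content -- this assumes OpenAI-style SSE framing
# terminated by `data: [DONE]`.
curl -s -N "http://localhost:$PORT/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{"model": "default", "messages": [{"role": "user", "content": "Say hello in 3 words."}], "max_tokens": 32, "stream": true}' \
    | while IFS= read -r line; do
        case "$line" in
            "data: [DONE]") break ;;
            "data: "*)
                printf '%s' "${line#data: }" | python3 -c \
                    'import json, sys; sys.stdout.write(json.load(sys.stdin)["choices"][0]["delta"].get("content") or "")' \
                    2>/dev/null || true
                ;;
        esac
    done || true
echo ""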

echo ""
echo "=== Test 5: Chat with KV compression options ==="
RESPONSE=$(curl -s "http://localhost:$PORT/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{
        "model": "default",
        "messages": [
            {"role": "user", "content": "Hello!"}
        ],
        "max_tokens": 16,
        "kv_type": "turbo_kv_3b",
        "value_quant_bits": 4,
        "delta_kv": true
    }')
echo "$RESPONSE" | python3 -m json.tool 2>/dev/null || echo "$RESPONSE"
echo ""

echo ""
echo "=== Test 6: OpenAI Python SDK compatibility ==="
cat <<'PYTHON'
# You can also use the official OpenAI Python SDK:
#
#   pip install openai
#
#   from openai import OpenAI
#   client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")
#
#   response = client.chat.completions.create(
#       model="default",
#       messages=[{"role": "user", "content": "Hello!"}],
#       max_tokens=64,
#       stream=True,
#   )
#   for chunk in response:
#       if chunk.choices[0].delta.content:
#           print(chunk.choices[0].delta.content, end="", flush=True)
PYTHON

# Cleanup
echo ""
echo "Stopping server (PID $SERVER_PID)..."
kill "$SERVER_PID" 2>/dev/null || true
wait "$SERVER_PID" 2>/dev/null || true
echo "Done."