# Docker Usage Guide

quant.cpp ships as a minimal Docker image (~10MB) built on Alpine Linux.
The binary is statically linked with zero runtime dependencies.

## Quick Start

### Build the image

```bash
docker build -t quant.cpp .
```

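The repository's `Dockerfile` is not reproduced here; a multi-stage build along these lines would yield the static ~10MB image described above. The base image tag, build command, and binary path are assumptions, not the project's actual file:

```dockerfile
# Sketch only: a build stage compiles a static binary, the final stage
# ships it on bare Alpine. Check the repository's actual Dockerfile.
FROM alpine:3.19 AS build
RUN apk add --no-cache build-base
COPY . /src
WORKDIR /src
RUN make            # assumed build command producing ./quant

FROM alpine:3.19
COPY --from=build /src/quant /usr/local/bin/quant
ENTRYPOINT ["/usr/local/bin/quant"]
```
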
### Run inference

Mount a directory containing your GGUF model file and pass CLI arguments:

```bash
docker run -v ./models:/models quant.cpp /models/model.gguf -p "hello" -k uniform_4b -v q4
```

Note: relative host paths like `./models` require a recent Docker CLI; on older versions, use an absolute path, e.g. `-v "$(pwd)/models":/models`.

### Full example with all options

```bash
docker run -v ./models:/models quant.cpp \
  /models/model.gguf \
  -p "Once upon a time" \
  -n 512 \
  -k turbo_3b \
  -v q4 \
  -j 4 \
  -T 0.8
```

### Print model info

```bash
docker run -v ./models:/models quant.cpp /models/model.gguf --info
```

### Compute perplexity

```bash
docker run -v ./models:/models -v ./data:/data quant.cpp \
  /models/model.gguf --ppl /data/wikitext.txt -k polar_3b -v q4
```

## Docker Compose

The included `docker-compose.yml` provides a preconfigured inference service:

```bash
# Place your model at ./models/model.gguf, then:
docker compose up

# Override the prompt:
docker compose run inference /models/model.gguf -p "Your prompt here" -k turbo_3b -v q4
```

Edit `docker-compose.yml` to change the default model path, KV compression type,
or thread count.

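For reference, a minimal compose file matching the commands above might look like the sketch below. The service name `inference` comes from the commands in this guide; the remaining fields are assumptions, so check the repository's actual file:

```yaml
# Sketch of a possible docker-compose.yml, not the shipped one.
services:
  inference:
    build: .
    volumes:
      - ./models:/models
    command:
      - /models/model.gguf
      - -p
      - "Once upon a time"
      - -k
      - turbo_3b
      - -v
      - q4
```
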
## KV Compression Options

| Flag | Values | Description |
|------|--------|-------------|
| `-k` | `fp32`, `uniform_4b`, `uniform_2b`, `polar_3b`, `polar_4b`, `turbo_3b`, `turbo_4b` | Key cache quantization |
| `-v` | `fp16`, `q4`, `q2` | Value cache quantization |
| `-j` | integer | Thread count for matmul |

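To compare modes, it can help to sweep `-k` while holding `-v` fixed. The hypothetical helper below only prints the `docker run` command for each key-cache mode (a dry run), reusing the perplexity invocation from above:

```shell
# Hypothetical dry-run helper: print one perplexity command per key-cache mode.
sweep_cmds() {
  for k in fp32 uniform_4b uniform_2b polar_3b polar_4b turbo_3b turbo_4b; do
    printf 'docker run --rm -v "%s/models":/models -v "%s/data":/data quant.cpp /models/model.gguf --ppl /data/wikitext.txt -k %s -v q4\n' \
      "$PWD" "$PWD" "$k"
  done
}
sweep_cmds
```

Pipe the output to `sh` to actually execute the sweep.
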
## Volume Mounts

Models are not baked into the image. Mount them at runtime:

- `/models`: the default mount point for GGUF model files
- Mount additional directories as needed (e.g., `/data` for perplexity evaluation)

## Image Size

The final image is approximately 10MB:
- Alpine base: ~7MB
- quant binary: ~500KB (statically linked, zero dependencies)