Skip to content

Commit 454f664

Browse files
unamedkrclaude
andauthored
perf(wasm): enable pthreads multi-threading for 3-4x speedup (#27)
Enable WASM pthreads so inference uses multiple CPU cores in the browser. Three changes: 1. coi-serviceworker.js: injects Cross-Origin-Opener-Policy and Cross-Origin-Embedder-Policy headers into all responses via Service Worker. This enables SharedArrayBuffer on GitHub Pages and other static hosts that don't support custom HTTP headers. Well-established pattern (used by FFmpeg.wasm, SQL.js, etc.). 2. build.sh: add -pthread, PTHREAD_POOL_SIZE=4, ENVIRONMENT=web,worker. WASM binary now includes multi-threaded libc and pthread support. 3. quant_wasm.c: detect navigator.hardwareConcurrency (capped at 4) and pass to quant_config.n_threads. Model load message shows thread count ("Model loaded! Ready to chat. (4 threads)"). Expected speedup: 3-4x on multi-core devices (most modern laptops). Combined with SIMD128 from PR #25: total 6-12x vs original build. Binary: 320K → 384K (pthread runtime overhead). Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 163affe commit 454f664

6 files changed

Lines changed: 97 additions & 12 deletions

File tree

wasm/build.sh

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,20 @@
11
#!/bin/bash
2-
# Build quant.cpp WASM demo
2+
# Build quant.cpp WASM demo (multi-threaded + SIMD)
33
# Requires: Emscripten SDK (emcc)
44
#
55
# Usage: cd wasm && bash build.sh
66
# Then: python3 -m http.server 8080
77
# Open: http://localhost:8080
8+
#
9+
# Multi-threading requires Cross-Origin-Isolation headers.
10+
# coi-serviceworker.js injects them on GitHub Pages / static hosts.
811

912
set -e
1013

1114
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
1215
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
1316

14-
echo "=== Building quant.cpp WASM ==="
17+
echo "=== Building quant.cpp WASM (pthreads + SIMD) ==="
1518

1619
# Check emcc
1720
if ! command -v emcc &>/dev/null; then
@@ -23,13 +26,14 @@ fi
2326

2427
echo "emcc version: $(emcc --version | head -1)"
2528

26-
# Build
29+
# Build with pthreads + SIMD128 + ASYNCIFY
2730
emcc "$SCRIPT_DIR/quant_wasm.c" \
2831
-I"$PROJECT_DIR" \
2932
-o "$SCRIPT_DIR/quant.js" \
3033
-O3 \
3134
-msimd128 \
3235
-flto \
36+
-pthread \
3337
-s WASM=1 \
3438
-s ALLOW_MEMORY_GROWTH=1 \
3539
-s MAXIMUM_MEMORY=4GB \
@@ -38,13 +42,15 @@ emcc "$SCRIPT_DIR/quant_wasm.c" \
3842
-s EXPORTED_RUNTIME_METHODS='["UTF8ToString","allocateUTF8","FS"]' \
3943
-s FORCE_FILESYSTEM=1 \
4044
-s MODULARIZE=0 \
41-
-s ENVIRONMENT=web \
45+
-s ENVIRONMENT='web,worker' \
4246
-s NO_EXIT_RUNTIME=1 \
4347
-s ASSERTIONS=0 \
4448
-s STACK_SIZE=1MB \
4549
-s ASYNCIFY \
4650
-s 'ASYNCIFY_IMPORTS=["emscripten_sleep"]' \
4751
-s ASYNCIFY_STACK_SIZE=65536 \
52+
-s PTHREAD_POOL_SIZE=4 \
53+
-s PTHREAD_POOL_SIZE_STRICT=0 \
4854
-lm \
4955
-DNDEBUG \
5056
-D__EMSCRIPTEN__ \
@@ -53,11 +59,14 @@ emcc "$SCRIPT_DIR/quant_wasm.c" \
5359

5460
echo ""
5561
echo "=== Build complete ==="
56-
echo "Files: quant.js ($(du -h "$SCRIPT_DIR/quant.js" | cut -f1)), quant.wasm ($(du -h "$SCRIPT_DIR/quant.wasm" | cut -f1))"
62+
echo "Files:"
63+
for f in quant.js quant.wasm quant.worker.js; do
64+
[ -f "$SCRIPT_DIR/$f" ] && echo " $f ($(du -h "$SCRIPT_DIR/$f" | cut -f1))"
65+
done
5766
echo ""
5867
echo "To serve locally:"
5968
echo " cd $SCRIPT_DIR && python3 -m http.server 8080"
6069
echo " Open http://localhost:8080"
6170
echo ""
62-
echo "For HTTPS (required for SharedArrayBuffer):"
63-
echo " npx serve -s $SCRIPT_DIR --ssl-cert cert.pem --ssl-key key.pem"
71+
echo "Note: Multi-threading requires Cross-Origin-Isolation."
72+
echo "coi-serviceworker.js handles this automatically on GitHub Pages."

wasm/coi-serviceworker.js

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
/*! coi-serviceworker v0.1.7 - Guido Zuidhof, licensed under MIT */
2+
/*
3+
* Service Worker that injects Cross-Origin-Opener-Policy and
4+
* Cross-Origin-Embedder-Policy headers into all responses.
5+
* This enables SharedArrayBuffer on hosts that don't support
6+
* custom HTTP headers (e.g., GitHub Pages).
7+
*
8+
* Required for WASM pthreads (multi-threaded inference).
9+
*/
10+
if (typeof window === 'undefined') {
11+
// Service Worker scope
12+
self.addEventListener("install", () => self.skipWaiting());
13+
self.addEventListener("activate", (e) => e.waitUntil(self.clients.claim()));
14+
15+
self.addEventListener("fetch", (e) => {
16+
// Only intercept same-origin or navigation requests
17+
if (
18+
e.request.cache === "only-if-cached" &&
19+
e.request.mode !== "same-origin"
20+
) {
21+
return;
22+
}
23+
24+
e.respondWith(
25+
fetch(e.request).then((response) => {
26+
// Can't modify opaque responses
27+
if (response.status === 0) return response;
28+
29+
const newHeaders = new Headers(response.headers);
30+
newHeaders.set("Cross-Origin-Embedder-Policy", "credentialless");
31+
newHeaders.set("Cross-Origin-Opener-Policy", "same-origin");
32+
33+
return new Response(response.body, {
34+
status: response.status,
35+
statusText: response.statusText,
36+
headers: newHeaders,
37+
});
38+
}).catch((err) => {
39+
console.error("coi-serviceworker fetch error:", err);
40+
return new Response("Service Worker fetch error", { status: 500 });
41+
})
42+
);
43+
});
44+
} else {
45+
// Window scope — register the service worker
46+
(async () => {
47+
if (!window.crossOriginIsolated) {
48+
const reg = await navigator.serviceWorker.register(
49+
window.document.currentScript.src
50+
);
51+
if (reg.active && !navigator.serviceWorker.controller) {
52+
// Service worker installed but not controlling — reload to activate
53+
window.location.reload();
54+
} else if (!reg.active) {
55+
// Wait for the service worker to activate, then reload
56+
const sw = reg.installing || reg.waiting;
57+
sw.addEventListener("statechange", () => {
58+
if (sw.state === "activated") window.location.reload();
59+
});
60+
}
61+
}
62+
})();
63+
}

wasm/index.html

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
<meta http-equiv="Cross-Origin-Opener-Policy" content="same-origin">
88
<meta http-equiv="Cross-Origin-Embedder-Policy" content="require-corp">
99
<title>quant.cpp — LLM in Your Browser</title>
10+
<!-- Service Worker for COOP/COEP headers — enables SharedArrayBuffer + pthreads on GitHub Pages -->
11+
<script src="coi-serviceworker.js"></script>
1012
<style>
1113
* { margin: 0; padding: 0; box-sizing: border-box; }
1214
body {

wasm/quant.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

wasm/quant.wasm

42.9 KB
Binary file not shown.

wasm/quant_wasm.c

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,12 @@ static quant_ctx* g_ctx = NULL;
2323
static char g_output[65536];
2424
static int g_output_pos = 0;
2525
static int g_generating = 0;
26+
static int g_wasm_threads = 1;
27+
28+
/* Query thread count from JS navigator.hardwareConcurrency */
29+
EM_JS(int, js_get_hw_concurrency, (void), {
30+
return Math.min(navigator.hardwareConcurrency || 1, 4);
31+
});
2632

2733
/* JS callback: called for each generated token */
2834
EM_JS(void, js_on_token, (const char* text), {
@@ -86,11 +92,13 @@ int wasm_load_model(const char* path) {
8692
return -1;
8793
}
8894

95+
g_wasm_threads = js_get_hw_concurrency();
96+
8997
quant_config cfg = {
9098
.temperature = 0.7f,
9199
.top_p = 0.9f,
92100
.max_tokens = 512,
93-
.n_threads = 1, /* WASM: single thread for compatibility */
101+
.n_threads = g_wasm_threads,
94102
.kv_compress = 1, /* 4-bit KV compression */
95103
};
96104
g_ctx = quant_new(g_model, &cfg);
@@ -99,7 +107,10 @@ int wasm_load_model(const char* path) {
99107
return -1;
100108
}
101109

102-
js_on_status("Model loaded! Ready to chat.");
110+
char status_msg[128];
111+
snprintf(status_msg, sizeof(status_msg),
112+
"Model loaded! Ready to chat. (%d threads)", g_wasm_threads);
113+
js_on_status(status_msg);
103114
return 0;
104115
}
105116

@@ -124,7 +135,7 @@ int wasm_generate_async(const char* prompt, float temperature, int max_tokens) {
124135
.temperature = temperature,
125136
.top_p = 0.9f,
126137
.max_tokens = max_tokens > 0 ? max_tokens : 256,
127-
.n_threads = 1,
138+
.n_threads = g_wasm_threads,
128139
.kv_compress = 1,
129140
};
130141

@@ -170,7 +181,7 @@ int wasm_generate(const char* prompt, float temperature, int max_tokens) {
170181
.temperature = temperature,
171182
.top_p = 0.9f,
172183
.max_tokens = max_tokens > 0 ? max_tokens : 256,
173-
.n_threads = 1,
184+
.n_threads = g_wasm_threads,
174185
.kv_compress = 1,
175186
};
176187

0 commit comments

Comments
 (0)