Skip to content

Commit 10c49ff

Browse files
unamedkr and claude authored
feat(wasm): model selector (Qwen3 0.6B / Llama 3.2 1B) + real-time streaming (#20)
Replace the single SmolLM2-135M demo button with a two-card model selector:

- Qwen3 0.6B Q4_K_M (~378 MB) — recommended default. Much better quality than 135M, multilingual, reasonable download size.
- Llama 3.2 1B Q4_K_M (~770 MB) — "higher quality" option for users willing to wait.

Each model has its own chat template (ChatML for Qwen, Llama 3 format for Llama) and IndexedDB cache key, so switching models doesn't evict the other from cache.

Real-time streaming:

- Add wasm_generate_async() in quant_wasm.c which calls emscripten_sleep(0) after each token, yielding control back to the browser event loop for DOM repaint.
- Build with -sASYNCIFY + ASYNCIFY_IMPORTS=["emscripten_sleep"].
- JS generate() now awaits _wasm_generate_async() with fallback to sync _wasm_generate() for non-ASYNCIFY builds.
- Live tok/s counter updates during generation.

Also adds Qwen3-0.6B to the Python model registry.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent dc89ab1 commit 10c49ff

4 files changed

Lines changed: 230 additions & 70 deletions

File tree

bindings/python/quantcpp/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,11 @@
5353
"smollm2-135m-instruct-q8_0.gguf",
5454
135,
5555
),
56+
"Qwen3-0.6B": (
57+
"unsloth/Qwen3-0.6B-GGUF",
58+
"Qwen3-0.6B-Q4_K_M.gguf",
59+
378,
60+
),
5661
"Llama-3.2-1B": (
5762
"hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",
5863
"llama-3.2-1b-instruct-q4_k_m.gguf",

wasm/build.sh

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,14 +32,17 @@ emcc "$SCRIPT_DIR/quant_wasm.c" \
3232
-s ALLOW_MEMORY_GROWTH=1 \
3333
-s MAXIMUM_MEMORY=4GB \
3434
-s INITIAL_MEMORY=256MB \
35-
-s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \
35+
-s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_generate_async","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \
3636
-s EXPORTED_RUNTIME_METHODS='["UTF8ToString","allocateUTF8","FS"]' \
3737
-s FORCE_FILESYSTEM=1 \
3838
-s MODULARIZE=0 \
3939
-s ENVIRONMENT=web \
4040
-s NO_EXIT_RUNTIME=1 \
4141
-s ASSERTIONS=0 \
4242
-s STACK_SIZE=1MB \
43+
-s ASYNCIFY \
44+
-s 'ASYNCIFY_IMPORTS=["emscripten_sleep"]' \
45+
-s ASYNCIFY_STACK_SIZE=65536 \
4346
-lm \
4447
-DNDEBUG \
4548
-D__EMSCRIPTEN__ \

wasm/index.html

Lines changed: 150 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -43,14 +43,33 @@
4343
.dropzone.loaded { border-color: #2a5a3a; background: #0d1a14; padding: 16px; }
4444
.dropzone.loaded h2 { font-size: 14px; color: #6ee7b7; }
4545

46+
/* Model selector */
47+
.model-cards {
48+
display: flex; gap: 12px; margin-bottom: 16px; justify-content: center; flex-wrap: wrap;
49+
}
50+
.model-card {
51+
padding: 14px 20px; border: 1px solid #333; border-radius: 10px;
52+
cursor: pointer; transition: all 0.2s; text-align: left; min-width: 220px;
53+
background: #111;
54+
}
55+
.model-card:hover { border-color: #6ee7b7; background: #0d1f17; }
56+
.model-card.recommended { border-color: #059669; }
57+
.model-card .name { font-weight: 600; font-size: 14px; margin-bottom: 4px; }
58+
.model-card .meta { font-size: 12px; color: #888; }
59+
.model-card .tag {
60+
display: inline-block; font-size: 10px; padding: 1px 6px; border-radius: 6px;
61+
background: #1a3a2a; color: #6ee7b7; margin-top: 6px;
62+
}
63+
.model-card .tag.blue { background: #1a2a3a; color: #7bb8f0; }
64+
4665
/* Chat */
4766
.chat { flex: 1; overflow-y: auto; margin-bottom: 16px; }
48-
.message { padding: 12px 16px; margin-bottom: 8px; border-radius: 8px; font-size: 14px; line-height: 1.6; }
67+
.message { padding: 12px 16px; margin-bottom: 8px; border-radius: 8px; font-size: 14px; line-height: 1.6; white-space: pre-wrap; word-wrap: break-word; }
4968
.message.user { background: #1a1a2e; border: 1px solid #2a2a4e; }
5069
.message.assistant { background: #111; border: 1px solid #222; }
5170
.message.assistant .cursor { animation: blink 1s step-end infinite; }
5271
@keyframes blink { 50% { opacity: 0; } }
53-
.message.system { color: #666; font-size: 12px; text-align: center; }
72+
.message.system { color: #666; font-size: 12px; text-align: center; white-space: normal; }
5473
.message code { background: #1a1a1a; padding: 1px 4px; border-radius: 3px; font-size: 13px; }
5574
.message pre { background: #1a1a1a; padding: 12px; border-radius: 6px; overflow-x: auto; margin: 8px 0; }
5675
.message pre code { background: none; padding: 0; }
@@ -92,21 +111,32 @@
92111
<div class="header">
93112
<h1>quant<span>.cpp</span></h1>
94113
<span class="badge">WASM</span>
95-
<span class="badge" id="kvBadge" style="display:none">7x Context</span>
114+
<span class="badge" id="kvBadge" style="display:none">3x Context</span>
96115
<a class="github" href="https://github.com/quantumaikr/quant.cpp" target="_blank">GitHub ↗</a>
97116
</div>
98117

99118
<div class="main">
100119
<div class="dropzone" id="dropzone">
101-
<h2>LLM in Your Browser — 189 KB</h2>
120+
<h2>LLM in Your Browser</h2>
102121
<p style="margin-bottom:16px; color:#6ee7b7; font-size:15px">No install. No API key. No server. Just click.</p>
103-
<button id="demoBtn" onclick="loadDemoModel()" style="
104-
padding: 12px 32px; font-size: 16px; font-weight: 600;
105-
background: #059669; color: white; border: none; border-radius: 8px;
106-
cursor: pointer; margin-bottom: 12px;
107-
">▶ Try with SmolLM2-135M (~135 MB download)</button>
122+
123+
<div class="model-cards" id="modelCards">
124+
<div class="model-card recommended" onclick="loadDemoModel('qwen3-0.6b')">
125+
<div class="name">Qwen3 0.6B</div>
126+
<div class="meta">~378 MB download &middot; Q4_K_M</div>
127+
<span class="tag">Recommended</span>
128+
<div class="meta" style="margin-top:4px">Fast, multilingual, good for demo</div>
129+
</div>
130+
<div class="model-card" onclick="loadDemoModel('llama-3.2-1b')">
131+
<div class="name">Llama 3.2 1B</div>
132+
<div class="meta">~770 MB download &middot; Q4_K_M</div>
133+
<span class="tag blue">Higher quality</span>
134+
<div class="meta" style="margin-top:4px">Better reasoning, longer wait</div>
135+
</div>
136+
</div>
137+
108138
<p style="color:#555; font-size:13px">Or <a href="#" onclick="document.getElementById('fileInput').click(); return false" style="color:#6ee7b7">drop your own GGUF</a> file.</p>
109-
<p style="margin-top:8px; color:#333; font-size:12px">Runs entirely in your browser. Nothing uploaded to any server.</p>
139+
<p style="margin-top:8px; color:#333; font-size:12px">Runs entirely in your browser. Nothing uploaded.</p>
110140
<input type="file" id="fileInput" accept=".gguf" style="display:none">
111141
</div>
112142

@@ -135,15 +165,36 @@ <h2>LLM in Your Browser — 189 KB</h2>
135165
let modelLoaded = false;
136166
let generating = false;
137167

168+
// ---- Model registry ----
169+
const MODELS = {
170+
'qwen3-0.6b': {
171+
url: 'https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf',
172+
name: 'Qwen3-0.6B Q4_K_M',
173+
size: '~378 MB',
174+
cacheKey: 'qwen3-0.6b-q4km',
175+
chatTemplate: (text) => `<|im_start|>user\n${text}<|im_end|>\n<|im_start|>assistant\n`,
176+
},
177+
'llama-3.2-1b': {
178+
url: 'https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/resolve/main/llama-3.2-1b-instruct-q4_k_m.gguf',
179+
name: 'Llama-3.2-1B-Instruct Q4_K_M',
180+
size: '~770 MB',
181+
cacheKey: 'llama-3.2-1b-q4km',
182+
chatTemplate: (text) => `<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n${text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`,
183+
},
184+
};
185+
let activeModelId = null;
186+
138187
// ---- IndexedDB model cache ----
139188
const DB_NAME = 'quantcpp_cache';
140189
const DB_STORE = 'models';
141-
const DEMO_KEY = 'smollm2-135m';
142190

143191
function openDB() {
144192
return new Promise((resolve, reject) => {
145-
const req = indexedDB.open(DB_NAME, 1);
146-
req.onupgradeneeded = () => req.result.createObjectStore(DB_STORE);
193+
const req = indexedDB.open(DB_NAME, 2);
194+
req.onupgradeneeded = () => {
195+
if (!req.result.objectStoreNames.contains(DB_STORE))
196+
req.result.createObjectStore(DB_STORE);
197+
};
147198
req.onsuccess = () => resolve(req.result);
148199
req.onerror = () => reject(req.error);
149200
});
@@ -199,27 +250,28 @@ <h2>LLM in Your Browser — 189 KB</h2>
199250
}
200251

201252
// Demo model — cache-first, download only if not in IndexedDB
202-
async function loadDemoModel() {
203-
const url = 'https://huggingface.co/Felladrin/gguf-Q8_0-SmolLM2-135M-Instruct/resolve/main/smollm2-135m-instruct-q8_0.gguf';
204-
const btn = document.getElementById('demoBtn');
205-
btn.disabled = true;
253+
async function loadDemoModel(modelId) {
254+
const model = MODELS[modelId];
255+
if (!model) return;
256+
257+
activeModelId = modelId;
258+
const cards = document.querySelectorAll('.model-card');
259+
cards.forEach(c => c.style.pointerEvents = 'none');
206260

207261
try {
208262
// 1. Try cache first
209263
showLoading('Checking local cache...');
210-
const cached = await getCachedModel(DEMO_KEY);
264+
const cached = await getCachedModel(model.cacheKey);
211265
if (cached) {
212-
btn.textContent = 'Loading from cache...';
213-
showLoading('Loading cached model...');
214-
loadModelFromBytes(new Uint8Array(cached), 'smollm2-135m (cached)');
266+
showLoading(`Loading cached ${model.name}...`);
267+
loadModelFromBytes(new Uint8Array(cached), `${model.name} (cached)`);
215268
return;
216269
}
217270

218271
// 2. Download from HuggingFace
219-
btn.textContent = 'Downloading...';
220-
showLoading('Downloading SmolLM2-135M (~135 MB)...');
272+
showLoading(`Downloading ${model.name} (${model.size})...`);
221273

222-
const response = await fetch(url);
274+
const response = await fetch(model.url);
223275
if (!response.ok) throw new Error(`HTTP ${response.status}`);
224276

225277
const total = parseInt(response.headers.get('content-length') || '0');
@@ -237,7 +289,7 @@ <h2>LLM in Your Browser — 189 KB</h2>
237289
const mb = (received / 1048576).toFixed(0);
238290
const totalMb = (total / 1048576).toFixed(0);
239291
document.getElementById('loadingText').textContent =
240-
`Downloading... ${pct}% (${mb}/${totalMb} MB)`;
292+
`Downloading ${model.name}... ${pct}% (${mb}/${totalMb} MB)`;
241293
}
242294
}
243295

@@ -247,26 +299,33 @@ <h2>LLM in Your Browser — 189 KB</h2>
247299

248300
// 3. Cache for next time
249301
showLoading('Caching model for instant reload...');
250-
await cacheModel(DEMO_KEY, arrayBuffer).catch(() => {});
302+
await cacheModel(model.cacheKey, arrayBuffer).catch(() => {});
251303

252304
showLoading('Loading model into WASM...');
253-
loadModelFromBytes(data, 'smollm2-135m-instruct-q8_0.gguf');
305+
loadModelFromBytes(data, model.name);
254306
} catch (err) {
255307
hideLoading();
256-
btn.disabled = false;
257-
btn.textContent = '▶ Try with SmolLM2-135M (~135 MB download)';
308+
cards.forEach(c => c.style.pointerEvents = '');
309+
activeModelId = null;
258310
alert('Download failed: ' + err.message + '\n\nTry dropping a local GGUF file instead.');
259311
}
260312
}
261313

262-
// Auto-load cached model on page load
314+
// Auto-detect cached models on page load and show badges
263315
window.addEventListener('load', async () => {
264316
try {
265-
const cached = await getCachedModel(DEMO_KEY);
266-
if (cached) {
267-
const btn = document.getElementById('demoBtn');
268-
btn.textContent = '▶ Load cached SmolLM2-135M (instant)';
269-
btn.style.background = '#047857';
317+
for (const [id, model] of Object.entries(MODELS)) {
318+
const cached = await getCachedModel(model.cacheKey);
319+
if (cached) {
320+
const cards = document.querySelectorAll('.model-card');
321+
cards.forEach(card => {
322+
if (card.querySelector('.name').textContent.toLowerCase().includes(id.split('-')[0])) {
323+
const meta = card.querySelector('.meta');
324+
meta.textContent = 'Cached — instant load';
325+
meta.style.color = '#6ee7b7';
326+
}
327+
});
328+
}
270329
}
271330
} catch(e) {}
272331
});
@@ -275,7 +334,11 @@ <h2>LLM in Your Browser — 189 KB</h2>
275334
const chat = document.getElementById('chat');
276335
const div = document.createElement('div');
277336
div.className = `message ${role}`;
278-
div.innerHTML = formatText(text);
337+
if (role === 'assistant') {
338+
div.textContent = '';
339+
} else {
340+
div.innerHTML = formatText(text);
341+
}
279342
chat.appendChild(div);
280343
chat.scrollTop = chat.scrollHeight;
281344
return div;
@@ -290,7 +353,6 @@ <h2>LLM in Your Browser — 189 KB</h2>
290353
}
291354

292355
function loadModelFromBytes(bytes, name) {
293-
// Shared model loading from Uint8Array (used by both file drop and demo download)
294356
try {
295357
Module.FS.writeFile('/model.gguf', bytes);
296358
showLoading('Initializing model...');
@@ -318,6 +380,7 @@ <h2>LLM in Your Browser — 189 KB</h2>
318380
async function loadModel(file) {
319381
showLoading(`Loading ${file.name} (${(file.size/1024/1024).toFixed(0)} MB)...`);
320382
addMessage('system', `Loading ${file.name}...`);
383+
activeModelId = null; // custom model — use generic template
321384
try {
322385
const buffer = await file.arrayBuffer();
323386
const bytes = new Uint8Array(buffer);
@@ -328,6 +391,14 @@ <h2>LLM in Your Browser — 189 KB</h2>
328391
hideLoading();
329392
}
330393

394+
function getChatPrompt(text) {
395+
if (activeModelId && MODELS[activeModelId]) {
396+
return MODELS[activeModelId].chatTemplate(text);
397+
}
398+
// Generic ChatML fallback for custom GGUF
399+
return `<|im_start|>user\n${text}<|im_end|>\n<|im_start|>assistant\n`;
400+
}
401+
331402
async function generate() {
332403
if (!modelLoaded || generating) return;
333404
const input = document.getElementById('prompt');
@@ -337,45 +408,65 @@ <h2>LLM in Your Browser — 189 KB</h2>
337408
input.value = '';
338409
generating = true;
339410
document.getElementById('sendBtn').disabled = true;
411+
input.disabled = true;
340412

341413
addMessage('user', text);
342-
const assistantDiv = addMessage('assistant', '<span class="cursor">▌</span>');
414+
const assistantDiv = addMessage('assistant', '');
343415
let output = '';
416+
let tokenCount = 0;
417+
const startTime = performance.now();
344418

345-
// Set callbacks
419+
// Set streaming token callback
346420
Module.onToken = (token) => {
347421
output += token;
348-
assistantDiv.innerHTML = formatText(output) + '<span class="cursor">▌</span>';
349-
document.getElementById('chat').scrollTop = document.getElementById('chat').scrollHeight;
422+
tokenCount++;
423+
// Update the assistant message with raw text + blinking cursor
424+
assistantDiv.textContent = output;
425+
const cursor = document.createElement('span');
426+
cursor.className = 'cursor';
427+
cursor.textContent = '▌';
428+
assistantDiv.appendChild(cursor);
429+
// Auto-scroll
430+
const chat = document.getElementById('chat');
431+
chat.scrollTop = chat.scrollHeight;
432+
// Live stats
433+
const elapsed = (performance.now() - startTime) / 1000;
434+
if (elapsed > 0.1) {
435+
document.getElementById('statTokens').textContent = `${tokenCount} tokens`;
436+
document.getElementById('statSpeed').textContent = `${(tokenCount / elapsed).toFixed(1)} tok/s`;
437+
}
350438
};
439+
351440
Module.onDone = (nTokens, elapsedMs) => {
441+
// Final render with markdown formatting
352442
assistantDiv.innerHTML = formatText(output);
353-
const tps = (nTokens / (elapsedMs / 1000)).toFixed(1);
443+
const tps = nTokens > 0 ? (nTokens / (elapsedMs / 1000)).toFixed(1) : '0';
354444
document.getElementById('statTokens').textContent = `${nTokens} tokens`;
355445
document.getElementById('statSpeed').textContent = `${tps} tok/s`;
356446
generating = false;
357447
document.getElementById('sendBtn').disabled = false;
358-
document.getElementById('prompt').focus();
359-
};
360-
Module.onStatus = (msg) => {
361-
addMessage('system', msg);
448+
input.disabled = false;
449+
input.focus();
362450
};
363451

364-
// Wrap with ChatML template (instruct models need this to generate)
365-
const chatPrompt = `<|im_start|>user\n${text}<|im_end|>\n<|im_start|>assistant\n`;
452+
const chatPrompt = getChatPrompt(text);
366453

367-
// Run generation asynchronously so the UI doesn't freeze
368-
setTimeout(() => {
369-
const promptPtr = Module.allocateUTF8(chatPrompt);
454+
// Use ASYNCIFY: _wasm_generate_async yields to browser between tokens
455+
const promptPtr = Module.allocateUTF8(chatPrompt);
456+
try {
457+
await Module._wasm_generate_async(promptPtr, 0.7, 256);
458+
} catch(e) {
459+
// Fallback for non-ASYNCIFY builds
370460
Module._wasm_generate(promptPtr, 0.7, 256);
371-
Module._free(promptPtr);
461+
}
462+
Module._free(promptPtr);
372463

373-
if (!output) {
374-
assistantDiv.innerHTML = '<em style="color:#666">No output generated. Try a longer prompt.</em>';
375-
}
376-
generating = false;
377-
document.getElementById('sendBtn').disabled = false;
378-
}, 50); // yield to browser for one frame to show the spinner
464+
if (!output) {
465+
assistantDiv.innerHTML = '<em style="color:#666">No output generated. Try a longer prompt.</em>';
466+
}
467+
generating = false;
468+
document.getElementById('sendBtn').disabled = false;
469+
input.disabled = false;
379470
}
380471
</script>
381472

@@ -389,7 +480,7 @@ <h2>LLM in Your Browser — 189 KB</h2>
389480
printErr: function(text) { console.warn(text); },
390481
onRuntimeInitialized: function() {
391482
console.log('quant.cpp WASM ready');
392-
addMessage('system', 'Runtime ready. Drop a GGUF model file to begin.');
483+
addMessage('system', 'Runtime ready. Choose a model or drop your own GGUF file.');
393484
}
394485
};
395486
</script>

0 commit comments

Comments
 (0)