Skip to content

Commit 20615ed

Browse files
unamedkr authored and claude committed
fix: multi-block dequant for head_dim > TQ_BK
The quantize path was fixed in the previous commit but dequant was missed — only the first 128 elements were being dequantized, leaving the rest as zeros. This caused PPL 5188 (vs FP32 893) on Qwen3.5. After fix: Qwen3.5 turbo_kv_4b produces coherent text at 12.4 tok/s. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 7d10b63 commit 20615ed

7 files changed

Lines changed: 162 additions & 246 deletions

File tree

src/engine/tq_transformer.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1586,7 +1586,13 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
15861586
+ (size_t)l * s->quant_kv_stride
15871587
+ (size_t)t * cache_n_kv_heads * s->quant_head_stride
15881588
+ (size_t)kv_h * s->quant_head_stride;
1589-
traits->dequantize(quant_src, dequant_buf, head_dim);
1589+
/* Multi-block dequant for head_dim > TQ_BK */
1590+
for (int blk = 0; blk < head_dim; blk += TQ_BK) {
1591+
int blen = head_dim - blk;
1592+
if (blen > TQ_BK) blen = TQ_BK;
1593+
traits->dequantize(quant_src + (blk / TQ_BK) * traits->type_size,
1594+
dequant_buf + blk, blen);
1595+
}
15901596
for (int d = 0; d < head_dim; d++) {
15911597
recon_key[d] += dequant_buf[d];
15921598
}
@@ -1703,7 +1709,13 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
17031709
+ (size_t)l * s->quant_kv_stride
17041710
+ (size_t)t * cache_n_kv_heads * s->quant_head_stride
17051711
+ (size_t)kv_h * s->quant_head_stride;
1706-
traits->dequantize(quant_src, dequant_buf, head_dim);
1712+
/* Multi-block dequant for head_dim > TQ_BK */
1713+
for (int blk = 0; blk < head_dim; blk += TQ_BK) {
1714+
int blen = head_dim - blk;
1715+
if (blen > TQ_BK) blen = TQ_BK;
1716+
traits->dequantize(quant_src + (blk / TQ_BK) * traits->type_size,
1717+
dequant_buf + blk, blen);
1718+
}
17071719
if (needs_post_norm) {
17081720
tq_rmsnorm(dequant_buf, dequant_buf, layer->k_norm,
17091721
head_dim, c->rms_norm_eps);

wasm/build.sh

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,12 @@
11
#!/bin/bash
2-
# Build quant.cpp WASM demo (multi-threaded + SIMD, no ASYNCIFY)
3-
# Requires: Emscripten SDK (emcc)
4-
#
5-
# Architecture: inference runs in a Web Worker (inference-worker.js)
6-
# so the main thread stays responsive. No ASYNCIFY needed — the worker
7-
# blocks on quant_generate() while postMessage streams tokens.
8-
2+
# Build quant.cpp WASM demo
3+
# pthreads for parallel matmul + SIMD + ASYNCIFY for UI streaming
94
set -e
105

116
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
127
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
138

14-
echo "=== Building quant.cpp WASM (pthreads + SIMD, no ASYNCIFY) ==="
9+
echo "=== Building quant.cpp WASM (pthreads + SIMD) ==="
1510

1611
if ! command -v emcc &>/dev/null; then
1712
echo "Error: emcc not found. Install Emscripten SDK."
@@ -32,14 +27,17 @@ emcc "$SCRIPT_DIR/quant_wasm.c" \
3227
-s INITIAL_MEMORY=1GB \
3328
-s MAXIMUM_MEMORY=4GB \
3429
-s ALLOW_MEMORY_GROWTH=0 \
35-
-s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \
30+
-s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_generate_async","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \
3631
-s EXPORTED_RUNTIME_METHODS='["UTF8ToString","allocateUTF8","FS"]' \
3732
-s FORCE_FILESYSTEM=1 \
3833
-s MODULARIZE=0 \
3934
-s ENVIRONMENT='web,worker' \
4035
-s NO_EXIT_RUNTIME=1 \
4136
-s ASSERTIONS=0 \
4237
-s STACK_SIZE=1MB \
38+
-s ASYNCIFY \
39+
-s 'ASYNCIFY_IMPORTS=["emscripten_sleep"]' \
40+
-s ASYNCIFY_STACK_SIZE=65536 \
4341
-s PTHREAD_POOL_SIZE=4 \
4442
-s PTHREAD_POOL_SIZE_STRICT=0 \
4543
-lm \
@@ -53,6 +51,3 @@ echo "=== Build complete ==="
5351
for f in quant.js quant.wasm; do
5452
[ -f "$SCRIPT_DIR/$f" ] && echo " $f ($(du -h "$SCRIPT_DIR/$f" | cut -f1))"
5553
done
56-
echo ""
57-
echo " inference-worker.js — Web Worker wrapper (no ASYNCIFY overhead)"
58-
echo " coi-serviceworker.js — COOP/COEP header injection for pthreads"

wasm/index.html

Lines changed: 85 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -356,20 +356,37 @@ <h2>LLM in Your Browser</h2>
356356
}
357357

358358
function loadModelFromBytes(bytes, name) {
359-
showLoading('Loading model into WASM...');
360-
// Transfer ArrayBuffer to worker (zero-copy)
361-
const buffer = bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength);
362-
worker.postMessage({ type: 'load', bytes: buffer, name: name }, [buffer]);
359+
try {
360+
Module.FS.writeFile('/model.gguf', bytes);
361+
showLoading('Initializing model...');
362+
const rc = Module._wasm_load_model(Module.allocateUTF8('/model.gguf'));
363+
if (rc === 0) {
364+
modelLoaded = true;
365+
const dropzone = document.getElementById('dropzone');
366+
dropzone.classList.add('loaded');
367+
dropzone.innerHTML = `<h2>✓ ${name} (${(bytes.length/1048576).toFixed(0)} MB)</h2>
368+
<p style="color:#6ee7b7">KV compression active — 3x longer context</p>`;
369+
document.getElementById('kvBadge').style.display = '';
370+
document.getElementById('prompt').disabled = false;
371+
document.getElementById('sendBtn').disabled = false;
372+
document.getElementById('prompt').focus();
373+
addMessage('system', `Model loaded! ${name} (${(bytes.length/1048576).toFixed(0)} MB). Ask anything.`);
374+
} else {
375+
addMessage('system', 'Failed to load model.');
376+
}
377+
} catch(e) {
378+
addMessage('system', `Error: ${e.message}`);
379+
}
380+
hideLoading();
363381
}
364382

365383
async function loadModel(file) {
366384
showLoading(`Loading ${file.name} (${(file.size/1024/1024).toFixed(0)} MB)...`);
367385
addMessage('system', `Loading ${file.name}...`);
368-
activeModelId = null; // custom model — use generic template
386+
activeModelId = null;
369387
try {
370388
const buffer = await file.arrayBuffer();
371-
const bytes = new Uint8Array(buffer);
372-
loadModelFromBytes(bytes, file.name);
389+
loadModelFromBytes(new Uint8Array(buffer), file.name);
373390
} catch(e) {
374391
addMessage('system', `Error: ${e.message}`);
375392
}
@@ -380,85 +397,11 @@ <h2>LLM in Your Browser</h2>
380397
if (activeModelId && MODELS[activeModelId]) {
381398
return MODELS[activeModelId].chatTemplate(text);
382399
}
383-
// Generic ChatML fallback for custom GGUF
384400
return `<|im_start|>user\n${text}<|im_end|>\n<|im_start|>assistant\n`;
385401
}
386402

387-
// ---- Web Worker inference engine (no ASYNCIFY overhead) ----
388-
let worker = null;
389-
let pendingAssistantDiv = null;
390-
let pendingOutput = '';
391-
let pendingTokenCount = 0;
392-
let pendingStartTime = 0;
393-
394-
function initWorker() {
395-
worker = new Worker('inference-worker.js');
396-
worker.onmessage = function(e) {
397-
const msg = e.data;
398-
399-
if (msg.type === 'ready') {
400-
addMessage('system', 'Runtime ready. Choose a model or drop your own GGUF file.');
401-
}
402-
else if (msg.type === 'status') {
403-
if (msg.msg === 'thinking' && pendingAssistantDiv) {
404-
pendingAssistantDiv.innerHTML = '<span class="thinking"><span class="spinner" style="display:inline-block;width:12px;height:12px;vertical-align:middle;margin-right:6px"></span>Thinking...</span>';
405-
document.getElementById('statTokens').textContent = 'Processing prompt...';
406-
document.getElementById('statSpeed').textContent = '';
407-
} else {
408-
addMessage('system', msg.msg);
409-
}
410-
}
411-
else if (msg.type === 'loaded') {
412-
modelLoaded = true;
413-
const dropzone = document.getElementById('dropzone');
414-
dropzone.classList.add('loaded');
415-
dropzone.innerHTML = `<h2>✓ ${msg.name} (${(msg.size/1048576).toFixed(0)} MB)</h2>
416-
<p style="color:#6ee7b7">KV compression active — 3x longer context</p>`;
417-
document.getElementById('kvBadge').style.display = '';
418-
document.getElementById('prompt').disabled = false;
419-
document.getElementById('sendBtn').disabled = false;
420-
document.getElementById('prompt').focus();
421-
hideLoading();
422-
}
423-
else if (msg.type === 'token' && pendingAssistantDiv) {
424-
pendingOutput += msg.text;
425-
pendingTokenCount++;
426-
pendingAssistantDiv.textContent = pendingOutput;
427-
const cursor = document.createElement('span');
428-
cursor.className = 'cursor';
429-
cursor.textContent = '▌';
430-
pendingAssistantDiv.appendChild(cursor);
431-
const chat = document.getElementById('chat');
432-
chat.scrollTop = chat.scrollHeight;
433-
const elapsed = (performance.now() - pendingStartTime) / 1000;
434-
if (elapsed > 0.1) {
435-
document.getElementById('statTokens').textContent = `${pendingTokenCount} tokens`;
436-
document.getElementById('statSpeed').textContent = `${(pendingTokenCount / elapsed).toFixed(1)} tok/s`;
437-
}
438-
}
439-
else if (msg.type === 'done') {
440-
if (pendingAssistantDiv) {
441-
if (pendingOutput) {
442-
pendingAssistantDiv.innerHTML = formatText(pendingOutput);
443-
} else {
444-
pendingAssistantDiv.innerHTML = '<em style="color:#666">No output generated. Try a longer prompt.</em>';
445-
}
446-
const elapsed = (performance.now() - pendingStartTime) / 1000;
447-
const tps = pendingTokenCount > 0 ? (pendingTokenCount / elapsed).toFixed(1) : '0';
448-
document.getElementById('statTokens').textContent = `${pendingTokenCount} tokens`;
449-
document.getElementById('statSpeed').textContent = `${tps} tok/s`;
450-
}
451-
generating = false;
452-
document.getElementById('sendBtn').disabled = false;
453-
document.getElementById('prompt').disabled = false;
454-
document.getElementById('prompt').focus();
455-
pendingAssistantDiv = null;
456-
}
457-
};
458-
}
459-
460-
function generate() {
461-
if (!modelLoaded || generating || !worker) return;
403+
async function generate() {
404+
if (!modelLoaded || generating) return;
462405
const input = document.getElementById('prompt');
463406
const text = input.value.trim();
464407
if (!text) return;
@@ -469,19 +412,70 @@ <h2>LLM in Your Browser</h2>
469412
input.disabled = true;
470413

471414
addMessage('user', text);
472-
pendingAssistantDiv = addMessage('assistant', '');
473-
pendingAssistantDiv.innerHTML = '<span class="thinking"><span class="spinner" style="display:inline-block;width:12px;height:12px;vertical-align:middle;margin-right:6px"></span>Thinking...</span>';
474-
pendingOutput = '';
475-
pendingTokenCount = 0;
476-
pendingStartTime = performance.now();
415+
const assistantDiv = addMessage('assistant', '');
416+
assistantDiv.innerHTML = '<span class="thinking"><span class="spinner" style="display:inline-block;width:12px;height:12px;vertical-align:middle;margin-right:6px"></span>Thinking...</span>';
417+
let output = '';
418+
let tokenCount = 0;
419+
const startTime = performance.now();
420+
document.getElementById('statTokens').textContent = 'Processing prompt...';
421+
document.getElementById('statSpeed').textContent = '';
422+
423+
Module.onToken = (token) => {
424+
output += token;
425+
tokenCount++;
426+
assistantDiv.textContent = output;
427+
const cursor = document.createElement('span');
428+
cursor.className = 'cursor';
429+
cursor.textContent = '▌';
430+
assistantDiv.appendChild(cursor);
431+
document.getElementById('chat').scrollTop = document.getElementById('chat').scrollHeight;
432+
const elapsed = (performance.now() - startTime) / 1000;
433+
if (elapsed > 0.1) {
434+
document.getElementById('statTokens').textContent = `${tokenCount} tokens`;
435+
document.getElementById('statSpeed').textContent = `${(tokenCount / elapsed).toFixed(1)} tok/s`;
436+
}
437+
};
438+
439+
Module.onDone = (nTokens, elapsedMs) => {
440+
assistantDiv.innerHTML = formatText(output);
441+
const tps = nTokens > 0 ? (nTokens / (elapsedMs / 1000)).toFixed(1) : '0';
442+
document.getElementById('statTokens').textContent = `${nTokens} tokens`;
443+
document.getElementById('statSpeed').textContent = `${tps} tok/s`;
444+
generating = false;
445+
document.getElementById('sendBtn').disabled = false;
446+
input.disabled = false;
447+
input.focus();
448+
};
477449

478450
const chatPrompt = getChatPrompt(text);
479-
worker.postMessage({ type: 'generate', prompt: chatPrompt, temperature: 0.7, maxTokens: 256 });
451+
const promptPtr = Module.allocateUTF8(chatPrompt);
452+
try {
453+
await Module._wasm_generate_async(promptPtr, 0.7, 256);
454+
} catch(e) {
455+
Module._wasm_generate(promptPtr, 0.7, 256);
456+
}
457+
Module._free(promptPtr);
458+
459+
if (!output) {
460+
assistantDiv.innerHTML = '<em style="color:#666">No output generated. Try a longer prompt.</em>';
461+
}
462+
generating = false;
463+
document.getElementById('sendBtn').disabled = false;
464+
input.disabled = false;
480465
}
466+
</script>
481467

482-
// Initialize worker on page load
483-
initWorker();
468+
<script>
469+
var Module = {
470+
onToken: null, onDone: null, onStatus: null,
471+
print: function(text) { console.log(text); },
472+
printErr: function(text) { console.warn(text); },
473+
onRuntimeInitialized: function() {
474+
addMessage('system', 'Runtime ready. Choose a model or drop your own GGUF file.');
475+
}
476+
};
484477
</script>
478+
<script src="quant.js"></script>
485479

486480
</body>
487481
</html>

wasm/inference-worker.js

Lines changed: 0 additions & 67 deletions
This file was deleted.

wasm/quant.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

wasm/quant.wasm

72.8 KB
Binary file not shown.

0 commit comments

Comments (0)