Skip to content

Commit 20615ed

Browse files
unamedkr authored and claude committed
fix: multi-block dequant for head_dim > TQ_BK
The quantize path was fixed in the previous commit but dequant was missed — only the first 128 elements were being dequantized, leaving the rest as zeros. This caused PPL 5188 (vs FP32 893) on Qwen3.5. After fix: Qwen3.5 turbo_kv_4b produces coherent text at 12.4 tok/s. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 7d10b63 commit 20615ed

7 files changed

Lines changed: 162 additions & 246 deletions

File tree

src/engine/tq_transformer.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1586,7 +1586,13 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
15861586
+ (size_t)l * s->quant_kv_stride
15871587
+ (size_t)t * cache_n_kv_heads * s->quant_head_stride
15881588
+ (size_t)kv_h * s->quant_head_stride;
1589-
traits->dequantize(quant_src, dequant_buf, head_dim);
1589+
/* Multi-block dequant for head_dim > TQ_BK */
1590+
for (int blk = 0; blk < head_dim; blk += TQ_BK) {
1591+
int blen = head_dim - blk;
1592+
if (blen > TQ_BK) blen = TQ_BK;
1593+
traits->dequantize(quant_src + (blk / TQ_BK) * traits->type_size,
1594+
dequant_buf + blk, blen);
1595+
}
15901596
for (int d = 0; d < head_dim; d++) {
15911597
recon_key[d] += dequant_buf[d];
15921598
}
@@ -1703,7 +1709,13 @@ static void self_attn_forward(tq_model_t* model, tq_state_t* s, int l, int pos)
17031709
+ (size_t)l * s->quant_kv_stride
17041710
+ (size_t)t * cache_n_kv_heads * s->quant_head_stride
17051711
+ (size_t)kv_h * s->quant_head_stride;
1706-
traits->dequantize(quant_src, dequant_buf, head_dim);
1712+
/* Multi-block dequant for head_dim > TQ_BK */
1713+
for (int blk = 0; blk < head_dim; blk += TQ_BK) {
1714+
int blen = head_dim - blk;
1715+
if (blen > TQ_BK) blen = TQ_BK;
1716+
traits->dequantize(quant_src + (blk / TQ_BK) * traits->type_size,
1717+
dequant_buf + blk, blen);
1718+
}
17071719
if (needs_post_norm) {
17081720
tq_rmsnorm(dequant_buf, dequant_buf, layer->k_norm,
17091721
head_dim, c->rms_norm_eps);

wasm/build.sh

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,12 @@
11
#!/bin/bash
2-
# Build quant.cpp WASM demo (multi-threaded + SIMD, no ASYNCIFY)
3-
# Requires: Emscripten SDK (emcc)
4-
#
5-
# Architecture: inference runs in a Web Worker (inference-worker.js)
6-
# so the main thread stays responsive. No ASYNCIFY needed — the worker
7-
# blocks on quant_generate() while postMessage streams tokens.
8-
2+
# Build quant.cpp WASM demo
3+
# pthreads for parallel matmul + SIMD + ASYNCIFY for UI streaming
94
set -e
105

116
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
127
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
138

14-
echo "=== Building quant.cpp WASM (pthreads + SIMD, no ASYNCIFY) ==="
9+
echo "=== Building quant.cpp WASM (pthreads + SIMD) ==="
1510

1611
if ! command -v emcc &>/dev/null; then
1712
echo "Error: emcc not found. Install Emscripten SDK."
@@ -32,14 +27,17 @@ emcc "$SCRIPT_DIR/quant_wasm.c" \
3227
-s INITIAL_MEMORY=1GB \
3328
-s MAXIMUM_MEMORY=4GB \
3429
-s ALLOW_MEMORY_GROWTH=0 \
35-
-s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \
30+
-s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_generate_async","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \
3631
-s EXPORTED_RUNTIME_METHODS='["UTF8ToString","allocateUTF8","FS"]' \
3732
-s FORCE_FILESYSTEM=1 \
3833
-s MODULARIZE=0 \
3934
-s ENVIRONMENT='web,worker' \
4035
-s NO_EXIT_RUNTIME=1 \
4136
-s ASSERTIONS=0 \
4237
-s STACK_SIZE=1MB \
38+
-s ASYNCIFY \
39+
-s 'ASYNCIFY_IMPORTS=["emscripten_sleep"]' \
40+
-s ASYNCIFY_STACK_SIZE=65536 \
4341
-s PTHREAD_POOL_SIZE=4 \
4442
-s PTHREAD_POOL_SIZE_STRICT=0 \
4543
-lm \
@@ -53,6 +51,3 @@ echo "=== Build complete ==="
5351
for f in quant.js quant.wasm; do
5452
[ -f "$SCRIPT_DIR/$f" ] && echo " $f ($(du -h "$SCRIPT_DIR/$f" | cut -f1))"
5553
done
56-
echo ""
57-
echo " inference-worker.js — Web Worker wrapper (no ASYNCIFY overhead)"
58-
echo " coi-serviceworker.js — COOP/COEP header injection for pthreads"

wasm/index.html

Lines changed: 85 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -356,20 +356,37 @@ <h2>LLM in Your Browser</h2>
356356
}
357357

358358
function loadModelFromBytes(bytes, name) {
359-
showLoading('Loading model into WASM...');
360-
// Transfer ArrayBuffer to worker (zero-copy)
361-
const buffer = bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength);
362-
worker.postMessage({ type: 'load', bytes: buffer, name: name }, [buffer]);
359+
try {
360+
Module.FS.writeFile('/model.gguf', bytes);
361+
showLoading('Initializing model...');
362+
const rc = Module._wasm_load_model(Module.allocateUTF8('/model.gguf'));
363+
if (rc === 0) {
364+
modelLoaded = true;
365+
const dropzone = document.getElementById('dropzone');
366+
dropzone.classList.add('loaded');
367+
dropzone.innerHTML = `<h2>✓ ${name} (${(bytes.length/1048576).toFixed(0)} MB)</h2>
368+
<p style="color:#6ee7b7">KV compression active — 3x longer context</p>`;
369+
document.getElementById('kvBadge').style.display = '';
370+
document.getElementById('prompt').disabled = false;
371+
document.getElementById('sendBtn').disabled = false;
372+
document.getElementById('prompt').focus();
373+
addMessage('system', `Model loaded! ${name} (${(bytes.length/1048576).toFixed(0)} MB). Ask anything.`);
374+
} else {
375+
addMessage('system', 'Failed to load model.');
376+
}
377+
} catch(e) {
378+
addMessage('system', `Error: ${e.message}`);
379+
}
380+
hideLoading();
363381
}
364382

365383
async function loadModel(file) {
366384
showLoading(`Loading ${file.name} (${(file.size/1024/1024).toFixed(0)} MB)...`);
367385
addMessage('system', `Loading ${file.name}...`);
368-
activeModelId = null; // custom model — use generic template
386+
activeModelId = null;
369387
try {
370388
const buffer = await file.arrayBuffer();
371-
const bytes = new Uint8Array(buffer);
372-
loadModelFromBytes(bytes, file.name);
389+
loadModelFromBytes(new Uint8Array(buffer), file.name);
373390
} catch(e) {
374391
addMessage('system', `Error: ${e.message}`);
375392
}
@@ -380,85 +397,11 @@ <h2>LLM in Your Browser</h2>
380397
if (activeModelId && MODELS[activeModelId]) {
381398
return MODELS[activeModelId].chatTemplate(text);
382399
}
383-
// Generic ChatML fallback for custom GGUF
384400
return `<|im_start|>user\n${text}<|im_end|>\n<|im_start|>assistant\n`;
385401
}
386402

387-
// ---- Web Worker inference engine (no ASYNCIFY overhead) ----
388-
let worker = null;
389-
let pendingAssistantDiv = null;
390-
let pendingOutput = '';
391-
let pendingTokenCount = 0;
392-
let pendingStartTime = 0;
393-
394-
function initWorker() {
395-
worker = new Worker('inference-worker.js');
396-
worker.onmessage = function(e) {
397-
const msg = e.data;
398-
399-
if (msg.type === 'ready') {
400-
addMessage('system', 'Runtime ready. Choose a model or drop your own GGUF file.');
401-
}
402-
else if (msg.type === 'status') {
403-
if (msg.msg === 'thinking' && pendingAssistantDiv) {
404-
pendingAssistantDiv.innerHTML = '<span class="thinking"><span class="spinner" style="display:inline-block;width:12px;height:12px;vertical-align:middle;margin-right:6px"></span>Thinking...</span>';
405-
document.getElementById('statTokens').textContent = 'Processing prompt...';
406-
document.getElementById('statSpeed').textContent = '';
407-
} else {
408-
addMessage('system', msg.msg);
409-
}
410-
}
411-
else if (msg.type === 'loaded') {
412-
modelLoaded = true;
413-
const dropzone = document.getElementById('dropzone');
414-
dropzone.classList.add('loaded');
415-
dropzone.innerHTML = `<h2>✓ ${msg.name} (${(msg.size/1048576).toFixed(0)} MB)</h2>
416-
<p style="color:#6ee7b7">KV compression active — 3x longer context</p>`;
417-
document.getElementById('kvBadge').style.display = '';
418-
document.getElementById('prompt').disabled = false;
419-
document.getElementById('sendBtn').disabled = false;
420-
document.getElementById('prompt').focus();
421-
hideLoading();
422-
}
423-
else if (msg.type === 'token' && pendingAssistantDiv) {
424-
pendingOutput += msg.text;
425-
pendingTokenCount++;
426-
pendingAssistantDiv.textContent = pendingOutput;
427-
const cursor = document.createElement('span');
428-
cursor.className = 'cursor';
429-
cursor.textContent = '▌';
430-
pendingAssistantDiv.appendChild(cursor);
431-
const chat = document.getElementById('chat');
432-
chat.scrollTop = chat.scrollHeight;
433-
const elapsed = (performance.now() - pendingStartTime) / 1000;
434-
if (elapsed > 0.1) {
435-
document.getElementById('statTokens').textContent = `${pendingTokenCount} tokens`;
436-
document.getElementById('statSpeed').textContent = `${(pendingTokenCount / elapsed).toFixed(1)} tok/s`;
437-
}
438-
}
439-
else if (msg.type === 'done') {
440-
if (pendingAssistantDiv) {
441-
if (pendingOutput) {
442-
pendingAssistantDiv.innerHTML = formatText(pendingOutput);
443-
} else {
444-
pendingAssistantDiv.innerHTML = '<em style="color:#666">No output generated. Try a longer prompt.</em>';
445-
}
446-
const elapsed = (performance.now() - pendingStartTime) / 1000;
447-
const tps = pendingTokenCount > 0 ? (pendingTokenCount / elapsed).toFixed(1) : '0';
448-
document.getElementById('statTokens').textContent = `${pendingTokenCount} tokens`;
449-
document.getElementById('statSpeed').textContent = `${tps} tok/s`;
450-
}
451-
generating = false;
452-
document.getElementById('sendBtn').disabled = false;
453-
document.getElementById('prompt').disabled = false;
454-
document.getElementById('prompt').focus();
455-
pendingAssistantDiv = null;
456-
}
457-
};
458-
}
459-
460-
function generate() {
461-
if (!modelLoaded || generating || !worker) return;
403+
async function generate() {
404+
if (!modelLoaded || generating) return;
462405
const input = document.getElementById('prompt');
463406
const text = input.value.trim();
464407
if (!text) return;
@@ -469,19 +412,70 @@ <h2>LLM in Your Browser</h2>
469412
input.disabled = true;
470413

471414
addMessage('user', text);
472-
pendingAssistantDiv = addMessage('assistant', '');
473-
pendingAssistantDiv.innerHTML = '<span class="thinking"><span class="spinner" style="display:inline-block;width:12px;height:12px;vertical-align:middle;margin-right:6px"></span>Thinking...</span>';
474-
pendingOutput = '';
475-
pendingTokenCount = 0;
476-
pendingStartTime = performance.now();
415+
const assistantDiv = addMessage('assistant', '');
416+
assistantDiv.innerHTML = '<span class="thinking"><span class="spinner" style="display:inline-block;width:12px;height:12px;vertical-align:middle;margin-right:6px"></span>Thinking...</span>';
417+
let output = '';
418+
let tokenCount = 0;
419+
const startTime = performance.now();
420+
document.getElementById('statTokens').textContent = 'Processing prompt...';
421+
document.getElementById('statSpeed').textContent = '';
422+
423+
Module.onToken = (token) => {
424+
output += token;
425+
tokenCount++;
426+
assistantDiv.textContent = output;
427+
const cursor = document.createElement('span');
428+
cursor.className = 'cursor';
429+
cursor.textContent = '▌';
430+
assistantDiv.appendChild(cursor);
431+
document.getElementById('chat').scrollTop = document.getElementById('chat').scrollHeight;
432+
const elapsed = (performance.now() - startTime) / 1000;
433+
if (elapsed > 0.1) {
434+
document.getElementById('statTokens').textContent = `${tokenCount} tokens`;
435+
document.getElementById('statSpeed').textContent = `${(tokenCount / elapsed).toFixed(1)} tok/s`;
436+
}
437+
};
438+
439+
Module.onDone = (nTokens, elapsedMs) => {
440+
assistantDiv.innerHTML = formatText(output);
441+
const tps = nTokens > 0 ? (nTokens / (elapsedMs / 1000)).toFixed(1) : '0';
442+
document.getElementById('statTokens').textContent = `${nTokens} tokens`;
443+
document.getElementById('statSpeed').textContent = `${tps} tok/s`;
444+
generating = false;
445+
document.getElementById('sendBtn').disabled = false;
446+
input.disabled = false;
447+
input.focus();
448+
};
477449

478450
const chatPrompt = getChatPrompt(text);
479-
worker.postMessage({ type: 'generate', prompt: chatPrompt, temperature: 0.7, maxTokens: 256 });
451+
const promptPtr = Module.allocateUTF8(chatPrompt);
452+
try {
453+
await Module._wasm_generate_async(promptPtr, 0.7, 256);
454+
} catch(e) {
455+
Module._wasm_generate(promptPtr, 0.7, 256);
456+
}
457+
Module._free(promptPtr);
458+
459+
if (!output) {
460+
assistantDiv.innerHTML = '<em style="color:#666">No output generated. Try a longer prompt.</em>';
461+
}
462+
generating = false;
463+
document.getElementById('sendBtn').disabled = false;
464+
input.disabled = false;
480465
}
466+
</script>
481467

482-
// Initialize worker on page load
483-
initWorker();
468+
<script>
469+
var Module = {
470+
onToken: null, onDone: null, onStatus: null,
471+
print: function(text) { console.log(text); },
472+
printErr: function(text) { console.warn(text); },
473+
onRuntimeInitialized: function() {
474+
addMessage('system', 'Runtime ready. Choose a model or drop your own GGUF file.');
475+
}
476+
};
484477
</script>
478+
<script src="quant.js"></script>
485479

486480
</body>
487481
</html>

wasm/inference-worker.js

Lines changed: 0 additions & 67 deletions
This file was deleted.

wasm/quant.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

wasm/quant.wasm

72.8 KB
Binary file not shown.

0 commit comments

Comments (0)