Skip to content

Commit 4cc5598

Browse files
unamedkr and claude authored
feat(wasm): SmolLM2-135M fast default + Llama 1B quality option (#37)
* feat(wasm): Llama 3.2 1B Instruct default + skip Q4 reconversion Two changes for WASM demo reliability and speed: 1. Model: switch from Qwen3.5-0.8B (base, gated, Qwen arch issues) to Llama 3.2 1B Instruct (verified working, good quality, public HuggingFace URL, proper Instruct tuning for chat). 2. Speed: add -DTQ_NO_Q4=1 to WASM build. Skips the load-time Q4 reconversion (GGUF Q4_K_M → FP32 → internal Q4) which was expensive and redundant for already-quantized models. Uses GGUF on-the-fly dequant instead. Saves several seconds of model init and reduces peak memory usage. Added compile-time #ifdef TQ_NO_Q4 guard in quant.h so it works in WASM (no getenv). Native builds are unaffected. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat(wasm): SmolLM2-135M default (fast) + Llama 1B option (quality) 1B model causes 15-30s+ prefill hang in WASM — unusable as default. SmolLM2-135M: 135MB download, <2s prefill, ~10-20 tok/s in WASM. Quality is basic but responsive — proper demo experience. Llama 3.2 1B Instruct kept as "Quality" option for users willing to wait for the larger model. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 6f25f34 commit 4cc5598

2 files changed

Lines changed: 16 additions & 57 deletions

File tree

quant.h

Lines changed: 0 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -202,8 +202,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
202202
// Section 1: Types and Specs (from tq_types.h, tq_spec.h)
203203
// ============================================================================
204204

205-
206-
207205
/* Cross-language static assert: works in both C11 and C++11/17 */
208206
#ifdef __cplusplus
209207
#define TQ_STATIC_ASSERT(cond, msg) static_assert(cond, msg)
@@ -219,8 +217,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
219217
#define TQ_PI_2 1.5707963267948966f
220218
#endif
221219

222-
223-
224220
/* ============================================================
225221
* Constants
226222
* ============================================================ */
@@ -398,8 +394,6 @@ typedef struct {
398394
int enable_recompression;/* Tier 1 → Tier 2 re-compression */
399395
} tq_progressive_config_t;
400396

401-
402-
403397
/* TurboQuant KV cache block: RHT + Lloyd-Max codebook + QJL residual
404398
* 3-bit variant: 2-bit codebook (4 levels) + 1-bit QJL sign hash
405399
* Block covers TQ_BK elements (128).
@@ -469,12 +463,6 @@ TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK * 3 / 8 + TQ_BK / 8);
469463
TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
470464
TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);
471465

472-
473-
474-
475-
476-
477-
478466
/* Format specification — version-aware, ONNX-inspired */
479467

480468
#define TQ_SPEC_VERSION 1
@@ -500,18 +488,10 @@ typedef struct {
500488
uint8_t flags; /* TQ_FLAG_* bitmask */
501489
} tq_format_spec_t;
502490

503-
504-
505-
506-
507491
// ============================================================================
508492
// Section 2: Engine Types (from tq_engine.h)
509493
// ============================================================================
510494

511-
512-
513-
514-
515495
/* ============================================================
516496
* Model configuration
517497
* ============================================================ */
@@ -1123,9 +1103,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
11231103
/* Max threads supported by thread pool */
11241104
#define TQ_TP_MAX 16
11251105

1126-
1127-
1128-
11291106
// ============================================================================
11301107
// Section 3: GGUF Types (from tq_gguf.h)
11311108
// ============================================================================
@@ -1143,10 +1120,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
11431120
* directly into TurboQuant inference engine.
11441121
*/
11451122

1146-
1147-
1148-
1149-
11501123
/* ============================================================
11511124
* GGUF format constants
11521125
* ============================================================ */
@@ -1462,24 +1435,17 @@ int tq_metal_moe_forward(
14621435
const int* up_types, /* per-expert up quant types, NULL = use weight_type */
14631436
const int* down_types); /* per-expert down quant types, NULL = use weight_type */
14641437

1465-
1466-
1467-
14681438
// ============================================================================
14691439
// Section 4: Internal API (from turboquant.h)
14701440
// ============================================================================
14711441

1472-
14731442
/**
14741443
* TurboQuant.cpp — Cross-platform KV cache compression library
14751444
*
14761445
* Public C API — single header include for all functionality.
14771446
* Zero external dependencies (libc/libm only).
14781447
*/
14791448

1480-
1481-
1482-
14831449
/* ============================================================
14841450
* Version
14851451
* ============================================================ */
@@ -1753,15 +1719,10 @@ void tq_progressive_free(tq_progressive_t* p);
17531719

17541720
tq_progressive_config_t tq_progressive_default_config(void);
17551721

1756-
1757-
1758-
1759-
17601722
// ============================================================================
17611723
// Section 5: quant_ctx struct definition
17621724
// ============================================================================
17631725

1764-
17651726
struct quant_ctx {
17661727
tq_model_t* model;
17671728
tq_state_t* state;
@@ -1788,7 +1749,6 @@ struct quant_ctx {
17881749
* - Random signs decorrelate channels across different blocks
17891750
*/
17901751

1791-
17921752
#ifdef __ARM_NEON
17931753
#include <arm_neon.h>
17941754
#endif
@@ -1902,7 +1862,6 @@ void tq_rht_inverse(float* data, int n, uint32_t seed) {
19021862
*/
19031863
/* Generic reference — no compiler-specific pragmas */
19041864

1905-
19061865
/* ---------- FP16 helpers ---------- */
19071866

19081867
static uint16_t uni_fp32_to_fp16(float v) {
@@ -2285,7 +2244,6 @@ void tq_uniform_3b_attention_ref(const float* query, const void* kv,
22852244
// Section 8: Type Traits (from tq_traits.c)
22862245
// ============================================================================
22872246

2288-
22892247
/* Stub implementations for excluded quantization types (polar, qjl, turbo, mixed) */
22902248
static void tq_stub_quantize(const float* src, void* dst, int n) {
22912249
(void)src; (void)dst; (void)n;
@@ -2583,7 +2541,6 @@ tq_type tq_type_from_name(const char* name) {
25832541
* No external dependencies — libc/libm only.
25842542
*/
25852543

2586-
25872544
#ifdef __ARM_NEON
25882545
#include <arm_neon.h>
25892546
#endif
@@ -2617,7 +2574,6 @@ static struct {
26172574

26182575
static int g_n_threads = 1;
26192576

2620-
26212577
static void* tp_worker(void* arg) {
26222578
int id = (int)(intptr_t)arg;
26232579
int my_gen = 0;
@@ -4388,8 +4344,6 @@ void tq_matmul_1bit(float* out, const float* x,
43884344
* SPDX-License-Identifier: MIT
43894345
*/
43904346

4391-
4392-
43934347
#ifdef _WIN32
43944348
#else
43954349
#endif
@@ -5098,8 +5052,6 @@ const tq_gguf_tensor_t* tq_gguf_find_tensor(const tq_gguf_ctx_t* ctx, const char
50985052
* Pure C11, no external dependencies.
50995053
*/
51005054

5101-
5102-
51035055
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
51045056
#include <arm_neon.h>
51055057
#define TQ_HAS_NEON 1
@@ -7174,7 +7126,6 @@ void tq_metal_batch_end_if_available(void) {
71747126
* Also supports the legacy llama2.c binary tokenizer format as fallback.
71757127
*/
71767128

7177-
71787129
/* Global for qsort comparator (vocab index sorting) */
71797130
static char** g_vocab_for_sort;
71807131
static int cmp_vocab_idx(const void* a, const void* b) {
@@ -8519,7 +8470,6 @@ const char* tq_decode(const tq_tokenizer_t* tok, int prev_token, int token) {
85198470
* Supports hybrid architectures (e.g., Qwen3.5 DeltaNet + self_attn).
85208471
*/
85218472

8522-
85238473
#ifdef _WIN32
85248474
#else
85258475
#endif
@@ -12934,7 +12884,6 @@ void tq_quantize_weights_1bit(tq_model_t* model) {
1293412884
* -> residual add
1293512885
*/
1293612886

12937-
1293812887
/* Unified Q2/1-bit matmul dispatch.
1293912888
* When model->use_1bit_weights, Q2 fields contain sign bits + norms,
1294012889
* dispatched to tq_matmul_1bit (FP32 input required).
@@ -15194,7 +15143,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
1519415143
}
1519515144
}
1519615145

15197-
1519815146
/* Increment profile token count if profiling is active */
1519915147
if (s->profile_kv) {
1520015148
s->profile_kv_count++;
@@ -15245,7 +15193,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
1524515193
* - Full generation loop with streaming callback
1524615194
*/
1524715195

15248-
1524915196
/* ============================================================
1525015197
* Argmax sampling: return token with highest logit
1525115198
* ============================================================ */
@@ -15673,7 +15620,6 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
1567315620
return generated;
1567415621
}
1567515622

15676-
1567715623
// ============================================================================
1567815624

1567915625
// ============================================================================

wasm/index.html

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -174,10 +174,15 @@ <h2>Run an <span>LLM</span> in your browser</h2>
174174
<p class="subtitle">No install. No API key. No server.</p>
175175

176176
<div class="model-cards" id="modelCards">
177-
<div class="model-card recommended" id="card-llama" onclick="loadDemoModel('llama-3.2-1b')">
177+
<div class="model-card recommended" id="card-smol" onclick="loadDemoModel('smollm2-135m')">
178+
<div class="name">SmolLM2 135M</div>
179+
<div class="meta" id="meta-smol">~135 MB &middot; Fast response</div>
180+
<span class="tag">Fast</span>
181+
</div>
182+
<div class="model-card" id="card-llama" onclick="loadDemoModel('llama-3.2-1b')">
178183
<div class="name">Llama 3.2 1B Instruct</div>
179-
<div class="meta" id="meta-llama">~770 MB &middot; Verified quality</div>
180-
<span class="tag">Recommended</span>
184+
<div class="meta" id="meta-llama">~770 MB &middot; Better quality</div>
185+
<span class="tag blue">Quality</span>
181186
</div>
182187
</div>
183188

@@ -218,6 +223,14 @@ <h2>Run an <span>LLM</span> in your browser</h2>
218223
let activeModelId = null;
219224

220225
const MODELS = {
226+
'smollm2-135m': {
227+
url: 'https://huggingface.co/Felladrin/gguf-Q8_0-SmolLM2-135M-Instruct/resolve/main/smollm2-135m-instruct-q8_0.gguf',
228+
name: 'SmolLM2 135M',
229+
size: 135,
230+
cacheKey: 'smollm2-135m-q8',
231+
chatTemplate: (t) => t, // SmolLM2 works best with plain text prompts
232+
cardId: 'card-smol', metaId: 'meta-smol',
233+
},
221234
'llama-3.2-1b': {
222235
url: 'https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/resolve/main/llama-3.2-1b-instruct-q4_k_m.gguf',
223236
name: 'Llama 3.2 1B Instruct',

0 commit comments

Comments (0)