Skip to content

Commit 4cc5598

Browse files
unamedkr and claude authored
feat(wasm): SmolLM2-135M fast default + Llama 1B quality option (#37)
* feat(wasm): Llama 3.2 1B Instruct default + skip Q4 reconversion Two changes for WASM demo reliability and speed: 1. Model: switch from Qwen3.5-0.8B (base, gated, Qwen arch issues) to Llama 3.2 1B Instruct (verified working, good quality, public HuggingFace URL, proper Instruct tuning for chat). 2. Speed: add -DTQ_NO_Q4=1 to WASM build. Skips the load-time Q4 reconversion (GGUF Q4_K_M → FP32 → internal Q4) which was expensive and redundant for already-quantized models. Uses GGUF on-the-fly dequant instead. Saves several seconds of model init and reduces peak memory usage. Added compile-time #ifdef TQ_NO_Q4 guard in quant.h so it works in WASM (no getenv). Native builds are unaffected. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat(wasm): SmolLM2-135M default (fast) + Llama 1B option (quality) 1B model causes 15-30s+ prefill hang in WASM — unusable as default. SmolLM2-135M: 135MB download, <2s prefill, ~10-20 tok/s in WASM. Quality is basic but responsive — proper demo experience. Llama 3.2 1B Instruct kept as "Quality" option for users willing to wait for the larger model. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 6f25f34 commit 4cc5598

2 files changed

Lines changed: 16 additions & 57 deletions

File tree

quant.h

Lines changed: 0 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -202,8 +202,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
202202
// Section 1: Types and Specs (from tq_types.h, tq_spec.h)
203203
// ============================================================================
204204

205-
206-
207205
/* Cross-language static assert: works in both C11 and C++11/17 */
208206
#ifdef __cplusplus
209207
#define TQ_STATIC_ASSERT(cond, msg) static_assert(cond, msg)
@@ -219,8 +217,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
219217
#define TQ_PI_2 1.5707963267948966f
220218
#endif
221219

222-
223-
224220
/* ============================================================
225221
* Constants
226222
* ============================================================ */
@@ -398,8 +394,6 @@ typedef struct {
398394
int enable_recompression;/* Tier 1 → Tier 2 re-compression */
399395
} tq_progressive_config_t;
400396

401-
402-
403397
/* TurboQuant KV cache block: RHT + Lloyd-Max codebook + QJL residual
404398
* 3-bit variant: 2-bit codebook (4 levels) + 1-bit QJL sign hash
405399
* Block covers TQ_BK elements (128).
@@ -469,12 +463,6 @@ TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK * 3 / 8 + TQ_BK / 8);
469463
TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
470464
TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);
471465

472-
473-
474-
475-
476-
477-
478466
/* Format specification — version-aware, ONNX-inspired */
479467

480468
#define TQ_SPEC_VERSION 1
@@ -500,18 +488,10 @@ typedef struct {
500488
uint8_t flags; /* TQ_FLAG_* bitmask */
501489
} tq_format_spec_t;
502490

503-
504-
505-
506-
507491
// ============================================================================
508492
// Section 2: Engine Types (from tq_engine.h)
509493
// ============================================================================
510494

511-
512-
513-
514-
515495
/* ============================================================
516496
* Model configuration
517497
* ============================================================ */
@@ -1123,9 +1103,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
11231103
/* Max threads supported by thread pool */
11241104
#define TQ_TP_MAX 16
11251105

1126-
1127-
1128-
11291106
// ============================================================================
11301107
// Section 3: GGUF Types (from tq_gguf.h)
11311108
// ============================================================================
@@ -1143,10 +1120,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
11431120
* directly into TurboQuant inference engine.
11441121
*/
11451122

1146-
1147-
1148-
1149-
11501123
/* ============================================================
11511124
* GGUF format constants
11521125
* ============================================================ */
@@ -1462,24 +1435,17 @@ int tq_metal_moe_forward(
14621435
const int* up_types, /* per-expert up quant types, NULL = use weight_type */
14631436
const int* down_types); /* per-expert down quant types, NULL = use weight_type */
14641437

1465-
1466-
1467-
14681438
// ============================================================================
14691439
// Section 4: Internal API (from turboquant.h)
14701440
// ============================================================================
14711441

1472-
14731442
/**
14741443
* TurboQuant.cpp — Cross-platform KV cache compression library
14751444
*
14761445
* Public C API — single header include for all functionality.
14771446
* Zero external dependencies (libc/libm only).
14781447
*/
14791448

1480-
1481-
1482-
14831449
/* ============================================================
14841450
* Version
14851451
* ============================================================ */
@@ -1753,15 +1719,10 @@ void tq_progressive_free(tq_progressive_t* p);
17531719

17541720
tq_progressive_config_t tq_progressive_default_config(void);
17551721

1756-
1757-
1758-
1759-
17601722
// ============================================================================
17611723
// Section 5: quant_ctx struct definition
17621724
// ============================================================================
17631725

1764-
17651726
struct quant_ctx {
17661727
tq_model_t* model;
17671728
tq_state_t* state;
@@ -1788,7 +1749,6 @@ struct quant_ctx {
17881749
* - Random signs decorrelate channels across different blocks
17891750
*/
17901751

1791-
17921752
#ifdef __ARM_NEON
17931753
#include <arm_neon.h>
17941754
#endif
@@ -1902,7 +1862,6 @@ void tq_rht_inverse(float* data, int n, uint32_t seed) {
19021862
*/
19031863
/* Generic reference — no compiler-specific pragmas */
19041864

1905-
19061865
/* ---------- FP16 helpers ---------- */
19071866

19081867
static uint16_t uni_fp32_to_fp16(float v) {
@@ -2285,7 +2244,6 @@ void tq_uniform_3b_attention_ref(const float* query, const void* kv,
22852244
// Section 8: Type Traits (from tq_traits.c)
22862245
// ============================================================================
22872246

2288-
22892247
/* Stub implementations for excluded quantization types (polar, qjl, turbo, mixed) */
22902248
static void tq_stub_quantize(const float* src, void* dst, int n) {
22912249
(void)src; (void)dst; (void)n;
@@ -2583,7 +2541,6 @@ tq_type tq_type_from_name(const char* name) {
25832541
* No external dependencies — libc/libm only.
25842542
*/
25852543

2586-
25872544
#ifdef __ARM_NEON
25882545
#include <arm_neon.h>
25892546
#endif
@@ -2617,7 +2574,6 @@ static struct {
26172574

26182575
static int g_n_threads = 1;
26192576

2620-
26212577
static void* tp_worker(void* arg) {
26222578
int id = (int)(intptr_t)arg;
26232579
int my_gen = 0;
@@ -4388,8 +4344,6 @@ void tq_matmul_1bit(float* out, const float* x,
43884344
* SPDX-License-Identifier: MIT
43894345
*/
43904346

4391-
4392-
43934347
#ifdef _WIN32
43944348
#else
43954349
#endif
@@ -5098,8 +5052,6 @@ const tq_gguf_tensor_t* tq_gguf_find_tensor(const tq_gguf_ctx_t* ctx, const char
50985052
* Pure C11, no external dependencies.
50995053
*/
51005054

5101-
5102-
51035055
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
51045056
#include <arm_neon.h>
51055057
#define TQ_HAS_NEON 1
@@ -7174,7 +7126,6 @@ void tq_metal_batch_end_if_available(void) {
71747126
* Also supports the legacy llama2.c binary tokenizer format as fallback.
71757127
*/
71767128

7177-
71787129
/* Global for qsort comparator (vocab index sorting) */
71797130
static char** g_vocab_for_sort;
71807131
static int cmp_vocab_idx(const void* a, const void* b) {
@@ -8519,7 +8470,6 @@ const char* tq_decode(const tq_tokenizer_t* tok, int prev_token, int token) {
85198470
* Supports hybrid architectures (e.g., Qwen3.5 DeltaNet + self_attn).
85208471
*/
85218472

8522-
85238473
#ifdef _WIN32
85248474
#else
85258475
#endif
@@ -12934,7 +12884,6 @@ void tq_quantize_weights_1bit(tq_model_t* model) {
1293412884
* -> residual add
1293512885
*/
1293612886

12937-
1293812887
/* Unified Q2/1-bit matmul dispatch.
1293912888
* When model->use_1bit_weights, Q2 fields contain sign bits + norms,
1294012889
* dispatched to tq_matmul_1bit (FP32 input required).
@@ -15194,7 +15143,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
1519415143
}
1519515144
}
1519615145

15197-
1519815146
/* Increment profile token count if profiling is active */
1519915147
if (s->profile_kv) {
1520015148
s->profile_kv_count++;
@@ -15245,7 +15193,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
1524515193
* - Full generation loop with streaming callback
1524615194
*/
1524715195

15248-
1524915196
/* ============================================================
1525015197
* Argmax sampling: return token with highest logit
1525115198
* ============================================================ */
@@ -15673,7 +15620,6 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
1567315620
return generated;
1567415621
}
1567515622

15676-
1567715623
// ============================================================================
1567815624

1567915625
// ============================================================================

wasm/index.html

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -174,10 +174,15 @@ <h2>Run an <span>LLM</span> in your browser</h2>
174174
<p class="subtitle">No install. No API key. No server.</p>
175175

176176
<div class="model-cards" id="modelCards">
177-
<div class="model-card recommended" id="card-llama" onclick="loadDemoModel('llama-3.2-1b')">
177+
<div class="model-card recommended" id="card-smol" onclick="loadDemoModel('smollm2-135m')">
178+
<div class="name">SmolLM2 135M</div>
179+
<div class="meta" id="meta-smol">~135 MB &middot; Fast response</div>
180+
<span class="tag">Fast</span>
181+
</div>
182+
<div class="model-card" id="card-llama" onclick="loadDemoModel('llama-3.2-1b')">
178183
<div class="name">Llama 3.2 1B Instruct</div>
179-
<div class="meta" id="meta-llama">~770 MB &middot; Verified quality</div>
180-
<span class="tag">Recommended</span>
184+
<div class="meta" id="meta-llama">~770 MB &middot; Better quality</div>
185+
<span class="tag blue">Quality</span>
181186
</div>
182187
</div>
183188

@@ -218,6 +223,14 @@ <h2>Run an <span>LLM</span> in your browser</h2>
218223
let activeModelId = null;
219224

220225
const MODELS = {
226+
'smollm2-135m': {
227+
url: 'https://huggingface.co/Felladrin/gguf-Q8_0-SmolLM2-135M-Instruct/resolve/main/smollm2-135m-instruct-q8_0.gguf',
228+
name: 'SmolLM2 135M',
229+
size: 135,
230+
cacheKey: 'smollm2-135m-q8',
231+
chatTemplate: (t) => t, // SmolLM2 works best with plain text prompts
232+
cardId: 'card-smol', metaId: 'meta-smol',
233+
},
221234
'llama-3.2-1b': {
222235
url: 'https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/resolve/main/llama-3.2-1b-instruct-q4_k_m.gguf',
223236
name: 'Llama 3.2 1B Instruct',

0 commit comments

Comments (0)