
Commit d91a8b2

unamedkr and claude committed
fix(gemma4): PLE buffer overrun broke Gemma 4 E4B
The per-layer embedding (PLE) stack buffers were sized 8960 floats, matching E2B exactly (35 layers × 256 ple_dim). E4B has 42 layers, requiring 10752 elements: a 1792-float stack overrun that caused the forward pass to hang indefinitely (stack corruption triggering an infinite loop somewhere downstream).

Fixed in both src/engine/tq_transformer.c and quant.h:
- temp_embd and temp_proj buffers bumped to 16384 floats
- Added a safety guard for total_ple > 16384

Before: E4B Q8_0 with a 5-token prompt generates 0 tokens in 15 minutes (stuck in the stack-corruption-induced hang).
After: E4B Q8_0 generates "4. This is a mathematical fact."; E4B Q4_0 generates "4。请问,您能用更" (roughly: "4. May I ask, could you use a more…").

All 35 unit tests pass, all 7 model regression tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 5bf3e8e commit d91a8b2

2 files changed: 12 additions & 10 deletions
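
For context on the sizing arithmetic in the commit message above, here is a minimal standalone sketch (not part of the commit) that reproduces the E2B/E4B buffer math; the layer counts and ple_dim value are taken from the message, nothing comes from the repository's own headers.

/* Illustration only: reproduces the buffer-size arithmetic quoted in the
 * commit message. */
#include <stdio.h>

int main(void) {
    const int ple_dim    = 256;   /* per-layer embedding width */
    const int e2b_layers = 35;
    const int e4b_layers = 42;

    int e2b_total = e2b_layers * ple_dim;   /* 8960 floats: the old buffer size */
    int e4b_total = e4b_layers * ple_dim;   /* 10752 floats: what E4B needs */
    int overrun   = e4b_total - e2b_total;  /* 1792 floats written past the end */

    printf("E2B total_ple = %d floats\n", e2b_total);
    printf("E4B total_ple = %d floats\n", e4b_total);
    printf("overrun       = %d floats (%zu bytes)\n",
           overrun, (size_t)overrun * sizeof(float));
    return 0;
}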

quant.h

Lines changed: 5 additions & 4 deletions
@@ -15481,10 +15481,11 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
       s->ple_buf = (float*)calloc((size_t)total_ple, sizeof(float));
     }
 
-    /* Step A: Dequant per_layer_token_embd[token] → temp_embd[8960]
+    /* Step A: Dequant per_layer_token_embd[token] → temp_embd
      * The embedding tensor is [total_ple, vocab_size] in GGUF row-major,
      * so one token's data is at row offset = token * row_bytes. */
-    float temp_embd[8960]; /* stack buffer, total_ple <= 8960 */
+    float temp_embd[16384]; /* stack buffer, E2B=8960, E4B=10752 */
+    if (total_ple > 16384) return s->logits; /* safety guard */
     {
       size_t type_size = tq_ggml_type_size(model->ple_embedding_type);
       int blck = tq_ggml_type_blck(model->ple_embedding_type);
@@ -15500,11 +15501,11 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
       temp_embd[i] *= ple_scale;
     }
 
-    /* Step B: per_layer_model_proj @ embed_raw → temp_proj[8960]
+    /* Step B: per_layer_model_proj @ embed_raw → temp_proj
      * ple_proj is [total_ple, hidden_dim] FP32 (rows=8960, cols=1536).
      * We need: for each output row d in [0, total_ple): dot(ple_proj[d,:], s->x[:])
      * Note: s->x already has the scaled embedding from above. */
-    float temp_proj[8960];
+    float temp_proj[16384];
     tq_matmul(temp_proj, s->x, model->ple_proj, total_ple, dim);
 
     /* Scale by 1/sqrt(hidden_dim) */
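
As a reading aid for the Step B comment above, here is a naive reference loop for what tq_matmul computes at this call site, folded together with the 1/sqrt(hidden_dim) scaling that the diff applies as a separate step afterwards. This is a sketch based only on the comments in the diff: the name ple_project_ref and the standalone signature are illustrative, and dim is assumed to equal hidden_dim here.

#include <math.h>
#include <stddef.h>

/* Naive equivalent of: for each output row d in [0, total_ple):
 *   temp_proj[d] = dot(ple_proj[d,:], x[:]) * (1 / sqrt(hidden_dim))
 * ple_proj is taken to be row-major [total_ple, dim], as the diff comment states. */
static void ple_project_ref(float *temp_proj, const float *x,
                            const float *ple_proj, int total_ple, int dim) {
    float inv_sqrt_dim = 1.0f / sqrtf((float)dim);
    for (int d = 0; d < total_ple; d++) {
        float acc = 0.0f;
        for (int k = 0; k < dim; k++)
            acc += ple_proj[(size_t)d * dim + k] * x[k];
        temp_proj[d] = acc * inv_sqrt_dim;  /* dot product plus the scaling step */
    }
}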

src/engine/tq_transformer.c

Lines changed: 7 additions & 6 deletions
@@ -2342,17 +2342,18 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
   if (model->ple_dim > 0 && model->ple_embedding && model->ple_proj && !getenv("TQ_NO_PLE")) {
     int ple_dim = model->ple_dim;
     int n_layers = c->n_layers;
-    int total_ple = n_layers * ple_dim; /* e.g., 35 * 256 = 8960 */
+    int total_ple = n_layers * ple_dim; /* E2B: 35*256=8960, E4B: 42*256=10752 */
 
     /* Lazy allocation of ple_buf */
     if (!s->ple_buf) {
       s->ple_buf = (float*)calloc((size_t)total_ple, sizeof(float));
     }
 
-    /* Step A: Dequant per_layer_token_embd[token] → temp_embd[8960]
+    /* Step A: Dequant per_layer_token_embd[token] → temp_embd
      * The embedding tensor is [total_ple, vocab_size] in GGUF row-major,
      * so one token's data is at row offset = token * row_bytes. */
-    float temp_embd[8960]; /* stack buffer, total_ple <= 8960 */
+    float temp_embd[16384]; /* stack buffer, sized for total_ple up to 16384 (E4B=10752) */
+    if (total_ple > 16384) return s->logits; /* safety guard — should not happen */
     {
       size_t type_size = tq_ggml_type_size(model->ple_embedding_type);
       int blck = tq_ggml_type_blck(model->ple_embedding_type);
@@ -2368,11 +2369,11 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
       temp_embd[i] *= ple_scale;
     }
 
-    /* Step B: per_layer_model_proj @ embed_raw → temp_proj[8960]
-     * ple_proj is [total_ple, hidden_dim] FP32 (rows=8960, cols=1536).
+    /* Step B: per_layer_model_proj @ embed_raw → temp_proj
+     * ple_proj is [total_ple, hidden_dim] FP32. For E4B: [10752, 2560].
      * We need: for each output row d in [0, total_ple): dot(ple_proj[d,:], s->x[:])
      * Note: s->x already has the scaled embedding from above. */
-    float temp_proj[8960];
+    float temp_proj[16384];
     tq_matmul(temp_proj, s->x, model->ple_proj, total_ple, dim);
 
     /* Scale by 1/sqrt(hidden_dim) */
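
One note on the design choice, with a quick back-of-the-envelope check (mine, not from the commit): the fix keeps both scratch arrays on the stack, so tq_forward now carries two 64 KiB buffers, and the new guard makes the 16384-float ceiling explicit instead of silently overrunning again on a hypothetical larger model.

/* Quick check of the stack footprint of the enlarged buffers
 * (illustration only; the numbers follow from the 16384-float size in the diff). */
#include <stdio.h>

int main(void) {
    size_t per_buffer = 16384u * sizeof(float);   /* 65536 bytes = 64 KiB */
    printf("per buffer: %zu bytes, temp_embd + temp_proj: %zu bytes\n",
           per_buffer, 2u * per_buffer);
    return 0;
}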
