@@ -13959,6 +13959,8 @@ static void deltanet_forward(tq_model_t* model, tq_state_t* s, int l) {
1395913959 float* K_all = s->delta_qkv + dn_kv * dk;
1396013960 float* V_all = s->delta_qkv + 2 * dn_kv * dk;
1396113961
13962+ /* L2 normalization of Q/K: REQUIRED for Qwen3.5-4B.
13963+ * Removing this causes complete output collapse. */
1396213964 for (int h = 0; h < dn_kv; h++) {
1396313965 l2_normalize(Q_all + h * dk, dk);
1396413966 l2_normalize(K_all + h * dk, dk);
@@ -13991,50 +13993,56 @@ static void deltanet_forward(tq_model_t* model, tq_state_t* s, int l) {
1399113993 float decay = decay_vals[h]; /* precomputed exp(gate) */
1399213994
1399313995#ifdef __ARM_NEON
13994- /* NEON-optimized: fused decay + sk computation.
13995- * For each row i of state: decay state, accumulate sk.
13996- * sk[j] = sum_i(S[i,j] * K[i]) after decay */
13996+ /* NEON-optimized: llama.cpp-aligned delta rule.
13997+ * Formula (matches gated_delta_net.cu):
13998+ * sk = S @ K (BEFORE decay)
13999+ * d = (V - g*sk) * beta
14000+ * S = g*S + K * d
14001+ * o = S @ Q
14002+ * The key difference from the previous impl: sk is computed
14003+ * on the ORIGINAL state, then decay is applied to both sk
14004+ * (in the delta) and S (in the update). This prevents
14005+ * short-prompt instability where early tokens have near-zero
14006+ * state and the decay-first approach loses information. */
1399714007 float* sk = s->delta_sk;
1399814008 memset(sk, 0, (size_t)dv * sizeof(float));
1399914009
14000- float32x4_t vdecay = vdupq_n_f32( decay);
14010+ /* Step A: sk = S @ K (on original state, BEFORE decay) */
1400114011 for (int i = 0; i < dk; i++) {
1400214012 float* sp = sh + i * dv;
1400314013 float ki = kh[i];
1400414014 float32x4_t vki = vdupq_n_f32(ki);
1400514015 int j = 0;
1400614016 for (; j + 3 < dv; j += 4) {
1400714017 float32x4_t vs = vld1q_f32(sp + j);
14008- vs = vmulq_f32(vs, vdecay); /* decay */
14009- vst1q_f32(sp + j, vs); /* store decayed state */
1401014018 float32x4_t vsk = vld1q_f32(sk + j);
14011- vsk = vfmaq_f32(vsk, vs, vki); /* accumulate sk */
14019+ vsk = vfmaq_f32(vsk, vs, vki);
1401214020 vst1q_f32(sk + j, vsk);
1401314021 }
1401414022 for (; j < dv; j++) {
14015- sp[j] *= decay;
1401614023 sk[j] += sp[j] * ki;
1401714024 }
1401814025 }
1401914026
14020- /* Delta : d = beta * (V - sk) */
14027+ /* Step B: d = (V - g*sk) * beta */
1402114028 float* d_vec = s->delta_dvec;
1402214029 float32x4_t vbeta = vdupq_n_f32(beta_h);
14030+ float32x4_t vdecay = vdupq_n_f32(decay);
1402314031 {
1402414032 int j = 0;
1402514033 for (; j + 3 < dv; j += 4) {
1402614034 float32x4_t vv = vld1q_f32(vh + j);
14027- float32x4_t vs = vld1q_f32(sk + j);
14028- float32x4_t vd = vmulq_f32(vbeta, vsubq_f32(vv, vs ));
14035+ float32x4_t vsk = vld1q_f32(sk + j);
14036+ float32x4_t vd = vmulq_f32(vbeta, vsubq_f32(vv, vmulq_f32(vdecay, vsk) ));
1402914037 vst1q_f32(d_vec + j, vd);
1403014038 }
1403114039 for (; j < dv; j++) {
14032- d_vec[j] = beta_h * (vh[j] - sk[j]);
14040+ d_vec[j] = beta_h * (vh[j] - decay * sk[j]);
1403314041 }
1403414042 }
1403514043
14036- /* State update : S[i][j] += K[i] * d[j] (rank-1 outer product )
14037- * + Output: o[j] = sum_i(S[i,j] * Q[i]) (simultaneously) */
14044+ /* Step C: S = g*S + K*d (state update)
14045+ * + Output: o = S @ Q (simultaneously) */
1403814046 float* oh = s->delta_out + h * dv;
1403914047 memset(oh, 0, (size_t)dv * sizeof(float));
1404014048
@@ -14047,26 +14055,24 @@ static void deltanet_forward(tq_model_t* model, tq_state_t* s, int l) {
1404714055 int j = 0;
1404814056 for (; j + 3 < dv; j += 4) {
1404914057 float32x4_t vs = vld1q_f32(sp + j);
14058+ vs = vmulq_f32(vs, vdecay); /* S = g*S */
1405014059 float32x4_t vd = vld1q_f32(d_vec + j);
14051- vs = vfmaq_f32(vs, vki, vd); /* S += K[i] * d */
14060+ vs = vfmaq_f32(vs, vki, vd); /* S += K[i] * d */
1405214061 vst1q_f32(sp + j, vs);
1405314062 float32x4_t vo = vld1q_f32(oh + j);
14054- vo = vfmaq_f32(vo, vs, vqi); /* o += S * Q[i] */
14063+ vo = vfmaq_f32(vo, vs, vqi); /* o += S * Q[i] */
1405514064 vst1q_f32(oh + j, vo);
1405614065 }
1405714066 for (; j < dv; j++) {
14058- sp[j] += ki * d_vec[j];
14067+ sp[j] = decay * sp[j] + ki * d_vec[j];
1405914068 oh[j] += sp[j] * qi;
1406014069 }
1406114070 }
1406214071#else
14063- /* Scalar fallback */
14064- /* Decay: S = S * exp(gate) */
14065- for (int i = 0; i < dk * dv; i++) {
14066- sh[i] *= decay;
14067- }
14072+ /* Scalar fallback — llama.cpp-aligned formula:
14073+ * sk = S @ K, d = (V - g*sk) * beta, S = g*S + K*d, o = S @ Q */
1406814074
14069- /* Compute sk */
14075+ /* Compute sk = S @ K (original state, before decay) */
1407014076 float* sk = s->delta_sk;
1407114077 for (int j = 0; j < dv; j++) {
1407214078 float sum = 0.0f;
@@ -14076,20 +14082,20 @@ static void deltanet_forward(tq_model_t* model, tq_state_t* s, int l) {
1407614082 sk[j] = sum;
1407714083 }
1407814084
14079- /* Delta */
14085+ /* Delta: d = (V - g*sk) * beta */
1408014086 float* d_vec = s->delta_dvec;
1408114087 for (int j = 0; j < dv; j++) {
14082- d_vec[j] = beta_h * (vh[j] - sk[j]);
14088+ d_vec[j] = beta_h * (vh[j] - decay * sk[j]);
1408314089 }
1408414090
14085- /* State update */
14091+ /* State update: S = g*S + K*d */
1408614092 for (int i = 0; i < dk; i++) {
1408714093 for (int j = 0; j < dv; j++) {
14088- sh[i * dv + j] += kh[i] * d_vec[j];
14094+ sh[i * dv + j] = decay * sh[i * dv + j] + kh[i] * d_vec[j];
1408914095 }
1409014096 }
1409114097
14092- /* Output */
14098+ /* Output: o = S @ Q */
1409314099 float* oh = s->delta_out + h * dv;
1409414100 for (int j = 0; j < dv; j++) {
1409514101 float sum = 0.0f;
@@ -16255,6 +16261,15 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
1625516261 }
1625616262 }
1625716263
16264+ /* Suppress <think> token to disable thinking/reasoning mode.
16265+ * Qwen3.5 models default to thinking mode which adds many tokens
16266+ * of internal reasoning before the actual answer. By suppressing
16267+ * the <think> special token, the model goes directly to answering. */
16268+ int think_token_id = tokenizer ? str_lookup(tokenizer, "<think>") : -1;
16269+ if (think_token_id >= 0 && think_token_id < vocab_size) {
16270+ state->logits[think_token_id] = -1e30f;
16271+ }
16272+
1625816273 /* Sample first generated token. The seed is configurable via
1625916274 * config->rng_seed (default 42); 0 falls back to 42 so existing
1626016275 * callers that never set rng_seed get bit-identical behaviour. */
@@ -16271,6 +16286,7 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
1627116286 int generated = 0;
1627216287 int output_pos = 0;
1627316288 int prev_token = prompt_tokens[n_prompt - 1];
16289+ int seen_nonwhitespace = 0; /* track whether we've emitted non-whitespace yet */
1627416290
1627516291 /* EOS token IDs — check common values across model families.
1627616292 * Qwen3.5: eos = 248044 (<|endoftext|>), 248046 (<|im_end|>)
@@ -16366,6 +16382,19 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
1636616382 strstr(piece, "<1st>") || strstr(piece, "<2nd>") || strstr(piece, "<3rd>")) {
1636716383 piece = "";
1636816384 }
16385+ /* Skip leading whitespace-only tokens (Qwen3.5 thinking mode
16386+ * produces <think>...</think> which gets filtered, but the
16387+ * surrounding newlines remain as plain text tokens).
16388+ * Only skip before any non-whitespace content has been emitted. */
16389+ if (!seen_nonwhitespace && piece[0] != '\0') {
16390+ const char* p = piece;
16391+ while (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') p++;
16392+ if (*p == '\0') {
16393+ piece = ""; /* all whitespace — skip */
16394+ } else {
16395+ seen_nonwhitespace = 1;
16396+ }
16397+ }
1636916398 }
1637016399 if (should_stop) break;
1637116400
@@ -16387,7 +16416,11 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
1638716416 prev_token = next_token;
1638816417 tq_forward(model, state, next_token, pos);
1638916418 pos++;
16390- generated++;
16419+ /* Only count tokens that produced visible output toward the limit.
16420+ * Leading whitespace from thinking mode should not consume the budget. */
16421+ if (seen_nonwhitespace) {
16422+ generated++;
16423+ }
1639116424
1639216425 /* Apply repetition penalty before sampling */
1639316426 if (rep_penalty > 1.0f) {
@@ -16405,6 +16438,11 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
1640516438 }
1640616439 }
1640716440
16441+ /* Suppress <think> token to prevent entering thinking mode */
16442+ if (think_token_id >= 0 && think_token_id < vocab_size) {
16443+ state->logits[think_token_id] = -1e30f;
16444+ }
16445+
1640816446 /* Sample next token */
1640916447 next_token = tq_sample_topp(state->logits, vocab_size,
1641016448 config->temperature, config->top_p,
0 commit comments