Skip to content

Commit d0d3a82

Browse files
committed
feat(tensor): add QuantizeQ4K for float32 to Q4_K quantization
Asymmetric Q4_K quantization with per-sub-block 6-bit scales and mins. Used for Q5_0/Q6_K → Q4_K re-quantization in the GGUF loader to route all weight tensors through the fast Q4_K GEMV path.
1 parent d456c39 commit d0d3a82

1 file changed

Lines changed: 130 additions & 0 deletions

File tree

tensor/quantized_kquant.go

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,136 @@ func (q *Q4KStorage) DequantizeSubBlock(blkIdx, subIdx int, dst []float32) {
102102
}
103103
}
104104

105+
// QuantizeQ4K quantizes float32 values into Q4_K format.
106+
// Q4_K uses asymmetric quantization with per-sub-block 6-bit scales and mins,
107+
// shared fp16 super-block scale and dmin. 256 values per super-block, 144 bytes.
108+
func QuantizeQ4K(src []float32) *Q4KStorage {
109+
n := len(src)
110+
nBlocks := (n + q4KSuperBlockSize - 1) / q4KSuperBlockSize
111+
raw := make([]byte, nBlocks*q4KBlockBytes)
112+
113+
for bi := range nBlocks {
114+
off := bi * q4KSuperBlockSize
115+
var values [q4KSuperBlockSize]float32
116+
end := off + q4KSuperBlockSize
117+
if end > n {
118+
end = n
119+
}
120+
copy(values[:], src[off:end])
121+
122+
// Compute per-sub-block scale and min.
123+
var subScales, subMins [q4KNumSubBlocks]float32
124+
for sb := range q4KNumSubBlocks {
125+
sOff := sb * q4KSubBlockSize
126+
minVal, maxVal := values[sOff], values[sOff]
127+
for j := 1; j < q4KSubBlockSize; j++ {
128+
v := values[sOff+j]
129+
if v < minVal {
130+
minVal = v
131+
}
132+
if v > maxVal {
133+
maxVal = v
134+
}
135+
}
136+
if minVal > 0 {
137+
minVal = 0
138+
}
139+
subScales[sb] = (maxVal - minVal) / 15.0
140+
subMins[sb] = -minVal
141+
}
142+
143+
// Super-block scale and dmin from max sub-block values.
144+
var maxScale, maxMin float32
145+
for sb := range q4KNumSubBlocks {
146+
if subScales[sb] > maxScale {
147+
maxScale = subScales[sb]
148+
}
149+
if subMins[sb] > maxMin {
150+
maxMin = subMins[sb]
151+
}
152+
}
153+
154+
d := maxScale / 63.0
155+
dmin := maxMin / 63.0
156+
157+
// Quantize sub-block scales and mins to 6-bit.
158+
var scalesQ, minsQ [q4KNumSubBlocks]uint8
159+
for sb := range q4KNumSubBlocks {
160+
if d > 0 {
161+
v := int(float64(subScales[sb]/d) + 0.5)
162+
if v > 63 {
163+
v = 63
164+
}
165+
scalesQ[sb] = uint8(v)
166+
}
167+
if dmin > 0 {
168+
v := int(float64(subMins[sb]/dmin) + 0.5)
169+
if v > 63 {
170+
v = 63
171+
}
172+
minsQ[sb] = uint8(v)
173+
}
174+
}
175+
176+
blk := raw[bi*q4KBlockBytes : (bi+1)*q4KBlockBytes]
177+
178+
// fp16 d and dmin.
179+
dFP16 := float16.FromFloat32(d)
180+
dminFP16 := float16.FromFloat32(dmin)
181+
binary.LittleEndian.PutUint16(blk[0:2], dFP16.Bits())
182+
binary.LittleEndian.PutUint16(blk[2:4], dminFP16.Bits())
183+
184+
// Pack 6-bit scales and mins into 12 bytes at blk[4:16].
185+
for i := range 4 {
186+
blk[4+i] = (scalesQ[i] & 63) | ((scalesQ[4+i] >> 4) << 6)
187+
blk[8+i] = (minsQ[i] & 63) | ((minsQ[4+i] >> 4) << 6)
188+
}
189+
for i := range 4 {
190+
blk[12+i] = (scalesQ[4+i] & 0xF) | ((minsQ[4+i] & 0xF) << 4)
191+
}
192+
193+
// Quantize values to 4-bit per sub-block pair.
194+
dRT := dFP16.ToFloat32()
195+
dminRT := dminFP16.ToFloat32()
196+
for group := range 4 {
197+
sb0, sb1 := group*2, group*2+1
198+
sc0 := dRT * float32(scalesQ[sb0])
199+
mn0 := dminRT * float32(minsQ[sb0])
200+
sc1 := dRT * float32(scalesQ[sb1])
201+
mn1 := dminRT * float32(minsQ[sb1])
202+
203+
var invScale0, invScale1 float32
204+
if sc0 > 0 {
205+
invScale0 = 1.0 / sc0
206+
}
207+
if sc1 > 0 {
208+
invScale1 = 1.0 / sc1
209+
}
210+
211+
baseOut := group * 64
212+
baseQ := group * 32
213+
for l := range 32 {
214+
v0 := values[baseOut+l]
215+
v1 := values[baseOut+l+32]
216+
q0 := int(float64((v0+mn0)*invScale0) + 0.5)
217+
q1 := int(float64((v1+mn1)*invScale1) + 0.5)
218+
if q0 < 0 {
219+
q0 = 0
220+
} else if q0 > 15 {
221+
q0 = 15
222+
}
223+
if q1 < 0 {
224+
q1 = 0
225+
} else if q1 > 15 {
226+
q1 = 15
227+
}
228+
blk[16+baseQ+l] = byte(q0) | (byte(q1) << 4)
229+
}
230+
}
231+
}
232+
return &Q4KStorage{raw: raw, len: n}
233+
}
234+
105235
// Q4KStorage holds Q4_K quantized tensor data on CPU.
106236
type Q4KStorage struct {
107237
raw []byte // raw super-block data

0 commit comments

Comments
 (0)