Skip to content

Commit 33ab34d

Browse files
committed
fix: Correct ToFloat16 subnormal conversion
1 parent 38c084e commit 33ab34d

1 file changed

Lines changed: 28 additions & 41 deletions

File tree

convert.go

Lines changed: 28 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -44,64 +44,51 @@ func ToFloat16(f32 float32) Float16 {
4444
return Float16((uint16(sign) << 15) | 0x7C00 | nanMant)
4545
}
4646

47-
// For very small numbers that would be subnormal in float16
48-
if exp32 <= 112 { // 127 - 15 (float32 bias - float16 bias)
49-
// This would be subnormal or zero in float16
50-
if exp32 <= 101 { // 127 - 15 - 11 (11 bits of precision)
51-
// Too small to represent, flush to zero
52-
if sign != 0 {
53-
return NegativeZero
54-
}
55-
return PositiveZero
56-
}
57-
58-
// Subnormal in float16
59-
shift := 125 - exp32 // 126 - exp32 + 1 (add implicit leading 1)
60-
mant32 |= 0x00800000 // Add implicit leading 1
61-
mant16 := uint16((mant32 >> (shift + 13)) & 0x03FF)
62-
63-
// Handle rounding
64-
roundBit := (mant32 >> (shift + 12)) & 0x1
65-
stickyMask := uint32((1 << (shift + 12)) - 1)
66-
stickyBit := uint32(0)
67-
if (mant32 & stickyMask) != 0 {
68-
stickyBit = 1
69-
}
70-
71-
if (roundBit | stickyBit) != 0 {
72-
mant16++
73-
// Check for carry
74-
if (mant16 & 0x0400) != 0 {
75-
mant16 = 0x0200 // 1.0 * 2^-10 (smallest normal)
76-
exp32 = 0x71 // -14 + 127 (float32 bias)
77-
return Float16((uint16(sign) << 15) | (uint16(exp32-0x70) << 10) | (mant16 & 0x03FF))
78-
}
79-
}
80-
81-
return Float16((uint16(sign) << 15) | mant16)
47+
// For subnormal float32, we need to convert to float16 subnormal
48+
if exp32 == 0 {
49+
// The value is subnormal, so we need to convert it to a float16 subnormal
50+
// or flush to zero if it's too small.
51+
// A float32 subnormal is 0.mantissa * 2^-126 (no implicit leading 1).
52+
// We need to convert it to a float16 subnormal, which is 0.mantissa * 2^-14.
53+
// So we need to shift the mantissa by 126 - 14 = 112 bits.
54+
// Since the float32 mantissa has 23 bits, we will lose a lot of precision.
55+
// Widening float32 to float64 is exact, so we can do this by converting the float32 to float64 and then to float16.
56+
return FromFloat64(float64(f32))
8257
}
83-
8458
// Normal number in float16
85-
exp16 := exp32 - 0x70 // float32 bias (127) - float16 bias (15) + 1 (for rounding)
86-
if exp16 >= 0x1F {
59+
exp16 := exp32 - 127 + 15
60+
if exp16 >= 31 {
8761
// Overflow
8862
if sign != 0 {
8963
return NegativeInfinity
9064
}
9165
return PositiveInfinity
9266
}
9367

68+
if exp16 <= 0 {
69+
// Underflow to subnormal
70+
shift := uint(1 - exp16)
71+
mant32 |= 0x800000 // Add implicit leading 1
72+
mant16 := uint16(mant32 >> (shift + 13))
73+
// Rounding
74+
roundBit := (mant32 >> (shift + 12)) & 1
75+
if roundBit != 0 {
76+
mant16++
77+
}
78+
return Float16((uint16(sign) << 15) | mant16)
79+
}
80+
9481
// Extract mantissa bits (10 bits) with rounding
95-
mant16 := uint16((mant32 + 0x00000FFF + ((mant32 >> 13) & 1)) >> 13)
82+
mant16 := uint16((mant32 + 0x1000) >> 13)
9683

9784
// Check for overflow in mantissa (due to rounding)
9885
if (mant16 & 0x0400) != 0 {
99-
mant16 >>= 1
86+
mant16 = 0
10087
exp16++
10188
}
10289

10390
// Check for overflow after rounding
104-
if exp16 >= 0x1F {
91+
if exp16 >= 31 {
10592
if sign != 0 {
10693
return NegativeInfinity
10794
}

0 commit comments

Comments
 (0)