@@ -44,64 +44,51 @@ func ToFloat16(f32 float32) Float16 {
4444 return Float16 ((uint16 (sign ) << 15 ) | 0x7C00 | nanMant )
4545 }
4646
47- // For very small numbers that would be subnormal in float16
48- if exp32 <= 112 { // 127 - 15 (float32 bias - float16 bias)
49- // This would be subnormal or zero in float16
50- if exp32 <= 101 { // 127 - 15 - 11 (11 bits of precision)
51- // Too small to represent, flush to zero
52- if sign != 0 {
53- return NegativeZero
54- }
55- return PositiveZero
56- }
57-
58- // Subnormal in float16
59- shift := 125 - exp32 // 126 - exp32 + 1 (add implicit leading 1)
60- mant32 |= 0x00800000 // Add implicit leading 1
61- mant16 := uint16 ((mant32 >> (shift + 13 )) & 0x03FF )
62-
63- // Handle rounding
64- roundBit := (mant32 >> (shift + 12 )) & 0x1
65- stickyMask := uint32 ((1 << (shift + 12 )) - 1 )
66- stickyBit := uint32 (0 )
67- if (mant32 & stickyMask ) != 0 {
68- stickyBit = 1
69- }
70-
71- if (roundBit | stickyBit ) != 0 {
72- mant16 ++
73- // Check for carry
74- if (mant16 & 0x0400 ) != 0 {
75- mant16 = 0x0200 // 1.0 * 2^-10 (smallest normal)
76- exp32 = 0x71 // -14 + 127 (float32 bias)
77- return Float16 ((uint16 (sign ) << 15 ) | (uint16 (exp32 - 0x70 ) << 10 ) | (mant16 & 0x03FF ))
78- }
79- }
80-
81- return Float16 ((uint16 (sign ) << 15 ) | mant16 )
47+ // For subnormal float32, we need to convert to float16 subnormal
48+ if exp32 == 0 {
49+ // The value is subnormal, so we need to convert it to a float16 subnormal
50+ // or flush to zero if it's too small.
51+ // A float32 subnormal is value * 2^-126.
52+ // We need to convert it to a float16 subnormal, which is value * 2^-14.
53+ // So we need to shift the mantissa by 126 - 14 = 112 bits.
54+ // Since the float32 mantissa has 23 bits, we will lose a lot of precision.
55+ // We can approximate this by converting the float32 to float64 and then to float16.
56+ return FromFloat64 (float64 (f32 ))
8257 }
83-
8458 // Normal number in float16
85- exp16 := exp32 - 0x70 // float32 bias ( 127) - float16 bias (15) + 1 (for rounding)
86- if exp16 >= 0x1F {
59+ exp16 := exp32 - 127 + 15
60+ if exp16 >= 31 {
8761 // Overflow
8862 if sign != 0 {
8963 return NegativeInfinity
9064 }
9165 return PositiveInfinity
9266 }
9367
68+ if exp16 <= 0 {
69+ // Underflow to subnormal
70+ shift := uint (1 - exp16 )
71+ mant32 |= 0x800000 // Add implicit leading 1
72+ mant16 := uint16 (mant32 >> (shift + 13 ))
73+ // Rounding
74+ roundBit := (mant32 >> (shift + 12 )) & 1
75+ if roundBit != 0 {
76+ mant16 ++
77+ }
78+ return Float16 ((uint16 (sign ) << 15 ) | mant16 )
79+ }
80+
9481 // Extract mantissa bits (10 bits) with rounding
95- mant16 := uint16 ((mant32 + 0x00000FFF + (( mant32 >> 13 ) & 1 ) ) >> 13 )
82+ mant16 := uint16 ((mant32 + 0x1000 ) >> 13 )
9683
9784 // Check for overflow in mantissa (due to rounding)
9885 if (mant16 & 0x0400 ) != 0 {
99- mant16 >>= 1
86+ mant16 = 0
10087 exp16 ++
10188 }
10289
10390 // Check for overflow after rounding
104- if exp16 >= 0x1F {
91+ if exp16 >= 31 {
10592 if sign != 0 {
10693 return NegativeInfinity
10794 }
0 commit comments