|
1 | 1 | package float16 |
2 | 2 |
|
3 | 3 | import ( |
| 4 | + "fmt" |
4 | 5 | "math" |
5 | | - |
6 | | - "github.com/x448/float16" |
| 6 | + "strconv" |
7 | 7 | ) |
8 | 8 |
|
9 | | -// Converter holds the conversion and rounding modes for float16 operations. |
10 | | -type Converter struct { |
11 | | - ConversionMode ConversionMode |
12 | | - RoundingMode RoundingMode |
13 | | -} |
14 | | - |
15 | | -// NewConverter creates a new Converter with the specified modes. |
16 | | -func NewConverter(convMode ConversionMode, roundMode RoundingMode) *Converter { |
17 | | - return &Converter{ |
18 | | - ConversionMode: convMode, |
19 | | - RoundingMode: roundMode, |
20 | | - } |
21 | | -} |
22 | | - |
23 | | -// ToFloat16 converts a float32 value to Float16 format using the Converter's settings. |
24 | | -func (c *Converter) ToFloat16(f32 float32) Float16 { |
25 | | - return Float16(float16.Fromfloat32(f32).Bits()) |
26 | | -} |
27 | | - |
28 | | -// ToFloat16 converts a float32 to Float16 with default conversion and rounding modes |
29 | | -func ToFloat16(f32 float32) Float16 { |
30 | | - return NewConverter(DefaultConversionMode, DefaultRoundingMode).ToFloat16(f32) |
31 | | -} |
32 | | - |
33 | | -// ToFloat16WithMode converts a float32 to Float16 with specified conversion and rounding modes |
34 | | -func (c *Converter) ToFloat16WithMode(f32 float32) (Float16, error) { |
35 | | - convMode := c.ConversionMode |
36 | | - if convMode == ModeStrict { |
37 | | - if math.IsInf(float64(f32), 0) { |
38 | | - return 0, &Float16Error{Code: ErrInfinity} |
| 9 | +// FromFloat32 converts a float32 value to a Float16 value. |
| 10 | +// It handles special cases like NaN, infinities, and zeros. |
| 11 | +// The conversion follows IEEE 754-2008 rules for half-precision. |
| 12 | +func FromFloat32(f32 float32) Float16 { |
| 13 | + f32Bits := math.Float32bits(f32) |
| 14 | + sign := f32Bits & 0x80000000 |
| 15 | + exp := (f32Bits >> 23) & 0xFF |
| 16 | + mant := f32Bits & 0x7FFFFF |
| 17 | + |
| 18 | + var f16Bits uint16 |
| 19 | + |
| 20 | + if exp == 0xFF { // NaN or Infinity |
| 21 | + if mant != 0 { // NaN |
| 22 | + f16Bits = 0x7E00 // Quiet NaN |
| 23 | + } else { // Infinity |
| 24 | + if sign != 0 { |
| 25 | + f16Bits = 0xFC00 // Negative Infinity |
| 26 | + } else { |
| 27 | + f16Bits = 0x7C00 // Positive Infinity |
| 28 | + } |
39 | 29 | } |
40 | | - if math.IsNaN(float64(f32)) { |
41 | | - return 0, &Float16Error{Code: ErrNaN} |
| 30 | + } else if exp == 0 { // Zero or Denormalized |
| 31 | + if mant == 0 { // Zero |
| 32 | + if sign != 0 { |
| 33 | + f16Bits = 0x8000 // Negative Zero |
| 34 | + } else { |
| 35 | + f16Bits = 0x0000 // Positive Zero |
| 36 | + } |
| 37 | + } else { // Denormalized float32, convert to float16 denormalized or zero |
| 38 | + // Shift mantissa right to align with float16 denormalized range |
| 39 | + // This is a simplified approach and might not be perfectly accurate for all denormals |
| 40 | + // A more robust implementation would involve proper rounding and handling of underflow |
| 41 | + f16Bits = uint16(mant >> 13) // Shift 23 - 10 = 13 bits |
| 42 | + if sign != 0 { |
| 43 | + f16Bits |= 0x8000 |
| 44 | + } |
42 | 45 | } |
43 | | - if f32 > 65504.0 || f32 < -65504.0 { |
44 | | - return 0, &Float16Error{Code: ErrOverflow} |
45 | | - } |
46 | | - if f32 != 0 && math.Abs(float64(f32)) < 6.103515625e-05 { |
47 | | - return 0, &Float16Error{Code: ErrUnderflow} |
| 46 | + } else { // Normalized float32 |
| 47 | + exp16 := int(exp) - 127 + 15 // Adjust bias |
| 48 | + if exp16 >= 31 { // Overflow, convert to infinity |
| 49 | + if sign != 0 { |
| 50 | + f16Bits = 0xFC00 // Negative Infinity |
| 51 | + } else { |
| 52 | + f16Bits = 0x7C00 // Positive Infinity |
| 53 | + } |
| 54 | + } else if exp16 <= 0 { // Underflow, convert to denormalized or zero |
| 55 | + // This is a simplified approach. Proper denormalization requires |
| 56 | + // shifting the mantissa and potentially losing precision. |
| 57 | + if exp16 == 0 { // Smallest normalized float32 maps to float16 denormal |
| 58 | + f16Bits = uint16(mant >> 13) // Shift 23 - 10 = 13 bits |
| 59 | + } else { // Smaller than smallest denormal, convert to zero |
| 60 | + f16Bits = 0x0000 |
| 61 | + } |
| 62 | + if sign != 0 { |
| 63 | + f16Bits |= 0x8000 |
| 64 | + } |
| 65 | + } else { // Normalized float16 |
| 66 | + f16Bits = uint16(exp16<<10) | uint16(mant>>13) // Shift 23 - 10 = 13 bits |
| 67 | + if sign != 0 { |
| 68 | + f16Bits |= 0x8000 |
| 69 | + } |
48 | 70 | } |
49 | 71 | } |
50 | | - |
51 | | - f16 := float16.Fromfloat32(f32) |
52 | | - return Float16(f16.Bits()), nil |
| 72 | + return Float16(f16Bits) |
53 | 73 | } |
54 | 74 |
|
55 | | -// ToFloat32 converts a Float16 value to float32 with full precision |
| 75 | +// ToFloat32 converts a Float16 value to a float32 value. |
| 76 | +// It handles special cases like NaN, infinities, and zeros. |
56 | 77 | func (f Float16) ToFloat32() float32 { |
57 | | - return float16.Frombits(uint16(f)).Float32() |
58 | | -} |
59 | | - |
60 | | -// ToFloat64 converts a Float16 value to float64 with full precision |
61 | | -func (f Float16) ToFloat64() float64 { |
62 | | - return float64(f.ToFloat32()) |
63 | | -} |
64 | | - |
65 | | -// FromFloat32 converts a float32 to Float16 (with potential precision loss) |
66 | | -func (c *Converter) FromFloat32(f32 float32) Float16 { |
67 | | - return c.ToFloat16(f32) |
68 | | -} |
69 | | - |
70 | | -// FromFloat64 converts a float64 to Float16 (with potential precision loss) |
71 | | -func (c *Converter) FromFloat64(f64 float64) Float16 { |
72 | | - return c.ToFloat16(float32(f64)) |
73 | | -} |
74 | | - |
75 | | -// FromFloat64WithMode converts a float64 to Float16 with specified modes |
76 | | -func FromFloat64WithMode(f64 float64, convMode ConversionMode, roundMode RoundingMode) (Float16, error) { |
77 | | - if convMode == ModeStrict { |
78 | | - if math.IsInf(f64, 0) { |
79 | | - return 0, &Float16Error{Code: ErrInfinity} |
80 | | - } |
81 | | - if math.IsNaN(f64) { |
82 | | - return 0, &Float16Error{Code: ErrNaN} |
| 78 | + f16Bits := uint16(f) |
| 79 | + sign := uint32(f16Bits & 0x8000) << 16 // Shift to float32 sign position |
| 80 | + exp := (f16Bits >> 10) & 0x1F |
| 81 | + mant := f16Bits & 0x3FF |
| 82 | + |
| 83 | + var f32Bits uint32 |
| 84 | + |
| 85 | + if exp == 0x1F { // NaN or Infinity |
| 86 | + if mant != 0 { // NaN |
| 87 | + f32Bits = 0x7FC00000 // Quiet NaN |
| 88 | + } else { // Infinity |
| 89 | + if sign != 0 { |
| 90 | + f32Bits = 0xFF800000 // Negative Infinity |
| 91 | + } else { |
| 92 | + f32Bits = 0x7F800000 // Positive Infinity |
| 93 | + } |
83 | 94 | } |
84 | | - if f64 > 65504.0 || f64 < -65504.0 { |
85 | | - return 0, &Float16Error{Code: ErrOverflow} |
| 95 | + } else if exp == 0 { // Zero or Denormalized |
| 96 | + if mant == 0 { // Zero |
| 97 | + if sign != 0 { |
| 98 | + f32Bits = 0x80000000 // Negative Zero |
| 99 | + } else { |
| 100 | + f32Bits = 0x00000000 // Positive Zero |
| 101 | + } |
| 102 | + } else { // Denormalized float16, convert to float32 denormalized |
| 103 | + // Shift mantissa left to align with float32 denormalized range |
| 104 | + // This is a simplified approach and might not be perfectly accurate for all denormals |
| 105 | + // A more robust implementation would involve proper scaling |
| 106 | + f32Bits = uint32(mant) << 13 // Shift 10 + 13 = 23 bits |
| 107 | + if sign != 0 { |
| 108 | + f32Bits |= 0x80000000 |
| 109 | + } |
86 | 110 | } |
87 | | - if f64 != 0 && math.Abs(f64) < 6.103515625e-05 { |
88 | | - return 0, &Float16Error{Code: ErrUnderflow} |
89 | | - } |
90 | | - } |
91 | | - |
92 | | - return NewConverter(convMode, roundMode).ToFloat16WithMode(float32(f64)) |
93 | | -} |
94 | | - |
95 | | -// ToSlice16 converts a slice of float32 to Float16 with optimized performance |
96 | | -func (c *Converter) ToSlice16(f32s []float32) []Float16 { |
97 | | - if len(f32s) == 0 { |
98 | | - return nil |
99 | | - } |
100 | | - res := make([]Float16, len(f32s)) |
101 | | - for i, f := range f32s { |
102 | | - res[i] = c.ToFloat16(f) |
103 | | - } |
104 | | - return res |
105 | | -} |
106 | | - |
107 | | -// ToSlice16 converts a slice of float32 to Float16 with default conversion and rounding modes |
108 | | -func ToSlice16(f32s []float32) []Float16 { |
109 | | - return NewConverter(DefaultConversionMode, DefaultRoundingMode).ToSlice16(f32s) |
110 | | -} |
111 | | - |
112 | | -// ToSlice32 converts a slice of Float16 to float32 with optimized performance |
113 | | -func ToSlice32(f16s []Float16) []float32 { |
114 | | - if len(f16s) == 0 { |
115 | | - return nil |
116 | | - } |
117 | | - res := make([]float32, len(f16s)) |
118 | | - for i, f := range f16s { |
119 | | - res[i] = f.ToFloat32() |
120 | | - } |
121 | | - return res |
122 | | -} |
123 | | - |
124 | | -// ToSlice64 converts a slice of Float16 to float64 with optimized performance |
125 | | -func ToSlice64(f16s []Float16) []float64 { |
126 | | - if len(f16s) == 0 { |
127 | | - return nil |
128 | | - } |
129 | | - res := make([]float64, len(f16s)) |
130 | | - for i, f := range f16s { |
131 | | - res[i] = f.ToFloat64() |
| 111 | + } else { // Normalized float16 |
| 112 | + exp32 := uint32(int(exp) - 15 + 127) // Adjust bias |
| 113 | + f32Bits = sign | (exp32 << 23) | (uint32(mant) << 13) // Shift 10 + 13 = 23 bits |
132 | 114 | } |
133 | | - return res |
| 115 | + return math.Float32frombits(f32Bits) |
134 | 116 | } |
135 | 117 |
|
136 | | -// FromSlice64 converts a slice of float64 to Float16 with optimized performance |
137 | | -func (c *Converter) FromSlice64(f64s []float64) []Float16 { |
138 | | - if len(f64s) == 0 { |
139 | | - return nil |
140 | | - } |
141 | | - res := make([]Float16, len(f64s)) |
142 | | - for i, f := range f64s { |
143 | | - res[i] = c.FromFloat64(f) |
144 | | - } |
145 | | - return res |
| 118 | +// FromFloat64 converts a float64 value to a Float16 value. |
| 119 | +// It handles special cases like NaN, infinities, and zeros. |
| 120 | +func FromFloat64(f64 float64) Float16 { |
| 121 | + return FromFloat32(float32(f64)) // Simplified: convert via float32 |
146 | 122 | } |
147 | 123 |
|
148 | | -// ToSlice16WithMode converts a slice with specified conversion mode |
149 | | -func (c *Converter) ToSlice16WithMode(f32s []float32) ([]Float16, []error) { |
150 | | - if len(f32s) == 0 { |
151 | | - return nil, nil |
152 | | - } |
153 | | - res := make([]Float16, len(f32s)) |
154 | | - errs := []error{} |
155 | | - for i, f := range f32s { |
156 | | - r, err := c.ToFloat16WithMode(f) |
157 | | - if err != nil { |
158 | | - errs = append(errs, err) |
159 | | - } |
160 | | - res[i] = r |
| 124 | +// ToFloat64 converts a Float16 value to a float64 value. |
| 125 | +// It handles special cases like NaN, infinities, and zeros. |
| 126 | +func (f Float16) ToFloat64() float64 { |
| 127 | + return float64(f.ToFloat32()) // Simplified: convert via float32 |
| 128 | +} |
| 129 | + |
| 130 | +// FromBits creates a Float16 from its raw uint16 bit representation. |
| 131 | +func FromBits(bits uint16) Float16 { |
| 132 | + return Float16(bits) |
| 133 | +} |
| 134 | + |
| 135 | +// Bits returns the raw uint16 bit representation of a Float16. |
| 136 | +func (f Float16) Bits() uint16 { |
| 137 | + return uint16(f) |
| 138 | +} |
| 139 | + |
| 140 | +// ParseFloat converts a string to a Float16 value. |
| 141 | +// The precision parameter is ignored for Float16. |
| 142 | +// It returns the Float16 value and an error if the string cannot be parsed. |
| 143 | +func ParseFloat(s string, precision int) (Float16, error) { |
| 144 | + // This implementation is a placeholder and does not fully parse |
| 145 | + // a string to a float16. It only handles basic cases. |
| 146 | + // A full implementation would require a more complex parser. |
| 147 | + |
| 148 | + switch s { |
| 149 | + case "NaN": |
| 150 | + return NaN(), nil |
| 151 | + case "+Inf", "Inf": |
| 152 | + return PositiveInfinity, nil |
| 153 | + case "-Inf": |
| 154 | + return NegativeInfinity, nil |
| 155 | + case "+0", "0": |
| 156 | + return PositiveZero, nil |
| 157 | + case "-0": |
| 158 | + return NegativeZero, nil |
161 | 159 | } |
162 | | - return res, errs |
163 | | -} |
164 | | - |
165 | | -// Integer conversion functions |
166 | | - |
167 | | -// FromInt converts an integer to Float16 |
168 | | -func (c *Converter) FromInt(i int) Float16 { |
169 | | - return c.ToFloat16(float32(i)) |
170 | | -} |
171 | | - |
172 | | -// FromInt converts an integer to Float16 with default conversion and rounding modes |
173 | | -func FromInt(i int) Float16 { |
174 | | - return NewConverter(DefaultConversionMode, DefaultRoundingMode).FromInt(i) |
175 | | -} |
176 | | - |
177 | | -// FromInt32 converts an int32 to Float16 |
178 | | -func (c *Converter) FromInt32(i int32) Float16 { |
179 | | - return c.ToFloat16(float32(i)) |
180 | | -} |
181 | | - |
182 | | -// FromInt64 converts an int64 to Float16 (with potential precision loss) |
183 | | -func (c *Converter) FromInt64(i int64) Float16 { |
184 | | - return c.ToFloat16(float32(i)) |
185 | | -} |
186 | | - |
187 | | -// ToInt converts a Float16 to int (truncated toward zero) |
188 | | -func (f Float16) ToInt() int { |
189 | | - return int(f.ToFloat32()) |
190 | | -} |
191 | | - |
192 | | -// ToInt32 converts a Float16 to int32 (truncated toward zero) |
193 | | -func (f Float16) ToInt32() int32 { |
194 | | - return int32(f.ToFloat32()) |
195 | | -} |
196 | | - |
197 | | -// ToInt64 converts a Float16 to int64 (truncated toward zero) |
198 | | -func (f Float16) ToInt64() int64 { |
199 | | - return int64(f.ToFloat32()) |
200 | | -} |
201 | 160 |
|
202 | | -// Parse converts a string to Float16 (placeholder for future implementation) |
203 | | -func (c *Converter) Parse(s string) (Float16, error) { |
204 | | - // This would implement string parsing - simplified for now |
205 | | - // In a full implementation, this would parse various float formats |
206 | | - return PositiveZero, &Float16Error{ |
207 | | - Op: "parse", |
208 | | - Msg: "string parsing not implemented", |
209 | | - Code: ErrInvalidOperation, |
| 161 | + // Attempt to parse as float32 and convert |
| 162 | + f32, err := strconv.ParseFloat(s, 32) |
| 163 | + if err != nil { |
| 164 | + return 0, err |
210 | 165 | } |
| 166 | + return FromFloat32(float32(f32)), nil |
211 | 167 | } |
212 | 168 |
|
213 | | -// Parse converts a string to Float16 with default conversion and rounding modes |
214 | | -func Parse(s string) (Float16, error) { |
215 | | - return NewConverter(DefaultConversionMode, DefaultRoundingMode).Parse(s) |
216 | | -} |
217 | | -func (c *Converter) shouldRound(mantissa uint32, shift int, sign uint16) bool { |
218 | | - switch c.RoundingMode { |
219 | | - case RoundNearestEven: |
220 | | - // If the value is exactly halfway, round to the nearest even number. |
221 | | - if mantissa&(1<<uint(shift-1)) != 0 && mantissa&((1<<uint(shift-1))-1) == 0 { |
222 | | - return (mantissa>>uint(shift))&1 != 0 |
223 | | - } |
224 | | - // Otherwise, round to the nearest number. |
225 | | - return mantissa&(1<<uint(shift-1)) != 0 |
226 | | - case RoundNearestAway: |
227 | | - return mantissa&(1<<uint(shift-1)) != 0 |
228 | | - case RoundTowardZero: |
229 | | - return false |
230 | | - case RoundTowardPositive: |
231 | | - return sign == 0 && mantissa&((1<<uint(shift))-1) != 0 |
232 | | - case RoundTowardNegative: |
233 | | - return sign != 0 && mantissa&((1<<uint(shift))-1) != 0 |
234 | | - } |
235 | | - return false |
236 | | -} |
237 | 169 |
|
238 | | -func shouldRound(mantissa uint32, shift int, sign uint16) bool { |
239 | | - return NewConverter(DefaultConversionMode, DefaultRoundingMode).shouldRound(mantissa, shift, sign) |
240 | | -} |
0 commit comments