@@ -2,12 +2,72 @@ package float16
22
33import (
44 "fmt"
5- "math/bits"
5+ )
6+
// ErrorCode identifies the category of failure carried by a Float16Error.
type ErrorCode int

// Error categories stored in Float16Error.Code.
const (
	// ErrInvalidOperation marks an operation that is not valid for its
	// operands. It is the zero value of ErrorCode.
	ErrInvalidOperation ErrorCode = iota
	// ErrNaN marks a failure caused by a NaN value.
	ErrNaN
	// ErrInfinity marks a failure caused by an infinite value.
	ErrInfinity
	// ErrOverflow marks a value too large in magnitude for float16.
	ErrOverflow
	// ErrUnderflow marks a value too small in magnitude for float16.
	ErrUnderflow
	// ErrDivisionByZero marks a division by zero.
	ErrDivisionByZero
)
18+
19+ // Float16Error provides detailed error information for float16 operations
20+ type Float16Error struct {
21+ Op string
22+ Msg string
23+ Code ErrorCode
24+ }
25+
26+ func (e * Float16Error ) Error () string {
27+ if e == nil {
28+ return "<nil>"
29+ }
30+ if e .Op != "" {
31+ return fmt .Sprintf ("float16 %s: %s" , e .Op , e .Msg )
32+ }
33+ return "float16: " + e .Msg
34+ }
35+
// RoundingMode selects how results are rounded during conversion and
// arithmetic.
type RoundingMode int

// Supported rounding modes.
const (
	// RoundNearestEven rounds to the nearest value, ties to even.
	RoundNearestEven RoundingMode = iota
	// RoundTowardZero rounds toward zero (truncation).
	RoundTowardZero
	// RoundTowardPositive rounds toward +Inf.
	RoundTowardPositive
	// RoundTowardNegative rounds toward -Inf.
	RoundTowardNegative
	// RoundNearestAway rounds to the nearest value, ties away from zero.
	RoundNearestAway
)
51+
// ConversionMode selects how conversions report exceptional results.
type ConversionMode int

// Supported conversion modes.
const (
	// ModeIEEE performs IEEE-style conversion, saturating to Inf/0 without
	// reporting errors. It is the zero value of ConversionMode.
	ModeIEEE ConversionMode = iota
	// ModeStrict reports errors for NaN, Inf, overflow, and underflow.
	ModeStrict
)
761
// Float16 is a 16-bit IEEE 754 half-precision floating-point value, held as
// its raw bit pattern.
type Float16 uint16

// Bits reports the IEEE 754 half-precision bit pattern held by f.
func (f Float16) Bits() uint16 {
	return uint16(f)
}

// FromBits reinterprets the bit pattern b as a Float16; no bits are changed.
func FromBits(b uint16) Float16 {
	return Float16(b)
}
70+
1171// IEEE 754 half-precision format constants
1272const (
1373 SignMask = 0x8000 // 0b1000000000000000 - Sign bit mask
@@ -31,10 +91,6 @@ const (
3191 Float32ExponentLen = 8 // Float32 exponent bits
3292 Float32MantissaLen = 23 // Float32 mantissa bits
3393
34- // Float64 constants for conversion
35- Float64ExponentBias = 1023 // IEEE 754 double precision bias
36- Float64MantissaLen = 52 // Float64 mantissa bits
37-
3894 // Special exponent values
3995 ExponentZero = 0 // Zero and subnormal numbers
4096 ExponentInfinity = 31 // Infinity and NaN
@@ -66,73 +122,6 @@ const (
66122 NegativeQNaN Float16 = 0xFE00 // Negative quiet NaN
67123)
68124
69- // ConversionMode defines how conversions handle edge cases
70- type ConversionMode int
71-
72- const (
73- // ModeIEEE uses standard IEEE 754 rounding and special value behavior
74- ModeIEEE ConversionMode = iota
75- // ModeStrict returns errors for overflow, underflow, and NaN
76- ModeStrict
77- // ModeFast optimizes for performance, may sacrifice some precision
78- ModeFast
79- // ModeExact preserves exact values when possible, errors on precision loss
80- ModeExact
81- )
82-
83- // RoundingMode defines IEEE 754 rounding behavior
84- type RoundingMode int
85-
86- const (
87- // RoundNearestEven rounds to nearest, ties to even (IEEE default)
88- RoundNearestEven RoundingMode = iota
89- // RoundNearestAway rounds to nearest, ties away from zero
90- RoundNearestAway
91- // RoundTowardZero truncates toward zero
92- RoundTowardZero
93- // RoundTowardPositive rounds toward +∞
94- RoundTowardPositive
95- // RoundTowardNegative rounds toward -∞
96- RoundTowardNegative
97- )
98-
99- // Float16Error represents errors that can occur during Float16 operations
100- type Float16Error struct {
101- Op string // Operation that caused the error
102- Value interface {} // Input value that caused the error
103- Msg string // Error message
104- Code ErrorCode // Specific error code
105- }
106-
107- // ErrorCode represents specific error types
108- type ErrorCode int
109-
110- const (
111- ErrOverflow ErrorCode = iota
112- ErrUnderflow
113- ErrInvalidOperation
114- ErrDivisionByZero
115- ErrInexact
116- ErrNaN
117- ErrInfinity
118- )
119-
120- func (e * Float16Error ) Error () string {
121- if e .Value != nil {
122- return fmt .Sprintf ("float16.%s: %s (value: %v)" , e .Op , e .Msg , e .Value )
123- }
124- return fmt .Sprintf ("float16.%s: %s" , e .Op , e .Msg )
125- }
126-
127- // Predefined error instances
128- var (
129- ErrOverflowError = & Float16Error {Code : ErrOverflow , Msg : "value too large for float16" }
130- ErrUnderflowError = & Float16Error {Code : ErrUnderflow , Msg : "value too small for float16" }
131- ErrNaNError = & Float16Error {Code : ErrNaN , Msg : "NaN in strict mode" }
132- ErrInfinityError = & Float16Error {Code : ErrInfinity , Msg : "infinity in strict mode" }
133- ErrDivByZeroError = & Float16Error {Code : ErrDivisionByZero , Msg : "division by zero" }
134- )
135-
136125// IsZero returns true if the Float16 value represents zero (positive or negative)
137126func (f Float16 ) IsZero () bool {
138127 return (f & 0x7FFF ) == 0
@@ -178,6 +167,61 @@ func (f Float16) IsSubnormal() bool {
178167 return exp == ExponentZero && mant != 0
179168}
180169
// FloatClass enumerates the IEEE 754 classes a Float16 value can belong to.
type FloatClass int

// Classes reported by Float16.Class.
const (
	// ClassPositiveZero is +0. It is the zero value of FloatClass.
	ClassPositiveZero FloatClass = iota
	// ClassNegativeZero is -0.
	ClassNegativeZero
	// ClassPositiveSubnormal is a positive value with a zero exponent field
	// and a non-zero mantissa.
	ClassPositiveSubnormal
	// ClassNegativeSubnormal is a negative value with a zero exponent field
	// and a non-zero mantissa.
	ClassNegativeSubnormal
	// ClassPositiveNormal is a positive normal value.
	ClassPositiveNormal
	// ClassNegativeNormal is a negative normal value.
	ClassNegativeNormal
	// ClassPositiveInfinity is +Inf.
	ClassPositiveInfinity
	// ClassNegativeInfinity is -Inf.
	ClassNegativeInfinity
	// ClassQuietNaN is a NaN whose most significant mantissa bit is set.
	ClassQuietNaN
	// ClassSignalingNaN is a NaN whose most significant mantissa bit is clear.
	ClassSignalingNaN
)
185+
186+ // Class returns the IEEE 754 classification of the value
187+ func (f Float16 ) Class () FloatClass {
188+ bits := uint16 (f )
189+ sign := (bits & SignMask ) != 0
190+ exp := (bits & ExponentMask ) >> MantissaLen
191+ mant := bits & MantissaMask
192+
193+ switch exp {
194+ case ExponentZero :
195+ if mant == 0 {
196+ if sign {
197+ return ClassNegativeZero
198+ }
199+ return ClassPositiveZero
200+ }
201+ if sign {
202+ return ClassNegativeSubnormal
203+ }
204+ return ClassPositiveSubnormal
205+ case ExponentInfinity :
206+ if mant == 0 {
207+ if sign {
208+ return ClassNegativeInfinity
209+ }
210+ return ClassPositiveInfinity
211+ }
212+ // NaN: distinguish quiet vs signaling by top mantissa bit (bit 9)
213+ if (mant & (1 << (MantissaLen - 1 ))) != 0 {
214+ return ClassQuietNaN
215+ }
216+ return ClassSignalingNaN
217+ default :
218+ if sign {
219+ return ClassNegativeNormal
220+ }
221+ return ClassPositiveNormal
222+ }
223+ }
224+
181225// Sign returns the sign of the Float16 value: 1 for positive, -1 for negative, 0 for zero
182226func (f Float16 ) Sign () int {
183227 if f .IsZero () {
@@ -204,19 +248,15 @@ func (f Float16) Neg() Float16 {
204248 return f ^ SignMask // Flip sign bit
205249}
206250
207- // CopySign returns a Float16 with the magnitude of f and the sign of sign
208- func (f Float16 ) CopySign (sign Float16 ) Float16 {
209- return (f & 0x7FFF ) | (sign & SignMask )
210- }
211-
212- // Bits returns the underlying uint16 representation
213- func (f Float16 ) Bits () uint16 {
214- return uint16 (f )
251+ // CopySign returns a value with the magnitude of f and the sign of s
252+ func (f Float16 ) CopySign (s Float16 ) Float16 {
253+ // Clear sign bit of f, then OR with sign bit of s
254+ return (f & ^ Float16 (SignMask )) | (s & Float16 (SignMask ))
215255}
216256
217- // FromBits creates a Float16 from its bit representation
218- func FromBits ( bits uint16 ) Float16 {
219- return Float16 ( bits )
257+ // ToInt converts Float16 to int (truncates toward zero)
258+ func ( f Float16 ) ToInt () int {
259+ return int ( f . ToFloat32 () )
220260}
221261
222262// String returns a string representation of the Float16 value
@@ -240,83 +280,11 @@ func (f Float16) String() string {
240280func (f Float16 ) GoString () string {
241281 return fmt .Sprintf ("float16.FromBits(0x%04x)" , uint16 (f ))
242282}
243-
244- // Class returns the IEEE 754 class of the floating-point value
245- type FloatClass int
246-
247- const (
248- ClassSignalingNaN FloatClass = iota
249- ClassQuietNaN
250- ClassNegativeInfinity
251- ClassNegativeNormal
252- ClassNegativeSubnormal
253- ClassNegativeZero
254- ClassPositiveZero
255- ClassPositiveSubnormal
256- ClassPositiveNormal
257- ClassPositiveInfinity
258- )
259-
260- // Class returns the IEEE 754 classification of the Float16 value
261- func (f Float16 ) Class () FloatClass {
262- if f .IsNaN () {
263- // Check if it's a signaling NaN (MSB of mantissa is 0)
264- if (f & 0x0200 ) == 0 {
265- return ClassSignalingNaN
266- }
267- return ClassQuietNaN
268- }
269-
270- sign := f .Signbit ()
271-
272- if f .IsInf (0 ) {
273- if sign {
274- return ClassNegativeInfinity
275- }
276- return ClassPositiveInfinity
277- }
278-
279- if f .IsZero () {
280- if sign {
281- return ClassNegativeZero
282- }
283- return ClassPositiveZero
284- }
285-
286- if f .IsSubnormal () {
287- if sign {
288- return ClassNegativeSubnormal
289- }
290- return ClassPositiveSubnormal
291- }
292-
293- // Normal number
294- if sign {
295- return ClassNegativeNormal
296- }
297- return ClassPositiveNormal
298- }
299-
300- // Utility functions for bit manipulation
301-
302- // extractComponents extracts sign, exponent, and mantissa from Float16
303- func (f Float16 ) extractComponents () (sign uint16 , exp uint16 , mant uint16 ) {
304- bits := uint16 (f )
305- sign = (bits & SignMask ) >> 15
306- exp = (bits & ExponentMask ) >> MantissaLen
307- mant = bits & MantissaMask
308- return
309- }
310-
311- // packComponents packs sign, exponent, and mantissa into Float16
312- func packComponents (sign , exp , mant uint16 ) Float16 {
313- return Float16 ((sign << 15 ) | (exp << MantissaLen ) | (mant & MantissaMask ))
283+ func (f Float16 ) ToInt32 () int32 {
284+ return int32 (f .ToFloat32 ())
314285}
315286
316- // leadingZeros counts leading zeros in a 10-bit mantissa
317- func leadingZeros10 (x uint16 ) int {
318- if x == 0 {
319- return 10
320- }
321- return bits .LeadingZeros16 (x << 6 ) - 6 // Shift to align with 16-bit and adjust
287+ // ToInt64 converts Float16 to int64 (truncates toward zero)
288+ func (f Float16 ) ToInt64 () int64 {
289+ return int64 (f .ToFloat32 ())
322290}
0 commit comments