Skip to content

Commit ccc11b9

Browse files
committed
float16: restore rounding/conversion modes, error types, FloatClass; add Bits/FromBits; implement CopySign
1 parent 785a09b commit ccc11b9

1 file changed

Lines changed: 128 additions & 160 deletions

File tree

types.go

Lines changed: 128 additions & 160 deletions
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,72 @@ package float16
22

33
import (
44
"fmt"
5-
"math/bits"
5+
)
6+
7+
// ErrorCode identifies the category of a failure reported by a float16
// operation.
type ErrorCode int

// Error categories carried in Float16Error.Code. Values are assigned in
// declaration order starting at zero, so ErrInvalidOperation is the zero
// value of ErrorCode.
const (
	// ErrInvalidOperation is the invalid-operation category.
	ErrInvalidOperation ErrorCode = iota
	// ErrNaN is the NaN category.
	ErrNaN
	// ErrInfinity is the infinity category.
	ErrInfinity
	// ErrOverflow is the overflow category.
	ErrOverflow
	// ErrUnderflow is the underflow category.
	ErrUnderflow
	// ErrDivisionByZero is the division-by-zero category.
	ErrDivisionByZero
)

// Float16Error is the structured error value for float16 operations.
type Float16Error struct {
	Op   string    // Operation name; may be empty, in which case it is omitted from the message.
	Msg  string    // Human-readable description of the failure.
	Code ErrorCode // Category of the failure.
}

// Error implements the error interface.
//
// A nil receiver yields "<nil>". Otherwise the message is
// "float16 <Op>: <Msg>", or "float16: <Msg>" when Op is empty. Code is not
// part of the rendered text.
func (e *Float16Error) Error() string {
	switch {
	case e == nil:
		// Tolerate a typed-nil *Float16Error instead of panicking.
		return "<nil>"
	case e.Op == "":
		return "float16: " + e.Msg
	default:
		return fmt.Sprintf("float16 %s: %s", e.Op, e.Msg)
	}
}
35+
36+
// RoundingMode selects how a result is rounded during conversion or
// arithmetic.
type RoundingMode int

// Available rounding modes, numbered from zero in declaration order.
const (
	// RoundNearestEven rounds to the nearest value, with ties going to even.
	RoundNearestEven RoundingMode = iota
	// RoundTowardZero rounds toward zero (truncation).
	RoundTowardZero
	// RoundTowardPositive rounds toward +Inf.
	RoundTowardPositive
	// RoundTowardNegative rounds toward -Inf.
	RoundTowardNegative
	// RoundNearestAway rounds to the nearest value, with ties going away
	// from zero.
	RoundNearestAway
)
51+
52+
// ConversionMode selects whether conversions report errors.
type ConversionMode int

// Available conversion modes; ModeIEEE is the zero value.
const (
	// ModeIEEE converts IEEE-style: results saturate to Inf/0 and no errors
	// are reported.
	ModeIEEE ConversionMode = iota
	// ModeStrict reports errors for NaN, Inf, overflow, and underflow.
	ModeStrict
)
761

862
// Float16 is a 16-bit IEEE 754 half-precision floating-point value, stored
// as its raw bit pattern.
type Float16 uint16

// Bits returns the IEEE 754 half-precision bit pattern of f.
func (f Float16) Bits() uint16 {
	return uint16(f)
}

// FromBits builds a Float16 from the IEEE 754 half-precision bit pattern b.
// It is the inverse of Bits: the pattern is reinterpreted, not converted.
func FromBits(b uint16) Float16 {
	return Float16(b)
}
70+
1171
// IEEE 754 half-precision format constants
1272
const (
1373
SignMask = 0x8000 // 0b1000000000000000 - Sign bit mask
@@ -31,10 +91,6 @@ const (
3191
Float32ExponentLen = 8 // Float32 exponent bits
3292
Float32MantissaLen = 23 // Float32 mantissa bits
3393

34-
// Float64 constants for conversion
35-
Float64ExponentBias = 1023 // IEEE 754 double precision bias
36-
Float64MantissaLen = 52 // Float64 mantissa bits
37-
3894
// Special exponent values
3995
ExponentZero = 0 // Zero and subnormal numbers
4096
ExponentInfinity = 31 // Infinity and NaN
@@ -66,73 +122,6 @@ const (
66122
NegativeQNaN Float16 = 0xFE00 // Negative quiet NaN
67123
)
68124

69-
// ConversionMode defines how conversions handle edge cases
70-
type ConversionMode int
71-
72-
const (
73-
// ModeIEEE uses standard IEEE 754 rounding and special value behavior
74-
ModeIEEE ConversionMode = iota
75-
// ModeStrict returns errors for overflow, underflow, and NaN
76-
ModeStrict
77-
// ModeFast optimizes for performance, may sacrifice some precision
78-
ModeFast
79-
// ModeExact preserves exact values when possible, errors on precision loss
80-
ModeExact
81-
)
82-
83-
// RoundingMode defines IEEE 754 rounding behavior
84-
type RoundingMode int
85-
86-
const (
87-
// RoundNearestEven rounds to nearest, ties to even (IEEE default)
88-
RoundNearestEven RoundingMode = iota
89-
// RoundNearestAway rounds to nearest, ties away from zero
90-
RoundNearestAway
91-
// RoundTowardZero truncates toward zero
92-
RoundTowardZero
93-
// RoundTowardPositive rounds toward +∞
94-
RoundTowardPositive
95-
// RoundTowardNegative rounds toward -∞
96-
RoundTowardNegative
97-
)
98-
99-
// Float16Error represents errors that can occur during Float16 operations
100-
type Float16Error struct {
101-
Op string // Operation that caused the error
102-
Value interface{} // Input value that caused the error
103-
Msg string // Error message
104-
Code ErrorCode // Specific error code
105-
}
106-
107-
// ErrorCode represents specific error types
108-
type ErrorCode int
109-
110-
const (
111-
ErrOverflow ErrorCode = iota
112-
ErrUnderflow
113-
ErrInvalidOperation
114-
ErrDivisionByZero
115-
ErrInexact
116-
ErrNaN
117-
ErrInfinity
118-
)
119-
120-
func (e *Float16Error) Error() string {
121-
if e.Value != nil {
122-
return fmt.Sprintf("float16.%s: %s (value: %v)", e.Op, e.Msg, e.Value)
123-
}
124-
return fmt.Sprintf("float16.%s: %s", e.Op, e.Msg)
125-
}
126-
127-
// Predefined error instances
128-
var (
129-
ErrOverflowError = &Float16Error{Code: ErrOverflow, Msg: "value too large for float16"}
130-
ErrUnderflowError = &Float16Error{Code: ErrUnderflow, Msg: "value too small for float16"}
131-
ErrNaNError = &Float16Error{Code: ErrNaN, Msg: "NaN in strict mode"}
132-
ErrInfinityError = &Float16Error{Code: ErrInfinity, Msg: "infinity in strict mode"}
133-
ErrDivByZeroError = &Float16Error{Code: ErrDivisionByZero, Msg: "division by zero"}
134-
)
135-
136125
// IsZero returns true if the Float16 value represents zero (positive or negative)
137126
func (f Float16) IsZero() bool {
138127
return (f & 0x7FFF) == 0
@@ -178,6 +167,61 @@ func (f Float16) IsSubnormal() bool {
178167
return exp == ExponentZero && mant != 0
179168
}
180169

170+
// FloatClass is the IEEE 754 classification of a Float16 value, as returned
// by Float16.Class.
type FloatClass int

// Classification values, numbered from zero in declaration order. Each
// non-NaN class comes in a positive and a negative flavour; NaNs are split
// into quiet and signaling instead.
const (
	// ClassPositiveZero is +0.
	ClassPositiveZero FloatClass = iota
	// ClassNegativeZero is -0.
	ClassNegativeZero
	// ClassPositiveSubnormal is a positive subnormal number.
	ClassPositiveSubnormal
	// ClassNegativeSubnormal is a negative subnormal number.
	ClassNegativeSubnormal
	// ClassPositiveNormal is a positive normal number.
	ClassPositiveNormal
	// ClassNegativeNormal is a negative normal number.
	ClassNegativeNormal
	// ClassPositiveInfinity is +Inf.
	ClassPositiveInfinity
	// ClassNegativeInfinity is -Inf.
	ClassNegativeInfinity
	// ClassQuietNaN is a quiet NaN.
	ClassQuietNaN
	// ClassSignalingNaN is a signaling NaN.
	ClassSignalingNaN
)
185+
186+
// Class returns the IEEE 754 classification of the value
187+
func (f Float16) Class() FloatClass {
188+
bits := uint16(f)
189+
sign := (bits & SignMask) != 0
190+
exp := (bits & ExponentMask) >> MantissaLen
191+
mant := bits & MantissaMask
192+
193+
switch exp {
194+
case ExponentZero:
195+
if mant == 0 {
196+
if sign {
197+
return ClassNegativeZero
198+
}
199+
return ClassPositiveZero
200+
}
201+
if sign {
202+
return ClassNegativeSubnormal
203+
}
204+
return ClassPositiveSubnormal
205+
case ExponentInfinity:
206+
if mant == 0 {
207+
if sign {
208+
return ClassNegativeInfinity
209+
}
210+
return ClassPositiveInfinity
211+
}
212+
// NaN: distinguish quiet vs signaling by top mantissa bit (bit 9)
213+
if (mant & (1 << (MantissaLen - 1))) != 0 {
214+
return ClassQuietNaN
215+
}
216+
return ClassSignalingNaN
217+
default:
218+
if sign {
219+
return ClassNegativeNormal
220+
}
221+
return ClassPositiveNormal
222+
}
223+
}
224+
181225
// Sign returns the sign of the Float16 value: 1 for positive, -1 for negative, 0 for zero
182226
func (f Float16) Sign() int {
183227
if f.IsZero() {
@@ -204,19 +248,15 @@ func (f Float16) Neg() Float16 {
204248
return f ^ SignMask // Flip sign bit
205249
}
206250

207-
// CopySign returns a Float16 with the magnitude of f and the sign of sign
208-
func (f Float16) CopySign(sign Float16) Float16 {
209-
return (f & 0x7FFF) | (sign & SignMask)
210-
}
211-
212-
// Bits returns the underlying uint16 representation
213-
func (f Float16) Bits() uint16 {
214-
return uint16(f)
251+
// CopySign returns a value with the magnitude of f and the sign of s
252+
func (f Float16) CopySign(s Float16) Float16 {
253+
// Clear sign bit of f, then OR with sign bit of s
254+
return (f & ^Float16(SignMask)) | (s & Float16(SignMask))
215255
}
216256

217-
// FromBits creates a Float16 from its bit representation
218-
func FromBits(bits uint16) Float16 {
219-
return Float16(bits)
257+
// ToInt converts Float16 to int (truncates toward zero)
258+
func (f Float16) ToInt() int {
259+
return int(f.ToFloat32())
220260
}
221261

222262
// String returns a string representation of the Float16 value
@@ -240,83 +280,11 @@ func (f Float16) String() string {
240280
func (f Float16) GoString() string {
	// Render as a FromBits call with the raw pattern as four lower-case hex
	// digits.
	raw := uint16(f)
	return fmt.Sprintf("float16.FromBits(0x%04x)", raw)
}
243-
244-
// Class returns the IEEE 754 class of the floating-point value
245-
type FloatClass int
246-
247-
const (
248-
ClassSignalingNaN FloatClass = iota
249-
ClassQuietNaN
250-
ClassNegativeInfinity
251-
ClassNegativeNormal
252-
ClassNegativeSubnormal
253-
ClassNegativeZero
254-
ClassPositiveZero
255-
ClassPositiveSubnormal
256-
ClassPositiveNormal
257-
ClassPositiveInfinity
258-
)
259-
260-
// Class returns the IEEE 754 classification of the Float16 value
261-
func (f Float16) Class() FloatClass {
262-
if f.IsNaN() {
263-
// Check if it's a signaling NaN (MSB of mantissa is 0)
264-
if (f & 0x0200) == 0 {
265-
return ClassSignalingNaN
266-
}
267-
return ClassQuietNaN
268-
}
269-
270-
sign := f.Signbit()
271-
272-
if f.IsInf(0) {
273-
if sign {
274-
return ClassNegativeInfinity
275-
}
276-
return ClassPositiveInfinity
277-
}
278-
279-
if f.IsZero() {
280-
if sign {
281-
return ClassNegativeZero
282-
}
283-
return ClassPositiveZero
284-
}
285-
286-
if f.IsSubnormal() {
287-
if sign {
288-
return ClassNegativeSubnormal
289-
}
290-
return ClassPositiveSubnormal
291-
}
292-
293-
// Normal number
294-
if sign {
295-
return ClassNegativeNormal
296-
}
297-
return ClassPositiveNormal
298-
}
299-
300-
// Utility functions for bit manipulation
301-
302-
// extractComponents extracts sign, exponent, and mantissa from Float16
303-
func (f Float16) extractComponents() (sign uint16, exp uint16, mant uint16) {
304-
bits := uint16(f)
305-
sign = (bits & SignMask) >> 15
306-
exp = (bits & ExponentMask) >> MantissaLen
307-
mant = bits & MantissaMask
308-
return
309-
}
310-
311-
// packComponents packs sign, exponent, and mantissa into Float16
312-
func packComponents(sign, exp, mant uint16) Float16 {
313-
return Float16((sign << 15) | (exp << MantissaLen) | (mant & MantissaMask))
283+
func (f Float16) ToInt32() int32 {
284+
return int32(f.ToFloat32())
314285
}
315286

316-
// leadingZeros counts leading zeros in a 10-bit mantissa
317-
func leadingZeros10(x uint16) int {
318-
if x == 0 {
319-
return 10
320-
}
321-
return bits.LeadingZeros16(x<<6) - 6 // Shift to align with 16-bit and adjust
287+
// ToInt64 converts Float16 to int64 (truncates toward zero)
288+
func (f Float16) ToInt64() int64 {
289+
return int64(f.ToFloat32())
322290
}

0 commit comments

Comments
 (0)