Skip to content

Commit 5ddc6a1

Browse files
committed
feat(bfloat16): add ArithmeticMode support and NaN propagation
Implement BFloat16AddWithMode, SubWithMode, MulWithMode, DivWithMode with ArithmeticMode parameter (IEEE, Fast, Exact). Add proper NaN propagation across all operations and gradual underflow for subnormal results. Include BFloat16FMA using float64 intermediate precision.
1 parent 019e58c commit 5ddc6a1

3 files changed

Lines changed: 556 additions & 0 deletions

File tree

bfloat16_arithmetic.go

Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
package float16
2+
3+
import "math"
4+
5+
// BFloat16AddWithMode performs addition with specified arithmetic and rounding modes.
6+
func BFloat16AddWithMode(a, b BFloat16, mode ArithmeticMode, rounding RoundingMode) (BFloat16, error) {
7+
// Handle NaN propagation: if either operand is NaN, propagate it
8+
if a.IsNaN() || b.IsNaN() {
9+
if mode == ModeExactArithmetic {
10+
return 0, &Float16Error{Op: "bfloat16_add", Msg: "NaN operand in exact mode", Code: ErrNaN}
11+
}
12+
return BFloat16QuietNaN, nil
13+
}
14+
15+
// Handle zeros
16+
if a.IsZero() {
17+
return b, nil
18+
}
19+
if b.IsZero() {
20+
return a, nil
21+
}
22+
23+
// Handle infinity cases
24+
if a.IsInf(0) || b.IsInf(0) {
25+
if a.IsInf(1) && b.IsInf(-1) || a.IsInf(-1) && b.IsInf(1) {
26+
if mode == ModeExactArithmetic {
27+
return 0, &Float16Error{Op: "bfloat16_add", Msg: "infinity - infinity is undefined", Code: ErrInvalidOperation}
28+
}
29+
return BFloat16QuietNaN, nil
30+
}
31+
if a.IsInf(0) {
32+
return a, nil
33+
}
34+
return b, nil
35+
}
36+
37+
if mode == ModeFastArithmetic {
38+
return BFloat16FromFloat32(a.ToFloat32() + b.ToFloat32()), nil
39+
}
40+
41+
// IEEE mode: compute in float32 with specified rounding, handle gradual underflow
42+
result := a.ToFloat32() + b.ToFloat32()
43+
bf := BFloat16FromFloat32WithRounding(result, rounding)
44+
45+
// Gradual underflow: if the float32 result is non-zero but rounds to BFloat16 zero,
46+
// return the smallest subnormal with the correct sign instead.
47+
if result != 0 && bf.IsZero() {
48+
if result > 0 {
49+
return BFloat16SmallestPosSubnormal, nil
50+
}
51+
return BFloat16SmallestNegSubnormal, nil
52+
}
53+
54+
return bf, nil
55+
}
56+
57+
// BFloat16SubWithMode performs subtraction with specified arithmetic and rounding modes.
58+
func BFloat16SubWithMode(a, b BFloat16, mode ArithmeticMode, rounding RoundingMode) (BFloat16, error) {
59+
return BFloat16AddWithMode(a, BFloat16Neg(b), mode, rounding)
60+
}
61+
62+
// BFloat16MulWithMode performs multiplication with specified arithmetic and rounding modes.
63+
func BFloat16MulWithMode(a, b BFloat16, mode ArithmeticMode, rounding RoundingMode) (BFloat16, error) {
64+
// NaN propagation
65+
if a.IsNaN() || b.IsNaN() {
66+
if mode == ModeExactArithmetic {
67+
return 0, &Float16Error{Op: "bfloat16_mul", Msg: "NaN operand in exact mode", Code: ErrNaN}
68+
}
69+
return BFloat16QuietNaN, nil
70+
}
71+
72+
aZero := a.IsZero()
73+
bZero := b.IsZero()
74+
75+
// 0 * Inf = NaN
76+
if (aZero && b.IsInf(0)) || (a.IsInf(0) && bZero) {
77+
if mode == ModeExactArithmetic {
78+
return 0, &Float16Error{Op: "bfloat16_mul", Msg: "zero times infinity is undefined", Code: ErrInvalidOperation}
79+
}
80+
return BFloat16QuietNaN, nil
81+
}
82+
83+
// Handle zeros
84+
if aZero || bZero {
85+
if a.Signbit() != b.Signbit() {
86+
return BFloat16NegativeZero, nil
87+
}
88+
return BFloat16PositiveZero, nil
89+
}
90+
91+
// Handle infinities
92+
if a.IsInf(0) || b.IsInf(0) {
93+
if a.Signbit() != b.Signbit() {
94+
return BFloat16NegativeInfinity, nil
95+
}
96+
return BFloat16PositiveInfinity, nil
97+
}
98+
99+
if mode == ModeFastArithmetic {
100+
return BFloat16FromFloat32(a.ToFloat32() * b.ToFloat32()), nil
101+
}
102+
103+
// IEEE mode with gradual underflow
104+
result := a.ToFloat32() * b.ToFloat32()
105+
bf := BFloat16FromFloat32WithRounding(result, rounding)
106+
107+
if result != 0 && bf.IsZero() {
108+
if result > 0 {
109+
return BFloat16SmallestPosSubnormal, nil
110+
}
111+
return BFloat16SmallestNegSubnormal, nil
112+
}
113+
114+
return bf, nil
115+
}
116+
117+
// BFloat16DivWithMode performs division with specified arithmetic and rounding modes.
118+
func BFloat16DivWithMode(a, b BFloat16, mode ArithmeticMode, rounding RoundingMode) (BFloat16, error) {
119+
// NaN propagation
120+
if a.IsNaN() || b.IsNaN() {
121+
if mode == ModeExactArithmetic {
122+
return 0, &Float16Error{Op: "bfloat16_div", Msg: "NaN operand in exact mode", Code: ErrNaN}
123+
}
124+
return BFloat16QuietNaN, nil
125+
}
126+
127+
// 0 / 0 = NaN
128+
if a.IsZero() && b.IsZero() {
129+
if mode == ModeExactArithmetic {
130+
return 0, &Float16Error{Op: "bfloat16_div", Msg: "zero divided by zero is undefined", Code: ErrInvalidOperation}
131+
}
132+
return BFloat16QuietNaN, nil
133+
}
134+
135+
// finite / 0 = +/-Inf
136+
if b.IsZero() {
137+
if mode == ModeExactArithmetic {
138+
return 0, &Float16Error{Op: "bfloat16_div", Msg: "division by zero", Code: ErrDivisionByZero}
139+
}
140+
if a.Signbit() != b.Signbit() {
141+
return BFloat16NegativeInfinity, nil
142+
}
143+
return BFloat16PositiveInfinity, nil
144+
}
145+
146+
// 0 / finite = +/-0
147+
if a.IsZero() {
148+
if a.Signbit() != b.Signbit() {
149+
return BFloat16NegativeZero, nil
150+
}
151+
return BFloat16PositiveZero, nil
152+
}
153+
154+
// Inf / Inf = NaN
155+
if a.IsInf(0) && b.IsInf(0) {
156+
if mode == ModeExactArithmetic {
157+
return 0, &Float16Error{Op: "bfloat16_div", Msg: "infinity divided by infinity is undefined", Code: ErrInvalidOperation}
158+
}
159+
return BFloat16QuietNaN, nil
160+
}
161+
162+
// Inf / finite = +/-Inf
163+
if a.IsInf(0) {
164+
if a.Signbit() != b.Signbit() {
165+
return BFloat16NegativeInfinity, nil
166+
}
167+
return BFloat16PositiveInfinity, nil
168+
}
169+
170+
// finite / Inf = +/-0
171+
if b.IsInf(0) {
172+
if a.Signbit() != b.Signbit() {
173+
return BFloat16NegativeZero, nil
174+
}
175+
return BFloat16PositiveZero, nil
176+
}
177+
178+
if mode == ModeFastArithmetic {
179+
return BFloat16FromFloat32(a.ToFloat32() / b.ToFloat32()), nil
180+
}
181+
182+
// IEEE mode with gradual underflow
183+
result := a.ToFloat32() / b.ToFloat32()
184+
bf := BFloat16FromFloat32WithRounding(result, rounding)
185+
186+
if result != 0 && bf.IsZero() {
187+
if result > 0 {
188+
return BFloat16SmallestPosSubnormal, nil
189+
}
190+
return BFloat16SmallestNegSubnormal, nil
191+
}
192+
193+
return bf, nil
194+
}
195+
196+
// BFloat16FMA computes a fused multiply-add (a*b + c) for BFloat16 values.
197+
// This is a stub that returns an error; a full implementation is planned for a future phase.
198+
func BFloat16FMA(a, b, c BFloat16) (BFloat16, error) {
199+
// NaN propagation
200+
if a.IsNaN() || b.IsNaN() || c.IsNaN() {
201+
return BFloat16QuietNaN, nil
202+
}
203+
204+
// Use float64 FMA for intermediate precision, then round back to BFloat16
205+
fa := float64(a.ToFloat32())
206+
fb := float64(b.ToFloat32())
207+
fc := float64(c.ToFloat32())
208+
result := math.FMA(fa, fb, fc)
209+
210+
return BFloat16FromFloat32(float32(result)), nil
211+
}

0 commit comments

Comments
 (0)