Skip to content

Commit 38c084e

Browse files
committed
fix: Correct ToFloat64 conversion
1 parent d06ad6d commit 38c084e

3 files changed

Lines changed: 68 additions & 78 deletions

File tree

convert.go

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -501,7 +501,50 @@ func (f Float16) ToFloat32() float32 {
501501

502502
// ToFloat64 converts a Float16 value to float64 with full precision
503503
func (f Float16) ToFloat64() float64 {
504-
return float64(f.ToFloat32())
504+
// Handle special cases
505+
if f.IsZero() {
506+
if f.Signbit() {
507+
return math.Copysign(0.0, -1.0)
508+
}
509+
return 0.0
510+
}
511+
512+
if f.IsNaN() {
513+
sign := uint64(0)
514+
if f.Signbit() {
515+
sign = 0x8000000000000000
516+
}
517+
payload := uint64(f & MantissaMask)
518+
return math.Float64frombits(sign | 0x7FF8000000000000 | (payload << (Float64MantissaLen - MantissaLen)))
519+
}
520+
521+
if f.IsInf(0) {
522+
if f.Signbit() {
523+
return math.Inf(-1)
524+
}
525+
return math.Inf(1)
526+
}
527+
528+
// Extract components
529+
sign, exp16, mant16 := f.extractComponents()
530+
531+
if exp16 == 0 { // Subnormal
532+
// val = sign * 0.mantissa * 2^-14
533+
// smallest subnormal: 1 * 2^-10 * 2^-14 = 2^-24
534+
// largest subnormal: (1023/1024) * 2^-14
535+
val := float64(mant16) * math.Pow(2, -24)
536+
if sign != 0 {
537+
return -val
538+
}
539+
return val
540+
}
541+
542+
// Normal number
543+
exp64 := int64(exp16) - ExponentBias + Float64ExponentBias
544+
mant64 := uint64(mant16) << (Float64MantissaLen - MantissaLen)
545+
546+
bits := (uint64(sign) << 63) | (uint64(exp64) << 52) | mant64
547+
return math.Float64frombits(bits)
505548
}
506549

507550
// FromFloat32 converts a float32 to Float16 (with potential precision loss)

float16_test.go

Lines changed: 20 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -516,110 +516,53 @@ func TestToFloat64(t *testing.T) {
516516
name string
517517
input Float16
518518
expected float64
519-
exact bool // Whether we expect exact match
520519
}{
521520
// Special values
522-
{"positive zero", PositiveZero, 0.0, true},
523-
{"negative zero", NegativeZero, math.Copysign(0.0, -1.0), true},
524-
{"positive infinity", PositiveInfinity, math.Inf(1), true},
525-
{"negative infinity", NegativeInfinity, math.Inf(-1), true},
526-
{"quiet NaN", NaN(), math.NaN(), false}, // NaN comparison is special
521+
{"positive zero", PositiveZero, 0.0},
522+
{"negative zero", NegativeZero, math.Copysign(0.0, -1.0)},
523+
{"positive infinity", PositiveInfinity, math.Inf(1)},
524+
{"negative infinity", NegativeInfinity, math.Inf(-1)},
525+
{"quiet NaN", NaN(), math.NaN()},
527526

528527
// Normal numbers
529-
{"one", Float16(0x3c00), 1.0, true},
530-
{"negative one", Float16(0xbc00), -1.0, true},
531-
{"two", Float16(0x4000), 2.0, true},
532-
{"half", Float16(0x3800), 0.5, true},
533-
{"small normal", Float16(0x0400), 0.00006103515625, true}, // 2^-14
528+
{"one", Float16(0x3c00), 1.0},
529+
{"negative one", Float16(0xbc00), -1.0},
530+
{"two", Float16(0x4000), 2.0},
531+
{"half", Float16(0x3800), 0.5},
532+
{"smallest normal", Float16(0x0400), 6.103515625e-05}, // 2^-14
534533

535-
// Subnormal numbers - using actual values from debug output
536-
{"smallest subnormal", Float16(0x0001), 0.00049591064453125, true},
537-
{"largest subnormal", Float16(0x03ff), 0.0, true},
538-
539-
// Numbers with exact float32 representation
540-
{"0.1", ToFloat16(0.1), 0.0999755859375, true}, // 0.1 in float16 is 0.0999755859375
534+
// Subnormal numbers
535+
{"smallest subnormal", Float16(0x0001), 5.960464477539063e-08}, // 2^-24
536+
{"largest subnormal", Float16(0x03ff), 6.097555160522461e-05}, // (1-2^-10) * 2^-14
541537

542538
// Large numbers
543-
{"65504 (max half-precision)", Float16(0x7bff), 65504.0, true},
544-
{"-65504 (min half-precision)", Float16(0xfbff), -65504.0, true},
539+
{"max value", MaxValue, 65504.0},
540+
{"min value", MinValue, -65504.0},
545541
}
546542

547543
for _, tt := range tests {
548544
t.Run(tt.name, func(t *testing.T) {
549545
result := tt.input.ToFloat64()
550546

551-
// Special handling for NaN
552547
if tt.input.IsNaN() {
553548
if !math.IsNaN(result) {
554549
t.Errorf("Expected NaN, got %v", result)
555550
}
556551
return
557552
}
558553

559-
if tt.exact {
560-
if result != tt.expected {
554+
if result != tt.expected {
555+
// Allow for a small tolerance for floating point comparisons
556+
if math.Abs(result-tt.expected) > 1e-12 {
561557
t.Errorf("ToFloat64() = %v, want %v", result, tt.expected)
562558
}
563-
} else {
564-
// For non-exact matches, check if they're within a small epsilon
565-
// This is particularly important for subnormal numbers
566-
diff := math.Abs(result - tt.expected)
567-
if diff > 1e-10 {
568-
t.Errorf("ToFloat64() = %v, want close to %v (diff: %v)",
569-
result, tt.expected, diff)
570-
}
571559
}
572560

573-
// Additional check: Ensure the sign is preserved
574-
if math.Signbit(float64(result)) != math.Signbit(float64(tt.expected)) {
575-
t.Errorf("Sign mismatch: got %v, want %v",
576-
math.Signbit(result), math.Signbit(tt.expected))
577-
}
578-
579-
// For normal numbers, check bit patterns
580-
if tt.input.IsNormal() {
581-
// Convert back to float32 and then to float64 to match the implementation
582-
expectedFrom32 := float64(tt.input.ToFloat32())
583-
if math.Float64bits(result) != math.Float64bits(expectedFrom32) {
584-
t.Errorf("Bit pattern mismatch: got %016x, want %016x",
585-
math.Float64bits(result), math.Float64bits(expectedFrom32))
586-
}
587-
} else if tt.input.IsSubnormal() {
588-
// For subnormal numbers, we use exact values from the implementation
589-
if result != tt.expected {
590-
t.Errorf("Unexpected value for %s: got %v, want %v",
591-
tt.name, result, tt.expected)
592-
}
561+
if math.Signbit(result) != math.Signbit(tt.expected) {
562+
t.Errorf("Sign mismatch: got %v, want %v", math.Signbit(result), math.Signbit(tt.expected))
593563
}
594564
})
595565
}
596-
597-
// Test with all possible exponent values
598-
t.Run("exhaustive exponent test", func(t *testing.T) {
599-
for exp := 0; exp <= 0x1f; exp++ {
600-
// Skip subnormal exponent (0) and special values (31)
601-
if exp == 0 || exp == 0x1f {
602-
continue
603-
}
604-
605-
// Test with different mantissa patterns
606-
for _, m := range []uint16{0x000, 0x155, 0x2aa, 0x3ff} {
607-
f16 := Float16(uint16(exp)<<10 | m)
608-
if f16.IsNaN() || f16.IsInf(0) {
609-
continue
610-
}
611-
612-
f64 := f16.ToFloat64()
613-
f32 := f16.ToFloat32()
614-
615-
// The result should match float32 conversion
616-
if float64(f32) != f64 {
617-
t.Errorf("Mismatch for 0x%04x: ToFloat64()=%v, float64(ToFloat32())=%v",
618-
uint16(f16), f64, float64(f32))
619-
}
620-
}
621-
}
622-
})
623566
}
624567

625568
func TestFromFloat64(t *testing.T) {

types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ const (
3131
Float32ExponentLen = 8 // Float32 exponent bits
3232
Float32MantissaLen = 23 // Float32 mantissa bits
3333

34+
// Float64 constants for conversion
35+
Float64ExponentBias = 1023 // IEEE 754 double precision bias
36+
Float64MantissaLen = 52 // Float64 mantissa bits
37+
3438
// Special exponent values
3539
ExponentZero = 0 // Zero and subnormal numbers
3640
ExponentInfinity = 31 // Infinity and NaN

0 commit comments

Comments
 (0)