|
4 | 4 |
|
5 | 5 | namespace intx |
6 | 6 | { |
7 | | -inline uint64_t reciprocal_naive(uint64_t d) noexcept |
| 7 | +inline uint64_t reciprocal_native(uint64_t d) noexcept |
8 | 8 | { |
9 | | - const auto u = uint128{~uint64_t{0}, ~d}; |
10 | | - uint64_t v{}; |
| 9 | +#ifdef __x86_64__ |
| 10 | + uint64_t _; // NOLINT(*-init-variables) |
| 11 | + uint64_t v; // NOLINT(*-init-variables) |
| 12 | + asm("divq %4" // NOLINT(*-no-assembler) |
| 13 | + : "=d"(_), "=a"(v) |
| 14 | + : "d"(~d), "a"(~uint64_t{0}), "r"(d)); |
| 15 | + return v; |
| 16 | +#else |
| 17 | + // Fallback implementation. |
| 18 | + return (uint128{~uint64_t{0}, ~d} / d)[0]; |
| 19 | +#endif |
| 20 | +} |
11 | 21 |
|
12 | | -#if __x86_64__ |
13 | | - uint64_t _{}; |
14 | | - asm("divq %4" : "=d"(_), "=a"(v) : "d"(u[1]), "a"(u[0]), "g"(d)); // NOLINT(hicpp-no-assembler) |
| 22 | +inline uint64_t reciprocal_builtin_uint128(uint64_t d) noexcept |
| 23 | +{ |
| 24 | +#ifdef INTX_HAS_BUILTIN_INT128 |
| 25 | + const auto u = (builtin_uint128{~d} << 64) | ~uint64_t{0}; |
| 26 | + return static_cast<uint64_t>(u / d); |
15 | 27 | #else |
16 | | - v = (u / d)[0]; |
| 28 | + // Fallback implementation. |
| 29 | + return (uint128{~uint64_t{0}, ~d} / d)[0]; |
17 | 30 | #endif |
| 31 | +} |
18 | 32 |
|
19 | | - return v; |
| 33 | +/// The copy of the GMP algorithm from "Improved division by invariant integers". |
| 34 | +constexpr uint64_t reciprocal_gmp(uint64_t d) noexcept |
| 35 | +{ |
| 36 | + INTX_REQUIRE(d & 0x8000000000000000); // Must be normalized. |
| 37 | + |
| 38 | + const uint64_t d9 = d >> 55; |
| 39 | + const uint32_t v0 = internal::reciprocal_table[static_cast<size_t>(d9 - 256)]; |
| 40 | + |
| 41 | + const uint64_t d40 = (d >> 24) + 1; |
| 42 | + const uint64_t v1 = (v0 << 11) - uint32_t(uint32_t{v0 * v0} * d40 >> 40) - 1; |
| 43 | + |
| 44 | + const uint64_t v2 = (v1 << 13) + (v1 * (0x1000000000000000 - v1 * d40) >> 47); |
| 45 | + |
| 46 | + const uint64_t d0 = d & 1; |
| 47 | + const uint64_t d63 = (d >> 1) + d0; // ceil(d/2) |
| 48 | + const uint64_t e = ((v2 >> 1) & (0 - d0)) - (v2 * d63); |
| 49 | + const uint64_t v3 = (umul(v2, e)[1] >> 1) + (v2 << 31); |
| 50 | + |
| 51 | + const uint64_t v4 = v3 - (umul(v3, d) + d)[1] - d; |
| 52 | + return v4; |
20 | 53 | } |
21 | 54 | } // namespace intx |
0 commit comments