Skip to content

Commit 178513f

Browse files
authored
Merge pull request #1565 from evoskuil/master
Optimize sha intrinsics.
2 parents 9b16970 + 462822d commit 178513f

16 files changed

Lines changed: 763 additions & 407 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,3 +26,4 @@ build
2626
/configure
2727
/libtool
2828
.dirstamp
29+
/.vs

include/bitcoin/system/hash/sha/algorithm.hpp

Lines changed: 6 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -234,6 +234,8 @@ class algorithm
234234
INLINE static constexpr void input(buffer_t& buffer, const block_t& block) NOEXCEPT;
235235
INLINE static constexpr void input_left(auto& buffer, const half_t& half) NOEXCEPT;
236236
INLINE static constexpr void input_right(auto& buffer, const half_t& half) NOEXCEPT;
237+
INLINE static constexpr void reinput_left(auto& buffer, const auto& left) NOEXCEPT;
238+
INLINE static constexpr void reinput_right(auto& buffer, const auto& right) NOEXCEPT;
237239
INLINE static constexpr digest_t output(const state_t& state) NOEXCEPT;
238240

239241
/// Padding.
@@ -257,12 +259,6 @@ class algorithm
257259
static constexpr void pad_half(auto& buffer) NOEXCEPT;
258260
static constexpr void pad_n(auto& buffer, count_t blocks) NOEXCEPT;
259261

260-
/// Double hashing.
261-
/// -----------------------------------------------------------------------
262-
263-
static constexpr void reinput_left(auto& buffer, const auto& left) NOEXCEPT;
264-
static constexpr void reinput_right(auto& buffer, const auto& right) NOEXCEPT;
265-
266262
/// Iteration (message scheduling vectorized for multiple blocks).
267263
/// -----------------------------------------------------------------------
268264

@@ -386,9 +382,12 @@ class algorithm
386382
xint128_t message) NOEXCEPT;
387383

388384
template <bool Swap>
389-
static void native_rounds(xint128_t& lo, xint128_t& hi,
385+
INLINE static void native_rounds(xint128_t& lo, xint128_t& hi,
390386
const block_t& block) NOEXCEPT;
391387

388+
INLINE static void native_rounds(xint128_t& lo, xint128_t& hi,
389+
const half_t& left, const chunk_t& pad) NOEXCEPT;
390+
392391
template <bool Swap>
393392
static void native_transform(state_t& state, const auto& block) NOEXCEPT;
394393
static void native_transform(state_t& state, iblocks_t& blocks) NOEXCEPT;
@@ -409,8 +408,6 @@ class algorithm
409408
static digest_t native_double_hash(const half_t& half) NOEXCEPT;
410409
static digest_t native_double_hash(const half_t& left, const half_t& right) NOEXCEPT;
411410

412-
413-
414411
public:
415412
/// Summary public values.
416413
/// -----------------------------------------------------------------------

include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp

Lines changed: 2 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -243,25 +243,8 @@ template <size_t Lane>
243243
constexpr void CLASS::
244244
compress(state_t& state, const buffer_t& buffer) NOEXCEPT
245245
{
246-
if (std::is_constant_evaluated())
247-
{
248-
compress_<Lane>(state, buffer);
249-
}
250-
////else if constexpr (native)
251-
////{
252-
//// // Single block shani compression optimization.
253-
//// compress_native<Lane>(state, buffer);
254-
////}
255-
////else if constexpr (vector)
256-
////{
257-
//// // Compression is not vectorized within a block, however this is
258-
//// // feasible but may not be optimal (see round() comments).
259-
//// compress_vector(buffer);
260-
////}
261-
else
262-
{
263-
compress_<Lane>(state, buffer);
264-
}
246+
// block-internal vectorization is suboptimal.
247+
compress_<Lane>(state, buffer);
265248
}
266249

267250
} // namespace sha

include/bitcoin/system/impl/hash/sha/algorithm_double.ipp

Lines changed: 0 additions & 53 deletions
Original file line number | Diff line number | Diff line change
@@ -28,59 +28,6 @@ namespace libbitcoin {
2828
namespace system {
2929
namespace sha {
3030

31-
// protected
32-
// ----------------------------------------------------------------------------
33-
34-
TEMPLATE
35-
INLINE constexpr void CLASS::
36-
reinput_left(auto& buffer, const auto& left) NOEXCEPT
37-
{
38-
using words = decltype(buffer);
39-
static_assert(array_count<words> >= SHA::state_words);
40-
41-
if (std::is_constant_evaluated())
42-
{
43-
buffer.at(0) = left.at(0);
44-
buffer.at(1) = left.at(1);
45-
buffer.at(2) = left.at(2);
46-
buffer.at(3) = left.at(3);
47-
buffer.at(4) = left.at(4);
48-
buffer.at(5) = left.at(5);
49-
buffer.at(6) = left.at(6);
50-
buffer.at(7) = left.at(7);
51-
}
52-
else
53-
{
54-
using word = array_element<words>;
55-
array_cast<word, SHA::state_words>(buffer) = left;
56-
}
57-
}
58-
59-
TEMPLATE
60-
INLINE constexpr void CLASS::
61-
reinput_right(auto& buffer, const auto& right) NOEXCEPT
62-
{
63-
using words = decltype(buffer);
64-
static_assert(array_count<words> >= SHA::state_words);
65-
66-
if (std::is_constant_evaluated())
67-
{
68-
buffer.at(8) = right.at(0);
69-
buffer.at(9) = right.at(1);
70-
buffer.at(10) = right.at(2);
71-
buffer.at(11) = right.at(3);
72-
buffer.at(12) = right.at(4);
73-
buffer.at(13) = right.at(5);
74-
buffer.at(14) = right.at(6);
75-
buffer.at(15) = right.at(7);
76-
}
77-
else
78-
{
79-
using word = array_element<words>;
80-
array_cast<word, SHA::state_words, SHA::state_words>(buffer) = right;
81-
}
82-
}
83-
8431
// public
8532
// ----------------------------------------------------------------------------
8633
// These benefit from avoiding state endian transition and reusing buffer.

include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp

Lines changed: 4 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -401,18 +401,13 @@ merkle_hash_vector(digests_t& digests) NOEXCEPT
401401
auto idigests = idigests_t{ to_half(size), data };
402402
const auto start = iblocks.size();
403403

404-
// Merkle hash vector dispatch.
404+
// Always use if available.
405405
if constexpr (use_x512)
406406
merkle_hash_vector<xint512_t>(idigests, iblocks);
407407

408-
// Use if shani is not available or at least 32 blocks.
409-
if constexpr (use_x256)
410-
{
411-
if constexpr (!native)
412-
merkle_hash_vector<xint256_t>(idigests, iblocks);
413-
else if (start >= 32_size)
414-
merkle_hash_vector<xint256_t>(idigests, iblocks);
415-
}
408+
// Only use if shani is not available.
409+
if constexpr (use_x256 && !native)
410+
merkle_hash_vector<xint256_t>(idigests, iblocks);
416411

417412
// Only use if shani is not available.
418413
if constexpr (use_x128 && !native)

include/bitcoin/system/impl/hash/sha/algorithm_native.ipp

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -104,7 +104,7 @@ round_4(xint128_t& state0, xint128_t& state1, xint128_t message) NOEXCEPT
104104

105105
TEMPLATE
106106
template <bool Swap>
107-
void CLASS::
107+
INLINE void CLASS::
108108
native_rounds(xint128_t& lo, xint128_t& hi, const block_t& block) NOEXCEPT
109109
{
110110
const auto& wblock = array_cast<xint128_t>(block);
@@ -186,11 +186,13 @@ TEMPLATE
186186
void CLASS::
187187
native_transform(state_t& state, iblocks_t& blocks) NOEXCEPT
188188
{
189+
// Individual state vars are used vs. array to ensure register persistence.
189190
auto& wstate = array_cast<xint128_t>(state);
190191
auto lo = load(wstate[0]);
191192
auto hi = load(wstate[1]);
192193
shuffle(lo, hi);
193194

195+
// native_rounds must be inlined here (register boundary).
194196
for (auto& block: blocks)
195197
native_rounds<true>(lo, hi, block);
196198

@@ -208,7 +210,10 @@ native_transform(state_t& state, const auto& block) NOEXCEPT
208210
auto lo = load(wstate[0]);
209211
auto hi = load(wstate[1]);
210212
shuffle(lo, hi);
213+
214+
// native_rounds must be inlined here (register boundary).
211215
native_rounds<Swap>(lo, hi, array_cast<byte_t>(block));
216+
212217
unshuffle(lo, hi);
213218
store(wstate[0], lo);
214219
store(wstate[1], hi);
@@ -228,6 +233,8 @@ native_finalize(state_t& state, const words_t& pad) NOEXCEPT
228233
auto lo = load(wstate[0]);
229234
auto hi = load(wstate[1]);
230235
shuffle(lo, hi);
236+
237+
// native_rounds must be inlined here (register boundary).
231238
native_rounds<false>(lo, hi, array_cast<byte_t>(pad));
232239
unshuffle(lo, hi);
233240

0 commit comments

Comments
 (0)