Skip to content

Commit 5480d11

Browse files
authored
Merge pull request #1559 from evoskuil/master
Replace buffered shani with rotating.
2 parents 2df85a0 + 99167b1 commit 5480d11

14 files changed

Lines changed: 369 additions & 440 deletions

include/bitcoin/system/data/iterable.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ class iterable
175175
return begin_;
176176
}
177177

178-
template <size_t Elements>
178+
template <size_t Elements = one>
179179
inline iterable& advance() NOEXCEPT
180180
{
181181
// This is safe for overflow, will advance to end.
@@ -185,7 +185,7 @@ class iterable
185185
return *this;
186186
}
187187

188-
template <size_t Elements>
188+
template <size_t Elements = one>
189189
inline const std_array<value_t, Elements>& to_array() const NOEXCEPT
190190
{
191191
return unsafe_array_cast<value_t, Elements>(begin_);

include/bitcoin/system/hash/sha/algorithm.hpp

Lines changed: 20 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,11 @@ class algorithm
281281
INLINE static void iterate_vector(state_t& state,
282282
iblocks_t& blocks) NOEXCEPT;
283283

284+
template <size_t Size>
285+
INLINE static void iterate_native(state_t& state,
286+
const ablocks_t<Size>& blocks) NOEXCEPT;
287+
INLINE static void iterate_native(state_t& state, iblocks_t& blocks) NOEXCEPT;
288+
284289
template <size_t Size>
285290
INLINE static constexpr void iterate_(state_t& state,
286291
const ablocks_t<Size>& blocks) NOEXCEPT;
@@ -317,7 +322,8 @@ class algorithm
317322
const xstate_t<xWord>& xstate) NOEXCEPT;
318323

319324
template <typename xWord, if_extended<xWord> = true>
320-
INLINE static void merkle_hash_vector(idigests_t& digests, iblocks_t& blocks) NOEXCEPT;
325+
INLINE static void merkle_hash_vector(idigests_t& digests,
326+
iblocks_t& blocks) NOEXCEPT;
321327
INLINE static void merkle_hash_vector(digests_t& digests) NOEXCEPT;
322328
VCONSTEXPR static void merkle_hash_(digests_t& digests,
323329
size_t offset=zero) NOEXCEPT;
@@ -330,10 +336,10 @@ class algorithm
330336
auto x6, auto x7, auto x8) NOEXCEPT;
331337

332338
template<size_t Round, size_t Offset>
333-
INLINE static void prepare1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT;
339+
INLINE static void prepare_1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT;
334340

335341
template<size_t Round>
336-
INLINE static void prepare8(buffer_t& buffer) NOEXCEPT;
342+
INLINE static void prepare_8(buffer_t& buffer) NOEXCEPT;
337343

338344
template <typename xWord>
339345
INLINE static void schedule_sigma(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
@@ -357,45 +363,24 @@ class algorithm
357363
/// Native SHA optimizations (single blocks).
358364
/// -----------------------------------------------------------------------
359365

360-
template<size_t Round>
361-
INLINE static void prepare_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
362-
static void schedule_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
363-
364-
template <typename xWord>
365-
INLINE static void schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
366-
INLINE static void schedule_native(buffer_t& buffer) NOEXCEPT;
367-
368-
template<size_t Round, size_t Lane>
369-
INLINE static void round_native(wstate_t<xint128_t>& state,
370-
const wbuffer_t<xint128_t>& wk) NOEXCEPT;
371-
372-
INLINE static void shuffle(wstate_t<xint128_t>& wstate) NOEXCEPT;
373-
INLINE static void unshuffle(wstate_t<xint128_t>& wstate) NOEXCEPT;
374-
INLINE static void summarize_native(wstate_t<xint128_t>& out,
375-
const wstate_t<xint128_t>& in) NOEXCEPT;
366+
INLINE static void shuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT;
367+
INLINE static void unshuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT;
368+
INLINE static void prepare(xint128_t& message0, xint128_t message1) NOEXCEPT;
369+
INLINE static void prepare(xint128_t& message0, xint128_t message1,
370+
xint128_t& message2) NOEXCEPT;
376371

377-
template <size_t Lane>
378-
static void compress_native(wstate_t<xint128_t>& state,
379-
const wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
380-
381-
template <typename xWord, size_t Lane>
382-
INLINE static void compress_native(xstate_t<xWord>& xstate,
383-
const xbuffer_t<xWord>& xbuffer) NOEXCEPT;
384-
385-
template <typename xWord, size_t Lane>
386-
INLINE static void compress_native(state_t& state,
387-
const xbuffer_t<xWord>& xbuffer) NOEXCEPT;
372+
template <size_t Round>
373+
INLINE static void round_4(xint128_t& state0, xint128_t& state1,
374+
xint128_t message) NOEXCEPT;
388375

389-
template <size_t Lane>
390-
INLINE static void compress_native(state_t& state,
391-
const buffer_t& buffer) NOEXCEPT;
376+
static void native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT;
392377

393378
public:
394379
/// Summary public values.
395380
/// -----------------------------------------------------------------------
396381
static constexpr auto caching = Cached;
397-
static constexpr auto native = (use_shani || use_neon) &&
398-
!is_same_size<word_t, uint64_t>;
382+
static constexpr auto native = (use_shani || use_neon)
383+
&& (SHA::strength == 256 || SHA::strength == 160);
399384
static constexpr auto vector = (use_x128 || use_x256 || use_x512)
400385
&& !(build_x32 && is_same_size<word_t, uint64_t>);
401386
};

include/bitcoin/system/have.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,12 +110,13 @@
110110
#define HAVE_XASSEMBLY
111111
#endif
112112

113+
/// DISABLED
113114
/// ARM Neon intrinsics.
114115
#if defined(HAVE_ARM)
115116
// -march=armv8-a+crc+crypto [all]
116117
// -arch arm64 [apple] (also -isysroot to phone sdk)
117118
#if defined(HAVE_GNUC) || defined(__ARM_NEON) || defined(HAVE_MSC)
118-
#define HAVE_NEON
119+
////#define HAVE_NEON
119120
#endif
120121
#endif
121122

include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp

Lines changed: 5 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -71,17 +71,6 @@ round(auto a, auto& b, auto c, auto d, auto& e, auto wk) NOEXCEPT
7171

7272
e = /*a =*/ f::add<s>(f::add<s>(f::add<s>(f::rol<5, s>(a), fn(b, c, d)), e), wk);
7373
b = /*c =*/ f::rol<30, s>(b);
74-
75-
// SHA-NI
76-
// Four rounds (total rounds 80/4).
77-
// First round is add(e, w), then sha1nexte(e, w).
78-
// fk is round-based enumeration implying f selection and k value.
79-
// e1 = sha1nexte(e0, w);
80-
// abcd = sha1rnds4(abcd, e0, fk);
81-
// NEON
82-
// f is implied by k in wk.
83-
// e1 = vsha1h(vgetq_lane(abcd, 0));
84-
// vsha1cq(abcd, e0, vaddq(w, k));
8574
}
8675

8776
TEMPLATE
@@ -97,16 +86,6 @@ round(auto a, auto b, auto c, auto& d, auto e, auto f, auto g, auto& h,
9786
const auto t = f::add<s>(f::add<s>(f::add<s>(Sigma1(e), choice(e, f, g)), h), wk);
9887
d = /*e =*/ f::add<s>(d, t);
9988
h = /*a =*/ f::add<s>(f::add<s>(Sigma0(a), majority(a, b, c)), t);
100-
101-
// Each call is 2 rounds, s, w and k are 128 (4 words each, s1/s2 is 8 word state).
102-
// SHA-NI
103-
// const auto value = add(w, k);
104-
// abcd = sha256rnds2(abcd, efgh, value);
105-
// efgh = sha256rnds2(efgh, abcd, shuffle(value));
106-
// NEON
107-
// const auto value = vaddq(w, k);
108-
// abcd = vsha256hq(abcd, efgh, value);
109-
// efgh = vsha256h2q(efgh, abcd, value);
11089
}
11190

11291
TEMPLATE
@@ -125,10 +104,6 @@ round(auto& state, const auto& wk) NOEXCEPT
125104
state[(SHA::rounds + 3 - Round) % SHA::state_words],
126105
state[(SHA::rounds + 4 - Round) % SHA::state_words], // a->e
127106
extract<word, Lane>(wk[Round]));
128-
129-
// SHA-NI/NEON
130-
// State packs in 128 (one state variable), reduces above to 1 out[].
131-
// Input value is 128 (w). Constants (k) statically initialized as 128.
132107
}
133108
else
134109
{
@@ -142,10 +117,6 @@ round(auto& state, const auto& wk) NOEXCEPT
142117
state[(SHA::rounds + 6 - Round) % SHA::state_words],
143118
state[(SHA::rounds + 7 - Round) % SHA::state_words], // a->h
144119
extract<word, Lane>(wk[Round]));
145-
146-
// SHA-NI/NEON
147-
// Each element is 128 (vs. 32), reduces above to 2 out[] (s0/s1).
148-
// Input value is 128 (w). Constants (k) statically initialized as 128.
149120
}
150121
}
151122

@@ -276,11 +247,11 @@ compress(state_t& state, const buffer_t& buffer) NOEXCEPT
276247
{
277248
compress_<Lane>(state, buffer);
278249
}
279-
else if constexpr (native)
280-
{
281-
// Single block shani compression optimization.
282-
compress_native<Lane>(state, buffer);
283-
}
250+
////else if constexpr (native)
251+
////{
252+
//// // Single block shani compression optimization.
253+
//// compress_native<Lane>(state, buffer);
254+
////}
284255
////else if constexpr (vector)
285256
////{
286257
//// // Compression is not vectorized within a block, however this is

include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ iterate_vector(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
228228
{
229229
if (blocks.size() >= min_lanes)
230230
{
231-
auto iblocks = iblocks_t{ array_cast<byte_t>(blocks) };
231+
iblocks_t iblocks{ array_cast<byte_t>(blocks) };
232232
iterate_vector(state, iblocks);
233233
}
234234
else
@@ -237,6 +237,31 @@ iterate_vector(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
237237
}
238238
}
239239

240+
// Native SHA
241+
// ============================================================================
242+
// www.intel.com/content/dam/develop/external/us/en/documents/
243+
// intel-sha-extensions-white-paper-402097.pdf
244+
245+
TEMPLATE
246+
INLINE void CLASS::
247+
iterate_native(state_t& state, iblocks_t& blocks) NOEXCEPT
248+
{
249+
native_rounds(state, blocks);
250+
}
251+
252+
TEMPLATE
253+
template <size_t Size>
254+
INLINE void CLASS::
255+
iterate_native(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
256+
{
257+
iblocks_t iblocks{ array_cast<byte_t>(blocks) };
258+
native_rounds(state, iblocks);
259+
}
260+
261+
// Dispatch and normal forms.
262+
// ============================================================================
263+
// protected
264+
240265
TEMPLATE
241266
template <size_t Size>
242267
INLINE constexpr void CLASS::
@@ -273,11 +298,9 @@ iterate(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
273298
{
274299
iterate_(state, blocks);
275300
}
276-
else if constexpr (native)
301+
else if constexpr (native && SHA::strength == 256)
277302
{
278-
// TODO: evaluate 4/8/16 lane message scheduling vs. shani scheduling.
279-
// Multiple block shani message scheduling and compression optimization.
280-
iterate_(state, blocks);
303+
iterate_native(state, blocks);
281304
}
282305
else if constexpr (vector)
283306
{
@@ -294,11 +317,9 @@ TEMPLATE
294317
INLINE void CLASS::
295318
iterate(state_t& state, iblocks_t& blocks) NOEXCEPT
296319
{
297-
if constexpr (native)
320+
if constexpr (native && SHA::strength == 256)
298321
{
299-
// TODO: evaluate 4/8/16 lane message scheduling vs. shani scheduling.
300-
// Multiple block shani message scheduling and compression optimization.
301-
iterate_(state, blocks);
322+
iterate_native(state, blocks);
302323
}
303324
else if constexpr (vector)
304325
{

include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,27 +50,26 @@ template<size_t Round, typename xWord>
5050
INLINE void CLASS::
5151
vector_konstant(wbuffer_t<xWord>& wbuffer) NOEXCEPT
5252
{
53-
constexpr auto s = SHA::word_bits;
5453
constexpr auto lanes = capacity<xWord, word_t>;
5554
constexpr auto r = Round * lanes;
5655

5756
if constexpr (lanes == 16)
5857
{
59-
wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
58+
wbuffer[Round] = add<word_t>(wbuffer[Round], set<xWord>(
6059
K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3],
6160
K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7],
6261
K::get[r + 8], K::get[r + 9], K::get[r + 10], K::get[r + 11],
6362
K::get[r + 12], K::get[r + 13], K::get[r + 14], K::get[r + 15]));
6463
}
6564
else if constexpr (lanes == 8)
6665
{
67-
wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
66+
wbuffer[Round] = add<word_t>(wbuffer[Round], set<xWord>(
6867
K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3],
6968
K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7]));
7069
}
7170
else if constexpr (lanes == 4)
7271
{
73-
wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
72+
wbuffer[Round] = add<word_t>(wbuffer[Round], set<xWord>(
7473
K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3]));
7574
}
7675
}

include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,10 @@ merkle_hash_vector(digests_t& digests) NOEXCEPT
421421
// ----------------------------------------------------------------------------
422422
// public
423423

424+
// TODO: consider eliminating endianness conversions internal to the root
425+
// computation, instead converting on way in and way out only, and using non
426+
// converting input/output (nop) functions.
427+
424428
TEMPLATE
425429
VCONSTEXPR typename CLASS::digest_t CLASS::
426430
merkle_root(digests_t&& digests) NOEXCEPT

0 commit comments

Comments
 (0)