Mirror of https://github.com/dogecoin/dogecoin.git (synced 2026-01-31 10:30:52 +00:00)
Updated ARMv8 intrinsics for SHA-512
Updated cross build for ci environment
Updated experimental build to focal
Unified cross and native ARMv8.2 build options
Updated experimental build in ci
parent 54d5954761
commit be9df91a4d

.github/workflows/ci.yml (vendored): 4 changed lines
@@ -73,13 +73,13 @@ jobs:
       goal: install
     - name: aarch64-linux-sha512-experimental
       host: aarch64-linux-gnu
-      os: ubuntu-18.04
+      os: ubuntu-20.04
       packages: g++-aarch64-linux-gnu
       run-tests: false
       check-security: true
       check-symbols: false
       dep-opts: "NO_QT=1"
-      config-opts: "--with-armv82-crypto-cross --enable-zmq --enable-glibc-back-compat --disable-tests LDFLAGS=-static-libstdc++"
+      config-opts: "--with-armv82-crypto --enable-zmq --enable-glibc-back-compat --disable-tests LDFLAGS=-static-libstdc++"
       goal: install
     - name: aarch64-linux
       host: aarch64-linux-gnu

configure.ac: 27 changed lines
@@ -189,17 +189,11 @@ AC_ARG_WITH([armv8-crypto],
   [armv8_crypto=$withval],
   [armv8_crypto=no])
 
-AC_ARG_WITH([armv82-crypto-cross],
-  [AS_HELP_STRING([--with-armv82-crypto-cross],
-  [Build with armv8.2 crypto sha512 cross (default is no)])],
-  [armv82_crypto_cross=$withval],
-  [armv82_crypto_cross=no])
-
-AC_ARG_WITH([armv82-crypto-native],
-  [AS_HELP_STRING([--with-armv82-crypto-native],
-  [Build with armv8.2 crypto sha512 native (default is no)])],
-  [armv82_crypto_native=$withval],
-  [armv82_crypto_native=no])
+AC_ARG_WITH([armv82-crypto],
+  [AS_HELP_STRING([--with-armv82-crypto],
+  [Build with armv8.2 crypto sha512 (default is no)])],
+  [armv82_crypto=$withval],
+  [armv82_crypto=no])
 
 AC_ARG_WITH([protoc-bindir],[AS_HELP_STRING([--with-protoc-bindir=BIN_DIR],[specify protoc bin path])], [protoc_bin_path=$withval], [])
 
@@ -828,15 +822,8 @@ if test x$armv8_crypto = xyes; then
   CXXFLAGS="$CXXFLAGS -march=armv8-a+crypto"
 fi
 
-if test x$armv82_crypto_cross = xyes; then
-  AC_MSG_CHECKING([whether to build with armv8.2 crypto sha512 cross])
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(USE_ARMV82, 1, [Define this symbol if armv8.2 crypto works])
-  CXXFLAGS="$CXXFLAGS -march=armv8-a+crypto"
-fi
-
-if test x$armv82_crypto_native = xyes; then
-  AC_MSG_CHECKING([whether to build with armv8.2 crypto sha512 native])
+if test x$armv82_crypto = xyes; then
+  AC_MSG_CHECKING([whether to build with armv8.2 crypto sha512])
   AC_MSG_RESULT(yes)
   AC_DEFINE(USE_ARMV82, 1, [Define this symbol if armv8.2 crypto works])
   CXXFLAGS="$CXXFLAGS -march=armv8.2-a+crypto+sha3"

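The unified --with-armv82-crypto switch only defines USE_ARMV82 and appends -march=armv8.2-a+crypto+sha3 to CXXFLAGS; it does not probe the toolchain. A minimal compile-time guard, sketched here as an assumption rather than something in this commit, could catch a flag/define mismatch via the ACLE feature macro for the SHA-512 instructions:

    // Sketch only (not in this commit): refuse to build if USE_ARMV82 is set
    // but the compiler was not given a target that exposes the SHA-512
    // intrinsics. __ARM_FEATURE_SHA512 is the ACLE macro set by such targets.
    #if defined(USE_ARMV82) && !defined(__ARM_FEATURE_SHA512)
    #error "USE_ARMV82 requires -march=armv8.2-a+crypto+sha3 or newer"
    #endif
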
src/crypto/sha512.cpp

@@ -29,7 +29,7 @@
 # endif
 #endif /** ARM Headers */
 
-static const uint64_t K512[] =
+static const uint64_t sha512_round_constants[] =
 {
     0x428a2f98d728ae22, 0x7137449123ef65cd,
     0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
@@ -95,30 +95,201 @@ void inline Round(uint64_t a, uint64_t b, uint64_t c, uint64_t& d, uint64_t e, u
     d += t1;
     h = t1 + t2;
 }
 #endif
 
-#elif USE_ARMV82
-/** One round of SHA512 (ARMV8.2). */
-#define Round (n, a, b, c, d, e, f, g, h, w0, w1, w2, w3, w4)
-{
-    uint64x2_t t, fg, de;
-    t = vaddq_u64 (a, vld1q_u64 (K512 + n * 2));
-    t = vreinterpretq_u64_u8 (vextq_u8 (vreinterpretq_u8_u64 (t),
-                                        vreinterpretq_u8_u64 (t), 8));
-    de = vreinterpretq_u64_u8 (vextq_u8 (vreinterpretq_u8_u64 (w1),
-                                         vreinterpretq_u8_u64 (w2), 8));
-    fg = vreinterpretq_u64_u8 (vextq_u8 (vreinterpretq_u8_u64 (w2),
-                                         vreinterpretq_u8_u64 (w3), 8));
-    w3 = vaddq_u64 (w3, t);
-    w3 = vsha512hq_u64 (w3, fg, de);
-    w4 = vaddq_u64 (w1, w3);
-    w3 = vsha512h2q_u64 (w3, w1, w0);
-    if (n <32) {
-        a = vsha512su0q_u64 (a, b);
-        a = vsha512su1q_u64 (a, h,
-                             vextq_u8 (vreinterpretq_u8_u64 (e),
-                                       vreinterpretq_u8_u64 (f), 8));
-    }
-}
+#ifdef USE_ARMV82
+
+/* ----------------------------------------------------------------------
+ * Hardware-accelerated implementation of SHA-512 using Arm NEON.
+ */
+
+typedef struct sha512_neon_core sha512_neon_core;
+struct sha512_neon_core {
+    uint64x2_t ab, cd, ef, gh;
+};
+
+static inline uint64x2_t sha512_neon_load_input(const uint8_t *p)
+{
+    return vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(p)));
+}
+
+static inline uint64x2_t sha512_neon_schedule_update(
+    uint64x2_t m8, uint64x2_t m7, uint64x2_t m4, uint64x2_t m3, uint64x2_t m1)
+{
+    /*
+     * vsha512su0q_u64() takes words from a long way back in the
+     * schedule and performs the sigma_0 half of the computation of
+     * the next two 64-bit message-schedule words.
+     *
+     * vsha512su1q_u64() combines the result of that with the sigma_1
+     * steps, to output the finished version of those two words. The
+     * total amount of input data it requires fits nicely into three
+     * 128-bit vector registers, but one of those registers is
+     * misaligned compared to the 128-bit chunks that the message
+     * schedule is stored in. So we use vextq_u64 to make one of its
+     * input words out of the second half of m4 and the first half of
+     * m3.
+     */
+    return vsha512su1q_u64(vsha512su0q_u64(m8, m7), m1, vextq_u64(m4, m3, 1));
+}
+
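For reference, the pair of schedule words produced by this helper follows the standard SHA-512 message-schedule recurrence. A scalar sketch of the same computation (helper names are illustrative, not code from this diff):

    // Scalar equivalent of what vsha512su0q_u64/vsha512su1q_u64 compute above:
    // W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
    static inline uint64_t rotr64(uint64_t x, int n) { return (x >> n) | (x << (64 - n)); }
    static inline uint64_t sigma0(uint64_t x) { return rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7); }
    static inline uint64_t sigma1(uint64_t x) { return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6); }
    static inline uint64_t schedule_word(const uint64_t W[80], int t)
    {
        return sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16];
    }
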
+static inline void sha512_neon_round2(
+    unsigned round_index, uint64x2_t schedule_words,
+    uint64x2_t *ab, uint64x2_t *cd, uint64x2_t *ef, uint64x2_t *gh)
+{
+    /*
+     * vsha512hq_u64 performs the Sigma_1 and Ch half of the
+     * computation of two rounds of SHA-512 (including feeding back
+     * one of the outputs from the first of those half-rounds into the
+     * second one).
+     *
+     * vsha512h2q_u64 combines the result of that with the Sigma_0 and
+     * Maj steps, and outputs one 128-bit vector that replaces the gh
+     * piece of the input hash state, and a second that updates cd by
+     * addition.
+     *
+     * Similarly to vsha512su1q_u64 above, some of the input registers
+     * expected by these instructions are misaligned by 64 bits
+     * relative to the chunks we've divided the hash state into, so we
+     * have to start by making 'de' and 'fg' words out of our input
+     * cd,ef,gh, using vextq_u64.
+     *
+     * Also, one of the inputs to vsha512hq_u64 is expected to contain
+     * the results of summing gh + two round constants + two words of
+     * message schedule, but the two words of the message schedule
+     * have to be the opposite way round in the vector register from
+     * the way that vsha512su1q_u64 output them. Hence, there's
+     * another vextq_u64 in here that swaps the two halves of the
+     * initial_sum vector register.
+     *
+     * (This also means that I don't have to prepare a specially
+     * reordered version of the sha512_round_constants[] array: as
+     * long as I'm unavoidably doing a swap at run time _anyway_, I
+     * can load from the normally ordered version of that array, and
+     * just take care to fold in that data _before_ the swap rather
+     * than after.)
+     */
+
+    /* Load two round constants, with the first one in the low half */
+    uint64x2_t round_constants = vld1q_u64(
+        sha512_round_constants + round_index);
+
+    /* Add schedule words to round constants */
+    uint64x2_t initial_sum = vaddq_u64(schedule_words, round_constants);
+
+    /* Swap that sum around so the word used in the first of the two
+     * rounds is in the _high_ half of the vector, matching where h
+     * lives in the gh vector */
+    uint64x2_t swapped_initial_sum = vextq_u64(initial_sum, initial_sum, 1);
+
+    /* Add gh to that, now that they're matching ways round */
+    uint64x2_t sum = vaddq_u64(swapped_initial_sum, *gh);
+
+    /* Make the misaligned de and fg words */
+    uint64x2_t de = vextq_u64(*cd, *ef, 1);
+    uint64x2_t fg = vextq_u64(*ef, *gh, 1);
+
+    /* Now we're ready to put all the pieces together. The output from
+     * vsha512h2q_u64 can be used directly as the new gh, and the
+     * output from vsha512hq_u64 is simultaneously the intermediate
+     * value passed to h2 and the thing you have to add on to cd. */
+    uint64x2_t intermed = vsha512hq_u64(sum, fg, de);
+    *gh = vsha512h2q_u64(intermed, *cd, *ab);
+    *cd = vaddq_u64(*cd, intermed);
+}
+
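Likewise, the two rounds fused by vsha512hq_u64/vsha512h2q_u64 match the textbook SHA-512 round function, which the existing scalar Round() earlier in this file already implements. An illustrative scalar version of a single round (names are for exposition only):

    // One scalar SHA-512 round; sha512_neon_round2 above performs two of these
    // per call. rotr64 is the same helper as in the schedule sketch.
    static inline uint64_t Ch(uint64_t x, uint64_t y, uint64_t z) { return z ^ (x & (y ^ z)); }
    static inline uint64_t Maj(uint64_t x, uint64_t y, uint64_t z) { return (x & y) | (z & (x | y)); }
    static inline uint64_t Sigma0(uint64_t x) { return rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39); }
    static inline uint64_t Sigma1(uint64_t x) { return rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41); }
    static inline void scalar_round(uint64_t a, uint64_t b, uint64_t c, uint64_t& d,
                                    uint64_t e, uint64_t f, uint64_t g, uint64_t& h,
                                    uint64_t k, uint64_t w)
    {
        uint64_t t1 = h + Sigma1(e) + Ch(e, f, g) + k + w;
        uint64_t t2 = Sigma0(a) + Maj(a, b, c);
        d += t1;
        h = t1 + t2;
    }
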
+static inline void sha512_neon_block(sha512_neon_core *core, const uint8_t *p)
+{
+    uint64x2_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+    uint64x2_t ab = core->ab, cd = core->cd, ef = core->ef, gh = core->gh;
+
+    s0 = sha512_neon_load_input(p + 16*0);
+    sha512_neon_round2(0, s0, &ab, &cd, &ef, &gh);
+    s1 = sha512_neon_load_input(p + 16*1);
+    sha512_neon_round2(2, s1, &gh, &ab, &cd, &ef);
+    s2 = sha512_neon_load_input(p + 16*2);
+    sha512_neon_round2(4, s2, &ef, &gh, &ab, &cd);
+    s3 = sha512_neon_load_input(p + 16*3);
+    sha512_neon_round2(6, s3, &cd, &ef, &gh, &ab);
+    s4 = sha512_neon_load_input(p + 16*4);
+    sha512_neon_round2(8, s4, &ab, &cd, &ef, &gh);
+    s5 = sha512_neon_load_input(p + 16*5);
+    sha512_neon_round2(10, s5, &gh, &ab, &cd, &ef);
+    s6 = sha512_neon_load_input(p + 16*6);
+    sha512_neon_round2(12, s6, &ef, &gh, &ab, &cd);
+    s7 = sha512_neon_load_input(p + 16*7);
+    sha512_neon_round2(14, s7, &cd, &ef, &gh, &ab);
+    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
+    sha512_neon_round2(16, s0, &ab, &cd, &ef, &gh);
+    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
+    sha512_neon_round2(18, s1, &gh, &ab, &cd, &ef);
+    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
+    sha512_neon_round2(20, s2, &ef, &gh, &ab, &cd);
+    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
+    sha512_neon_round2(22, s3, &cd, &ef, &gh, &ab);
+    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
+    sha512_neon_round2(24, s4, &ab, &cd, &ef, &gh);
+    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
+    sha512_neon_round2(26, s5, &gh, &ab, &cd, &ef);
+    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
+    sha512_neon_round2(28, s6, &ef, &gh, &ab, &cd);
+    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
+    sha512_neon_round2(30, s7, &cd, &ef, &gh, &ab);
+    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
+    sha512_neon_round2(32, s0, &ab, &cd, &ef, &gh);
+    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
+    sha512_neon_round2(34, s1, &gh, &ab, &cd, &ef);
+    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
+    sha512_neon_round2(36, s2, &ef, &gh, &ab, &cd);
+    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
+    sha512_neon_round2(38, s3, &cd, &ef, &gh, &ab);
+    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
+    sha512_neon_round2(40, s4, &ab, &cd, &ef, &gh);
+    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
+    sha512_neon_round2(42, s5, &gh, &ab, &cd, &ef);
+    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
+    sha512_neon_round2(44, s6, &ef, &gh, &ab, &cd);
+    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
+    sha512_neon_round2(46, s7, &cd, &ef, &gh, &ab);
+    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
+    sha512_neon_round2(48, s0, &ab, &cd, &ef, &gh);
+    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
+    sha512_neon_round2(50, s1, &gh, &ab, &cd, &ef);
+    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
+    sha512_neon_round2(52, s2, &ef, &gh, &ab, &cd);
+    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
+    sha512_neon_round2(54, s3, &cd, &ef, &gh, &ab);
+    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
+    sha512_neon_round2(56, s4, &ab, &cd, &ef, &gh);
+    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
+    sha512_neon_round2(58, s5, &gh, &ab, &cd, &ef);
+    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
+    sha512_neon_round2(60, s6, &ef, &gh, &ab, &cd);
+    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
+    sha512_neon_round2(62, s7, &cd, &ef, &gh, &ab);
+    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
+    sha512_neon_round2(64, s0, &ab, &cd, &ef, &gh);
+    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
+    sha512_neon_round2(66, s1, &gh, &ab, &cd, &ef);
+    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
+    sha512_neon_round2(68, s2, &ef, &gh, &ab, &cd);
+    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
+    sha512_neon_round2(70, s3, &cd, &ef, &gh, &ab);
+    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
+    sha512_neon_round2(72, s4, &ab, &cd, &ef, &gh);
+    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
+    sha512_neon_round2(74, s5, &gh, &ab, &cd, &ef);
+    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
+    sha512_neon_round2(76, s6, &ef, &gh, &ab, &cd);
+    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
+    sha512_neon_round2(78, s7, &cd, &ef, &gh, &ab);
+
+    core->ab = vaddq_u64(core->ab, ab);
+    core->cd = vaddq_u64(core->cd, cd);
+    core->ef = vaddq_u64(core->ef, ef);
+    core->gh = vaddq_u64(core->gh, gh);
+}
+#endif
 
 /** Initialize SHA-512 state. */
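The Transform hunk below integrates this for a single 128-byte block. A caller hashing a longer, block-aligned buffer would simply iterate; a sketch under that assumption (not part of the diff):

    // Sketch only: process a whole number of 1024-bit (128-byte) blocks by
    // repeatedly invoking the per-block NEON routine added above.
    static void sha512_neon_blocks(sha512_neon_core* core, const uint8_t* p, size_t nblocks)
    {
        while (nblocks--) {
            sha512_neon_block(core, p);
            p += 128;
        }
    }
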
@@ -140,7 +311,25 @@ void Transform(uint64_t* s, const unsigned char* chunk)
 #ifdef USE_AVX2
     // Perform SHA512 one block (Intel AVX2)
     sha512_one_block_avx2(chunk, s);
+#elif USE_ARMV82
+    sha512_neon_core core;
+
+    core.ab = vld1q_u64(s);
+    core.cd = vld1q_u64(s+2);
+    core.ef = vld1q_u64(s+4);
+    core.gh = vld1q_u64(s+6);
+
+    // Perform SHA512 one block (ARMv8.2)
+    sha512_neon_block(&core, chunk);
+
+    s[0] = vgetq_lane_u64 (core.ab, 0);
+    s[1] = vgetq_lane_u64 (core.ab, 1);
+    s[2] = vgetq_lane_u64 (core.cd, 0);
+    s[3] = vgetq_lane_u64 (core.cd, 1);
+    s[4] = vgetq_lane_u64 (core.ef, 0);
+    s[5] = vgetq_lane_u64 (core.ef, 1);
+    s[6] = vgetq_lane_u64 (core.gh, 0);
+    s[7] = vgetq_lane_u64 (core.gh, 1);
 #else
     // Perform SHA512 one block (legacy)
     uint64_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4], f = s[5], g = s[6], h = s[7];
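Because this changes the Transform path used on ARMv8.2 hardware, a quick known-answer check is a sensible sanity test. The sketch below assumes the existing CSHA512 wrapper from crypto/sha512.h (Write/Finalize, OUTPUT_SIZE = 64), which is outside this diff, and checks the published SHA-512 digest of "abc":

    // Sketch only: compare the accelerated path against the standard
    // SHA-512("abc") test vector using the existing CSHA512 interface.
    #include "crypto/sha512.h"
    #include <cassert>
    #include <cstdio>
    #include <cstring>

    static void sha512_known_answer_check()
    {
        static const char expected[] =
            "ddaf35a193617abacc417349ae20413112e6fa4e89a97ea20a9eeee64b55d39a"
            "2192992a274fc1a836ba3c23a3feebbd454d4423643ce80e2a9ac94fa54ca49f";
        unsigned char out[CSHA512::OUTPUT_SIZE];
        CSHA512().Write((const unsigned char*)"abc", 3).Finalize(out);

        char hex[2 * CSHA512::OUTPUT_SIZE + 1];
        for (size_t i = 0; i < CSHA512::OUTPUT_SIZE; ++i)
            snprintf(hex + 2 * i, 3, "%02x", out[i]);
        assert(strcmp(hex, expected) == 0);
    }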