/*
 * SHA-512 algorithm as described at
 *
 *   http://csrc.nist.gov/cryptval/shs.html
 *
 * Modifications made for SHA-384 also
 */

#include <assert.h>
#include "ssh.h"

/*
 * Start by deciding whether we can support hardware SHA at all.
 */
#define HW_SHA512_NONE 0
#define HW_SHA512_NEON 1

#ifdef _FORCE_SHA512_NEON
#   define HW_SHA512 HW_SHA512_NEON
#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    /* Arm can potentially support both endiannesses, but this code
     * hasn't been tested on anything but little. If anyone wants to
     * run big-endian, they'll need to fix it first. */
#elif defined __ARM_FEATURE_SHA512
    /* If the Arm SHA-512 extension is available already, we can
     * support NEON SHA without having to enable anything by hand */
#   define HW_SHA512 HW_SHA512_NEON
#elif defined(__clang__)
#   if __has_attribute(target) && __has_include(<arm_neon.h>) &&       \
    (defined(__aarch64__))
        /* clang can enable the crypto extension in AArch64 using
         * __attribute__((target)) */
#       define HW_SHA512 HW_SHA512_NEON
#       define USE_CLANG_ATTR_TARGET_AARCH64
#   endif
#endif

#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA512
#   undef HW_SHA512
#   define HW_SHA512 HW_SHA512_NONE
#endif

/*
 * The actual query function that asks if hardware acceleration is
 * available.
 */
static bool sha512_hw_available(void);

/*
 * The top-level selection function, caching the results of
 * sha512_hw_available() so it only has to run once.
 */
static bool sha512_hw_available_cached(void)
{
    static bool initialised = false;
    static bool hw_available;
    if (!initialised) {
        hw_available = sha512_hw_available();
        initialised = true;
    }
    return hw_available;
}

struct sha512_select_options {
    const ssh_hashalg *hw, *sw;
};

static ssh_hash *sha512_select(const ssh_hashalg *alg)
{
    const struct sha512_select_options *options =
        (const struct sha512_select_options *)alg->extra;

    const ssh_hashalg *real_alg =
        sha512_hw_available_cached() ? options->hw : options->sw;

    return ssh_hash_new(real_alg);
}

const struct sha512_select_options ssh_sha512_select_options = {
    &ssh_sha512_hw, &ssh_sha512_sw,
};
const struct sha512_select_options ssh_sha384_select_options = {
    &ssh_sha384_hw, &ssh_sha384_sw,
};

const ssh_hashalg ssh_sha512 = {
    .new = sha512_select,
    .hlen = 64,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-512", "dummy selector vtable"),
    .extra = &ssh_sha512_select_options,
};

const ssh_hashalg ssh_sha384 = {
    .new = sha512_select,
    .hlen = 48,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-384", "dummy selector vtable"),
    .extra = &ssh_sha384_select_options,
};
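
/*
 * A minimal usage sketch (assuming the generic ssh_hash helpers
 * declared in ssh.h, such as ssh_hash_new(), put_data() and
 * ssh_hash_final()): callers never see the hardware/software
 * selection above, they just hash through the vtable.
 *
 *     unsigned char digest[64];
 *     ssh_hash *h = ssh_hash_new(&ssh_sha512);
 *     put_data(h, "abc", 3);      // data arrives via BinarySink
 *     ssh_hash_final(h, digest);  // emits the digest and frees h
 */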

/* ----------------------------------------------------------------------
 * Definitions likely to be helpful to multiple implementations.
 */

static const uint64_t sha512_initial_state[] = {
    0x6a09e667f3bcc908ULL,
    0xbb67ae8584caa73bULL,
    0x3c6ef372fe94f82bULL,
    0xa54ff53a5f1d36f1ULL,
    0x510e527fade682d1ULL,
    0x9b05688c2b3e6c1fULL,
    0x1f83d9abfb41bd6bULL,
    0x5be0cd19137e2179ULL,
};

static const uint64_t sha384_initial_state[] = {
    0xcbbb9d5dc1059ed8ULL,
    0x629a292a367cd507ULL,
    0x9159015a3070dd17ULL,
    0x152fecd8f70e5939ULL,
    0x67332667ffc00b31ULL,
    0x8eb44a8768581511ULL,
    0xdb0c2e0d64f98fa7ULL,
    0x47b5481dbefa4fa4ULL,
};

static const uint64_t sha512_round_constants[] = {
    0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
    0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
    0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
    0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
    0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
    0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
    0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
    0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
    0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
    0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
    0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
    0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
    0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
    0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
    0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
    0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
    0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
    0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
    0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
    0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
    0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
    0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
    0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
    0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
    0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
    0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
    0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
    0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
    0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
    0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
    0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
    0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
    0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
    0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
    0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
    0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
    0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
    0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
    0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
    0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
};

#define SHA512_ROUNDS 80

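/*
 * Shared structure for buffering an input block. 'used' counts the
 * bytes of the current partial block; lenhi:lenlo together hold the
 * total message length so far in *bits*, as the 128-bit quantity
 * that SHA-512's length trailer requires.
 */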
typedef struct sha512_block sha512_block;
struct sha512_block {
    uint8_t block[128];
    size_t used;
    uint64_t lenhi, lenlo;
};

static inline void sha512_block_setup(sha512_block *blk)
{
    blk->used = 0;
    blk->lenhi = blk->lenlo = 0;
}

static inline bool sha512_block_write(
    sha512_block *blk, const void **vdata, size_t *len)
{
    size_t blkleft = sizeof(blk->block) - blk->used;
    size_t chunk = *len < blkleft ? *len : blkleft;

    const uint8_t *p = *vdata;
    memcpy(blk->block + blk->used, p, chunk);
    *vdata = p + chunk;
    *len -= chunk;
    blk->used += chunk;

    size_t chunkbits = chunk << 3;

    blk->lenlo += chunkbits;
    blk->lenhi += (blk->lenlo < chunkbits);

    if (blk->used == sizeof(blk->block)) {
        blk->used = 0;
        return true;
    }

    return false;
}

static inline void sha512_block_pad(sha512_block *blk, BinarySink *bs)
{
    uint64_t final_lenhi = blk->lenhi;
    uint64_t final_lenlo = blk->lenlo;
    size_t pad = 127 & (111 - blk->used);

    put_byte(bs, 0x80);
    put_padding(bs, pad, 0);
    put_uint64(bs, final_lenhi);
    put_uint64(bs, final_lenlo);

    assert(blk->used == 0 && "Should have exactly hit a block boundary");
}
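
/*
 * A worked check of the padding arithmetic above: the padded message
 * must end with the 0x80 byte, the zero padding and the 16-byte
 * length field exactly on a 128-byte block boundary, so we need
 * used + 1 + pad + 16 == 0 (mod 128), i.e. pad == 111 - used
 * (mod 128). E.g. used == 0 gives pad == 111 (exactly one block),
 * and used == 112 gives pad == 127, spilling into a second block.
 */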

/* ----------------------------------------------------------------------
 * Software implementation of SHA-512.
 */

static inline uint64_t ror(uint64_t x, unsigned y)
{
    return (x << (63 & -y)) | (x >> (63 & y));
}
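
/*
 * The '63 &' masks above keep both shift counts within [0,63], so
 * even a nominal rotation by 0 or a multiple of 64 never shifts a
 * 64-bit value by 64, which would be undefined behaviour in C.
 * (63 & -y) is just (64 - y) mod 64.
 */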

static inline uint64_t Ch(uint64_t ctrl, uint64_t if1, uint64_t if0)
{
    return if0 ^ (ctrl & (if1 ^ if0));
}

static inline uint64_t Maj(uint64_t x, uint64_t y, uint64_t z)
{
    return (x & y) | (z & (x | y));
}

static inline uint64_t Sigma_0(uint64_t x)
{
    return ror(x,28) ^ ror(x,34) ^ ror(x,39);
}

static inline uint64_t Sigma_1(uint64_t x)
{
    return ror(x,14) ^ ror(x,18) ^ ror(x,41);
}

static inline uint64_t sigma_0(uint64_t x)
{
    return ror(x,1) ^ ror(x,8) ^ (x >> 7);
}

static inline uint64_t sigma_1(uint64_t x)
{
    return ror(x,19) ^ ror(x,61) ^ (x >> 6);
}

static inline void sha512_sw_round(
    unsigned round_index, const uint64_t *schedule,
    uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d,
    uint64_t *e, uint64_t *f, uint64_t *g, uint64_t *h)
{
    uint64_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) +
        sha512_round_constants[round_index] + schedule[round_index];

    uint64_t t2 = Sigma_0(*a) + Maj(*a,*b,*c);

    *d += t1;
    *h = t1 + t2;
}

static void sha512_sw_block(uint64_t *core, const uint8_t *block)
{
    uint64_t w[SHA512_ROUNDS];
    uint64_t a,b,c,d,e,f,g,h;

    int t;

    for (t = 0; t < 16; t++)
        w[t] = GET_64BIT_MSB_FIRST(block + 8*t);

    for (t = 16; t < SHA512_ROUNDS; t++)
        w[t] = w[t-16] + w[t-7] + sigma_0(w[t-15]) + sigma_1(w[t-2]);

    a = core[0]; b = core[1]; c = core[2]; d = core[3];
    e = core[4]; f = core[5]; g = core[6]; h = core[7];

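    /*
     * Each iteration of this loop performs eight rounds, rotating
     * the roles of the eight variables in the calls to
     * sha512_sw_round() rather than physically moving the values
     * between them; after eight rounds the role assignment comes
     * back into sync with the variable names.
     */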
    for (t = 0; t < SHA512_ROUNDS; t+=8) {
        sha512_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h);
        sha512_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g);
        sha512_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f);
        sha512_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e);
        sha512_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d);
        sha512_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c);
        sha512_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b);
        sha512_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a);
    }

    core[0] += a; core[1] += b; core[2] += c; core[3] += d;
    core[4] += e; core[5] += f; core[6] += g; core[7] += h;

    smemclr(w, sizeof(w));
}

typedef struct sha512_sw {
    uint64_t core[8];
    sha512_block blk;
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;
} sha512_sw;

static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len);

static ssh_hash *sha512_sw_new(const ssh_hashalg *alg)
{
    sha512_sw *s = snew(sha512_sw);

    s->hash.vt = alg;
    BinarySink_INIT(s, sha512_sw_write);
    BinarySink_DELEGATE_INIT(&s->hash, s);
    return &s->hash;
}

static void sha512_sw_reset(ssh_hash *hash)
{
    sha512_sw *s = container_of(hash, sha512_sw, hash);

    /* The 'extra' field in the ssh_hashalg indicates which
     * initialisation vector we're using */
    memcpy(s->core, hash->vt->extra, sizeof(s->core));
    sha512_block_setup(&s->blk);
}

static void sha512_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    sha512_sw *copy = container_of(hcopy, sha512_sw, hash);
    sha512_sw *orig = container_of(horig, sha512_sw, hash);

    memcpy(copy, orig, sizeof(*copy));
    BinarySink_COPIED(copy);
    BinarySink_DELEGATE_INIT(&copy->hash, copy);
}

static void sha512_sw_free(ssh_hash *hash)
{
    sha512_sw *s = container_of(hash, sha512_sw, hash);

    smemclr(s, sizeof(*s));
    sfree(s);
}

static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len)
{
    sha512_sw *s = BinarySink_DOWNCAST(bs, sha512_sw);

    while (len > 0)
        if (sha512_block_write(&s->blk, &vp, &len))
            sha512_sw_block(s->core, s->blk.block);
}

static void sha512_sw_digest(ssh_hash *hash, uint8_t *digest)
{
    sha512_sw *s = container_of(hash, sha512_sw, hash);

    sha512_block_pad(&s->blk, BinarySink_UPCAST(s));
    for (size_t i = 0; i < hash->vt->hlen / 8; i++)
        PUT_64BIT_MSB_FIRST(digest + 8*i, s->core[i]);
}

const ssh_hashalg ssh_sha512_sw = {
    .new = sha512_sw_new,
    .reset = sha512_sw_reset,
    .copyfrom = sha512_sw_copyfrom,
    .digest = sha512_sw_digest,
    .free = sha512_sw_free,
    .hlen = 64,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-512", "unaccelerated"),
    .extra = sha512_initial_state,
};

const ssh_hashalg ssh_sha384_sw = {
    .new = sha512_sw_new,
    .reset = sha512_sw_reset,
    .copyfrom = sha512_sw_copyfrom,
    .digest = sha512_sw_digest,
    .free = sha512_sw_free,
    .hlen = 48,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-384", "unaccelerated"),
    .extra = sha384_initial_state,
};
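
/*
 * As a quick sanity check, a minimal sketch of verifying this
 * implementation against the FIPS 180-4 "abc" test vector, assuming
 * PuTTY's hash_simple() convenience function and PTRLEN_LITERAL()
 * macro:
 *
 *     unsigned char digest[64];
 *     hash_simple(&ssh_sha512_sw, PTRLEN_LITERAL("abc"), digest);
 *     // digest should begin dd af 35 a1 93 61 7a ba ...
 */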

/* ----------------------------------------------------------------------
 * Hardware-accelerated implementation of SHA-512 using Arm NEON.
 */

#if HW_SHA512 == HW_SHA512_NEON

/*
 * Manually set the target architecture, if we decided above that we
 * need to.
 */
#ifdef USE_CLANG_ATTR_TARGET_AARCH64
/*
 * A spot of cheating: redefine some ACLE feature macros before
 * including arm_neon.h. Otherwise we won't get the SHA intrinsics
 * defined by that header, because it will be looking at the settings
 * for the whole translation unit rather than the ones we're going to
 * put on some particular functions using __attribute__((target)).
 */
#define __ARM_NEON 1
#define __ARM_FEATURE_CRYPTO 1
#define FUNC_ISA __attribute__ ((target("neon,sha3")))
#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */

#ifndef FUNC_ISA
#define FUNC_ISA
#endif

#ifdef USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif

static bool sha512_hw_available(void)
{
    /*
     * For Arm, we delegate to a per-platform detection function (see
     * explanation in sshaes.c).
     */
    return platform_sha512_hw_available();
}

#if defined __clang__
/*
 * As of 2020-12-24, I've found that clang doesn't provide the SHA-512
 * NEON intrinsics. So I define my own set using inline assembler, and
 * use #define to effectively rename them over the top of the standard
 * names.
 *
 * The aim of that #define technique is that it should avoid a build
 * failure if these intrinsics _are_ defined in <arm_neon.h>.
 * Obviously it would be better in that situation to switch back to
 * using the real intrinsics, but until I see a version of clang that
 * supports them, I won't know what version number to test in the
 * ifdef.
 */
static inline FUNC_ISA
uint64x2_t vsha512su0q_u64_asm(uint64x2_t x, uint64x2_t y) {
    __asm__("sha512su0 %0.2D,%1.2D" : "+w" (x) : "w" (y));
    return x;
}
static inline FUNC_ISA
uint64x2_t vsha512su1q_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
    __asm__("sha512su1 %0.2D,%1.2D,%2.2D" : "+w" (x) : "w" (y), "w" (z));
    return x;
}
static inline FUNC_ISA
uint64x2_t vsha512hq_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
    __asm__("sha512h %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z));
    return x;
}
static inline FUNC_ISA
uint64x2_t vsha512h2q_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
    __asm__("sha512h2 %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z));
    return x;
}
#undef vsha512su0q_u64
#define vsha512su0q_u64 vsha512su0q_u64_asm
#undef vsha512su1q_u64
#define vsha512su1q_u64 vsha512su1q_u64_asm
#undef vsha512hq_u64
#define vsha512hq_u64 vsha512hq_u64_asm
#undef vsha512h2q_u64
#define vsha512h2q_u64 vsha512h2q_u64_asm
#endif /* defined __clang__ */

typedef struct sha512_neon_core sha512_neon_core;
struct sha512_neon_core {
    uint64x2_t ab, cd, ef, gh;
};

FUNC_ISA
static inline uint64x2_t sha512_neon_load_input(const uint8_t *p)
{
    return vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(p)));
}

FUNC_ISA
static inline uint64x2_t sha512_neon_schedule_update(
    uint64x2_t m8, uint64x2_t m7, uint64x2_t m4, uint64x2_t m3, uint64x2_t m1)
{
    /*
     * vsha512su0q_u64() takes words from a long way back in the
     * schedule and performs the sigma_0 half of the computation of
     * the next two 64-bit message-schedule words.
     *
     * vsha512su1q_u64() combines the result of that with the sigma_1
     * steps, to output the finished version of those two words. The
     * total amount of input data it requires fits nicely into three
     * 128-bit vector registers, but one of those registers is
     * misaligned compared to the 128-bit chunks that the message
     * schedule is stored in. So we use vextq_u64 to make one of its
     * input words out of the second half of m4 and the first half of
     * m3.
     */
    return vsha512su1q_u64(vsha512su0q_u64(m8, m7), m1, vextq_u64(m4, m3, 1));
}

FUNC_ISA
static inline void sha512_neon_round2(
    unsigned round_index, uint64x2_t schedule_words,
    uint64x2_t *ab, uint64x2_t *cd, uint64x2_t *ef, uint64x2_t *gh)
{
    /*
     * vsha512hq_u64 performs the Sigma_1 and Ch half of the
     * computation of two rounds of SHA-512 (including feeding back
     * one of the outputs from the first of those half-rounds into the
     * second one).
     *
     * vsha512h2q_u64 combines the result of that with the Sigma_0 and
     * Maj steps, and outputs one 128-bit vector that replaces the gh
     * piece of the input hash state, and a second that updates cd by
     * addition.
     *
     * Similarly to vsha512su1q_u64 above, some of the input registers
     * expected by these instructions are misaligned by 64 bits
     * relative to the chunks we've divided the hash state into, so we
     * have to start by making 'de' and 'fg' words out of our input
     * cd,ef,gh, using vextq_u64.
     *
     * Also, one of the inputs to vsha512hq_u64 is expected to contain
     * the results of summing gh + two round constants + two words of
     * message schedule, but the two words of the message schedule
     * have to be the opposite way round in the vector register from
     * the way that vsha512su1q_u64 output them. Hence, there's
     * another vextq_u64 in here that swaps the two halves of the
     * initial_sum vector register.
     *
     * (This also means that I don't have to prepare a specially
     * reordered version of the sha512_round_constants[] array: as
     * long as I'm unavoidably doing a swap at run time _anyway_, I
     * can load from the normally ordered version of that array, and
     * just take care to fold in that data _before_ the swap rather
     * than after.)
     */

    /* Load two round constants, with the first one in the low half */
    uint64x2_t round_constants = vld1q_u64(
        sha512_round_constants + round_index);

    /* Add schedule words to round constants */
    uint64x2_t initial_sum = vaddq_u64(schedule_words, round_constants);

    /* Swap that sum around so the word used in the first of the two
     * rounds is in the _high_ half of the vector, matching where h
     * lives in the gh vector */
    uint64x2_t swapped_initial_sum = vextq_u64(initial_sum, initial_sum, 1);

    /* Add gh to that, now that they're matching ways round */
    uint64x2_t sum = vaddq_u64(swapped_initial_sum, *gh);

    /* Make the misaligned de and fg words */
    uint64x2_t de = vextq_u64(*cd, *ef, 1);
    uint64x2_t fg = vextq_u64(*ef, *gh, 1);

    /* Now we're ready to put all the pieces together. The output from
     * vsha512h2q_u64 can be used directly as the new gh, and the
     * output from vsha512hq_u64 is simultaneously the intermediate
     * value passed to h2 and the thing you have to add on to cd. */
    uint64x2_t intermed = vsha512hq_u64(sum, fg, de);
    *gh = vsha512h2q_u64(intermed, *cd, *ab);
    *cd = vaddq_u64(*cd, intermed);
}

FUNC_ISA
static inline void sha512_neon_block(sha512_neon_core *core, const uint8_t *p)
{
    uint64x2_t s0, s1, s2, s3, s4, s5, s6, s7;

    uint64x2_t ab = core->ab, cd = core->cd, ef = core->ef, gh = core->gh;

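    /*
     * The first 16 rounds consume the sixteen 64-bit words of the
     * input block directly; every subsequent pair of rounds first
     * extends the message schedule with sha512_neon_schedule_update().
     * The four state vectors rotate through the argument positions of
     * sha512_neon_round2() in the same way that the eight scalar
     * variables rotate in the software implementation above.
     */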
    s0 = sha512_neon_load_input(p + 16*0);
    sha512_neon_round2(0, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_load_input(p + 16*1);
    sha512_neon_round2(2, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_load_input(p + 16*2);
    sha512_neon_round2(4, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_load_input(p + 16*3);
    sha512_neon_round2(6, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_load_input(p + 16*4);
    sha512_neon_round2(8, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_load_input(p + 16*5);
    sha512_neon_round2(10, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_load_input(p + 16*6);
    sha512_neon_round2(12, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_load_input(p + 16*7);
    sha512_neon_round2(14, s7, &cd, &ef, &gh, &ab);
    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
    sha512_neon_round2(16, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
    sha512_neon_round2(18, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
    sha512_neon_round2(20, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
    sha512_neon_round2(22, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
    sha512_neon_round2(24, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
    sha512_neon_round2(26, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
    sha512_neon_round2(28, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
    sha512_neon_round2(30, s7, &cd, &ef, &gh, &ab);
    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
    sha512_neon_round2(32, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
    sha512_neon_round2(34, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
    sha512_neon_round2(36, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
    sha512_neon_round2(38, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
    sha512_neon_round2(40, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
    sha512_neon_round2(42, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
    sha512_neon_round2(44, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
    sha512_neon_round2(46, s7, &cd, &ef, &gh, &ab);
    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
    sha512_neon_round2(48, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
    sha512_neon_round2(50, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
    sha512_neon_round2(52, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
    sha512_neon_round2(54, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
    sha512_neon_round2(56, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
    sha512_neon_round2(58, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
    sha512_neon_round2(60, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
    sha512_neon_round2(62, s7, &cd, &ef, &gh, &ab);
    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
    sha512_neon_round2(64, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
    sha512_neon_round2(66, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
    sha512_neon_round2(68, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
    sha512_neon_round2(70, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
    sha512_neon_round2(72, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
    sha512_neon_round2(74, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
    sha512_neon_round2(76, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
    sha512_neon_round2(78, s7, &cd, &ef, &gh, &ab);

    core->ab = vaddq_u64(core->ab, ab);
    core->cd = vaddq_u64(core->cd, cd);
    core->ef = vaddq_u64(core->ef, ef);
    core->gh = vaddq_u64(core->gh, gh);
}

typedef struct sha512_neon {
    sha512_neon_core core;
    sha512_block blk;
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;
} sha512_neon;

static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len);

static ssh_hash *sha512_neon_new(const ssh_hashalg *alg)
{
    if (!sha512_hw_available_cached())
        return NULL;

    sha512_neon *s = snew(sha512_neon);

    s->hash.vt = alg;
    BinarySink_INIT(s, sha512_neon_write);
    BinarySink_DELEGATE_INIT(&s->hash, s);
    return &s->hash;
}

static void sha512_neon_reset(ssh_hash *hash)
{
    sha512_neon *s = container_of(hash, sha512_neon, hash);
    const uint64_t *iv = (const uint64_t *)hash->vt->extra;

    s->core.ab = vld1q_u64(iv);
    s->core.cd = vld1q_u64(iv+2);
    s->core.ef = vld1q_u64(iv+4);
    s->core.gh = vld1q_u64(iv+6);

    sha512_block_setup(&s->blk);
}

static void sha512_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    sha512_neon *copy = container_of(hcopy, sha512_neon, hash);
    sha512_neon *orig = container_of(horig, sha512_neon, hash);

    *copy = *orig; /* structure copy */

    BinarySink_COPIED(copy);
    BinarySink_DELEGATE_INIT(&copy->hash, copy);
}

static void sha512_neon_free(ssh_hash *hash)
{
    sha512_neon *s = container_of(hash, sha512_neon, hash);
    smemclr(s, sizeof(*s));
    sfree(s);
}

static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len)
{
    sha512_neon *s = BinarySink_DOWNCAST(bs, sha512_neon);

    while (len > 0)
        if (sha512_block_write(&s->blk, &vp, &len))
            sha512_neon_block(&s->core, s->blk.block);
}

static void sha512_neon_digest(ssh_hash *hash, uint8_t *digest)
{
    sha512_neon *s = container_of(hash, sha512_neon, hash);

    sha512_block_pad(&s->blk, BinarySink_UPCAST(s));

    vst1q_u8(digest,    vrev64q_u8(vreinterpretq_u8_u64(s->core.ab)));
    vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd)));
    vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef)));
    vst1q_u8(digest+48, vrev64q_u8(vreinterpretq_u8_u64(s->core.gh)));
}

static void sha384_neon_digest(ssh_hash *hash, uint8_t *digest)
{
    sha512_neon *s = container_of(hash, sha512_neon, hash);

    sha512_block_pad(&s->blk, BinarySink_UPCAST(s));

    vst1q_u8(digest,    vrev64q_u8(vreinterpretq_u8_u64(s->core.ab)));
    vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd)));
    vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef)));
}

const ssh_hashalg ssh_sha512_hw = {
    .new = sha512_neon_new,
    .reset = sha512_neon_reset,
    .copyfrom = sha512_neon_copyfrom,
    .digest = sha512_neon_digest,
    .free = sha512_neon_free,
    .hlen = 64,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-512", "NEON accelerated"),
    .extra = sha512_initial_state,
};

const ssh_hashalg ssh_sha384_hw = {
    .new = sha512_neon_new,
    .reset = sha512_neon_reset,
    .copyfrom = sha512_neon_copyfrom,
    .digest = sha384_neon_digest,
    .free = sha512_neon_free,
    .hlen = 48,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-384", "NEON accelerated"),
    .extra = sha384_initial_state,
};

/* ----------------------------------------------------------------------
 * Stub functions if we have no hardware-accelerated SHA-512. In this
 * case, sha512_stub_new returns NULL (though it should also never be
 * selected by sha512_select, so the only thing that should even be
 * _able_ to call it is testcrypt). As a result, the remaining vtable
 * functions should never be called at all.
 */

#elif HW_SHA512 == HW_SHA512_NONE

static bool sha512_hw_available(void)
{
    return false;
}

static ssh_hash *sha512_stub_new(const ssh_hashalg *alg)
{
    return NULL;
}

#define STUB_BODY { unreachable("Should never be called"); }

static void sha512_stub_reset(ssh_hash *hash) STUB_BODY
static void sha512_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY
static void sha512_stub_free(ssh_hash *hash) STUB_BODY
static void sha512_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY

const ssh_hashalg ssh_sha512_hw = {
    .new = sha512_stub_new,
    .reset = sha512_stub_reset,
    .copyfrom = sha512_stub_copyfrom,
    .digest = sha512_stub_digest,
    .free = sha512_stub_free,
    .hlen = 64,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-512", "!NONEXISTENT ACCELERATED VERSION!"),
};

const ssh_hashalg ssh_sha384_hw = {
    .new = sha512_stub_new,
    .reset = sha512_stub_reset,
    .copyfrom = sha512_stub_copyfrom,
    .digest = sha512_stub_digest,
    .free = sha512_stub_free,
    .hlen = 48,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-384", "!NONEXISTENT ACCELERATED VERSION!"),
};

#endif /* HW_SHA512 */