1 /*
2 * SHA-512 algorithm as described at
3 *
4 * http://csrc.nist.gov/cryptval/shs.html
5 *
6 * Modifications made for SHA-384 also
7 */
8
9 #include <assert.h>
10 #include "ssh.h"
11
12 /*
13 * Start by deciding whether we can support hardware SHA at all.
14 */
15 #define HW_SHA512_NONE 0
16 #define HW_SHA512_NEON 1
17
18 #ifdef _FORCE_SHA512_NEON
19 # define HW_SHA512 HW_SHA512_NEON
20 #elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
21 /* Arm can potentially support both endiannesses, but this code
22 * hasn't been tested on anything but little. If anyone wants to
23 * run big-endian, they'll need to fix it first. */
24 #elif defined __ARM_FEATURE_SHA512
25 /* If the Arm SHA-512 extension is available already, we can
26 * support NEON SHA without having to enable anything by hand */
27 # define HW_SHA512 HW_SHA512_NEON
28 #elif defined(__clang__)
29 # if __has_attribute(target) && __has_include(<arm_neon.h>) && \
30 (defined(__aarch64__))
31 /* clang can enable the crypto extension in AArch64 using
32 * __attribute__((target)) */
33 # define HW_SHA512 HW_SHA512_NEON
34 # define USE_CLANG_ATTR_TARGET_AARCH64
35 # endif
36 #endif
37
38 #if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA512
39 # undef HW_SHA512
40 # define HW_SHA512 HW_SHA512_NONE
41 #endif
42
43 /*
44 * The actual query function that asks if hardware acceleration is
45 * available.
46 */
47 static bool sha512_hw_available(void);
48
49 /*
50 * The top-level selection function, caching the results of
51 * sha512_hw_available() so it only has to run once.
52 */
/*
 * Wrapper around sha512_hw_available() that performs the (possibly
 * expensive) hardware probe only once, on first call, and replays the
 * remembered answer on every later call.
 */
static bool sha512_hw_available_cached(void)
{
    static bool known = false;
    static bool cached_answer;

    if (!known) {
        cached_answer = sha512_hw_available();
        known = true;
    }

    return cached_answer;
}
63
/* Pair of implementation vtables for one hash (hardware-accelerated
 * and software fallback), hung off the selector vtable's 'extra'
 * pointer so sha512_select() can pick between them at run time. */
struct sha512_select_options {
    const ssh_hashalg *hw, *sw;
};
67
sha512_select(const ssh_hashalg * alg)68 static ssh_hash *sha512_select(const ssh_hashalg *alg)
69 {
70 const struct sha512_select_options *options =
71 (const struct sha512_select_options *)alg->extra;
72
73 const ssh_hashalg *real_alg =
74 sha512_hw_available_cached() ? options->hw : options->sw;
75
76 return ssh_hash_new(real_alg);
77 }
78
/* The hw/sw implementation pairs for SHA-512 and SHA-384, referenced
 * by the selector vtables below via their 'extra' fields. */
const struct sha512_select_options ssh_sha512_select_options = {
    &ssh_sha512_hw, &ssh_sha512_sw,
};
const struct sha512_select_options ssh_sha384_select_options = {
    &ssh_sha384_hw, &ssh_sha384_sw,
};
85
/* Public selector vtable for SHA-512: only 'new' is filled in, because
 * sha512_select() immediately delegates to a real implementation and
 * no object of this vtable itself is ever constructed. */
const ssh_hashalg ssh_sha512 = {
    .new = sha512_select,
    .hlen = 64,             /* 512-bit digest */
    .blocklen = 128,        /* 1024-bit message block */
    HASHALG_NAMES_ANNOTATED("SHA-512", "dummy selector vtable"),
    .extra = &ssh_sha512_select_options,
};

/* Same selector arrangement for SHA-384, which shares the SHA-512
 * machinery but truncates the digest to 48 bytes. */
const ssh_hashalg ssh_sha384 = {
    .new = sha512_select,
    .hlen = 48,             /* 384-bit digest */
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-384", "dummy selector vtable"),
    .extra = &ssh_sha384_select_options,
};
101
102 /* ----------------------------------------------------------------------
103 * Definitions likely to be helpful to multiple implementations.
104 */
105
/* Initial hash state (the eight 64-bit words a..h) for SHA-512. */
static const uint64_t sha512_initial_state[] = {
    0x6a09e667f3bcc908ULL,
    0xbb67ae8584caa73bULL,
    0x3c6ef372fe94f82bULL,
    0xa54ff53a5f1d36f1ULL,
    0x510e527fade682d1ULL,
    0x9b05688c2b3e6c1fULL,
    0x1f83d9abfb41bd6bULL,
    0x5be0cd19137e2179ULL,
};

/* Initial hash state for SHA-384, which runs the same compression
 * function but starts from a different IV and truncates the output. */
static const uint64_t sha384_initial_state[] = {
    0xcbbb9d5dc1059ed8ULL,
    0x629a292a367cd507ULL,
    0x9159015a3070dd17ULL,
    0x152fecd8f70e5939ULL,
    0x67332667ffc00b31ULL,
    0x8eb44a8768581511ULL,
    0xdb0c2e0d64f98fa7ULL,
    0x47b5481dbefa4fa4ULL,
};
127
/* The 80 per-round additive constants K_t used by the SHA-512 (and
 * SHA-384) compression function, one per round, in round order. */
static const uint64_t sha512_round_constants[] = {
    0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
    0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
    0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
    0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
    0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
    0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
    0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
    0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
    0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
    0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
    0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
    0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
    0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
    0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
    0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
    0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
    0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
    0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
    0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
    0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
    0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
    0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
    0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
    0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
    0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
    0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
    0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
    0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
    0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
    0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
    0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
    0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
    0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
    0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
    0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
    0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
    0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
    0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
    0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
    0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
};
170
171 #define SHA512_ROUNDS 80
172
typedef struct sha512_block sha512_block;
/* One 128-byte message block in the process of being assembled, plus
 * the 128-bit running count of message bits seen so far (which SHA-512
 * padding must append to the final block). */
struct sha512_block {
    uint8_t block[128];   /* partially filled input block */
    size_t used;          /* how many bytes of block[] are valid */
    uint64_t lenhi, lenlo; /* 128-bit total message length, in bits */
};
179
/* Reset a block-assembly buffer to the empty state, ready to begin a
 * fresh hash computation. */
static inline void sha512_block_setup(sha512_block *blk)
{
    blk->lenhi = 0;
    blk->lenlo = 0;
    blk->used = 0;
}
185
/*
 * Feed data into the block buffer. Consumes as much of (*vdata, *len)
 * as fits in the current block, advancing both. Returns true exactly
 * when a complete 128-byte block has been filled (and resets 'used'
 * so the caller can process blk->block and continue).
 */
static inline bool sha512_block_write(
    sha512_block *blk, const void **vdata, size_t *len)
{
    size_t space = sizeof(blk->block) - blk->used;
    size_t take = (*len < space) ? *len : space;
    const uint8_t *src = *vdata;

    memcpy(blk->block + blk->used, src, take);
    blk->used += take;
    *vdata = src + take;
    *len -= take;

    /* Maintain the 128-bit bit count, with carry into the high word. */
    uint64_t bits_taken = (uint64_t)take << 3;
    blk->lenlo += bits_taken;
    if (blk->lenlo < bits_taken)
        blk->lenhi++;

    bool block_full = (blk->used == sizeof(blk->block));
    if (block_full)
        blk->used = 0;

    return block_full;
}
210
/* Append the SHA-512 final padding: a 0x80 byte, enough zero bytes to
 * leave room for the length field, then the 128-bit message length in
 * bits, so that the data ends exactly on a block boundary. The padding
 * is pushed back through the BinarySink so it flows through the normal
 * block-processing path. */
static inline void sha512_block_pad(sha512_block *blk, BinarySink *bs)
{
    /* Snapshot the true message length first: the padding writes below
     * would otherwise inflate the counters. */
    uint64_t final_lenhi = blk->lenhi;
    uint64_t final_lenlo = blk->lenlo;
    /* used + 1 (the 0x80) + pad + 16 (length) must be 0 mod 128,
     * hence pad = (111 - used) mod 128. */
    size_t pad = 127 & (111 - blk->used);

    put_byte(bs, 0x80);
    put_padding(bs, pad, 0);
    put_uint64(bs, final_lenhi);
    put_uint64(bs, final_lenlo);

    assert(blk->used == 0 && "Should have exactly hit a block boundary");
}
224
225 /* ----------------------------------------------------------------------
226 * Software implementation of SHA-512.
227 */
228
/* Rotate a 64-bit word right by y bits. Both shift counts are masked
 * into [0,63], so the expression is well defined for every y
 * (including multiples of 64, which rotate by zero). */
static inline uint64_t ror(uint64_t x, unsigned y)
{
    unsigned right = y & 63;
    unsigned left = -y & 63;
    return (x >> right) | (x << left);
}
233
/* The SHA 'choose' function: each output bit is taken from if1 where
 * the corresponding bit of ctrl is 1, and from if0 where it is 0. */
static inline uint64_t Ch(uint64_t ctrl, uint64_t if1, uint64_t if0)
{
    return (ctrl & if1) | (~ctrl & if0);
}
238
/* The SHA 'majority' function: each output bit is 1 iff at least two
 * of the corresponding bits of x, y, z are 1. */
static inline uint64_t Maj(uint64_t x, uint64_t y, uint64_t z)
{
    return (x & y) ^ (x & z) ^ (y & z);
}
243
/* Big Sigma_0: XOR of three rotations, applied to 'a' in each round. */
static inline uint64_t Sigma_0(uint64_t x)
{
    uint64_t r = ror(x, 28);
    r ^= ror(x, 34);
    r ^= ror(x, 39);
    return r;
}
248
/* Big Sigma_1: XOR of three rotations, applied to 'e' in each round. */
static inline uint64_t Sigma_1(uint64_t x)
{
    uint64_t r = ror(x, 14);
    r ^= ror(x, 18);
    r ^= ror(x, 41);
    return r;
}
253
/* Small sigma_0: two rotations and a plain shift, used in expanding
 * the message schedule. */
static inline uint64_t sigma_0(uint64_t x)
{
    uint64_t r = x >> 7;
    r ^= ror(x, 1);
    r ^= ror(x, 8);
    return r;
}
258
/* Small sigma_1: two rotations and a plain shift, used in expanding
 * the message schedule. */
static inline uint64_t sigma_1(uint64_t x)
{
    uint64_t r = x >> 6;
    r ^= ror(x, 19);
    r ^= ror(x, 61);
    return r;
}
263
sha512_sw_round(unsigned round_index,const uint64_t * schedule,uint64_t * a,uint64_t * b,uint64_t * c,uint64_t * d,uint64_t * e,uint64_t * f,uint64_t * g,uint64_t * h)264 static inline void sha512_sw_round(
265 unsigned round_index, const uint64_t *schedule,
266 uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d,
267 uint64_t *e, uint64_t *f, uint64_t *g, uint64_t *h)
268 {
269 uint64_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) +
270 sha512_round_constants[round_index] + schedule[round_index];
271
272 uint64_t t2 = Sigma_0(*a) + Maj(*a,*b,*c);
273
274 *d += t1;
275 *h = t1 + t2;
276 }
277
/* Compress one 128-byte input block into the eight-word hash state
 * 'core', per the SHA-512 compression function. */
static void sha512_sw_block(uint64_t *core, const uint8_t *block)
{
    uint64_t w[SHA512_ROUNDS];
    uint64_t a,b,c,d,e,f,g,h;

    int t;

    /* Load the 16 big-endian 64-bit input words... */
    for (t = 0; t < 16; t++)
        w[t] = GET_64BIT_MSB_FIRST(block + 8*t);

    /* ...and expand them into the full 80-word message schedule. */
    for (t = 16; t < SHA512_ROUNDS; t++)
        w[t] = w[t-16] + w[t-7] + sigma_0(w[t-15]) + sigma_1(w[t-2]);

    a = core[0]; b = core[1]; c = core[2]; d = core[3];
    e = core[4]; f = core[5]; g = core[6]; h = core[7];

    /* 80 rounds, eight at a time: rotating the argument order at each
     * call replaces the textbook shuffle of a..h between rounds. */
    for (t = 0; t < SHA512_ROUNDS; t+=8) {
        sha512_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h);
        sha512_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g);
        sha512_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f);
        sha512_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e);
        sha512_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d);
        sha512_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c);
        sha512_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b);
        sha512_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a);
    }

    /* Feed-forward: add the round output back into the chaining state. */
    core[0] += a; core[1] += b; core[2] += c; core[3] += d;
    core[4] += e; core[5] += f; core[6] += g; core[7] += h;

    /* Don't leave expanded message-schedule words lying around. */
    smemclr(w, sizeof(w));
}
310
/* Complete state of a software SHA-512/384 computation: the chaining
 * state, the partial input block, and the BinarySink/ssh_hash
 * boilerplate that lets callers stream data in. */
typedef struct sha512_sw {
    uint64_t core[8];       /* chaining state a..h */
    sha512_block blk;       /* partial block + bit count */
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;
} sha512_sw;
317
318 static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len);
319
sha512_sw_new(const ssh_hashalg * alg)320 static ssh_hash *sha512_sw_new(const ssh_hashalg *alg)
321 {
322 sha512_sw *s = snew(sha512_sw);
323
324 s->hash.vt = alg;
325 BinarySink_INIT(s, sha512_sw_write);
326 BinarySink_DELEGATE_INIT(&s->hash, s);
327 return &s->hash;
328 }
329
/*
 * Begin a new hash computation: load the initialisation vector and
 * empty the block buffer. The vtable's 'extra' field points at the
 * IV, which is how SHA-512 and SHA-384 share this code.
 */
static void sha512_sw_reset(ssh_hash *hash)
{
    sha512_sw *ctx = container_of(hash, sha512_sw, hash);

    memcpy(ctx->core, hash->vt->extra, sizeof(ctx->core));
    sha512_block_setup(&ctx->blk);
}
339
/* Clone the state of one software hash object into another, so that a
 * caller can fork a computation mid-stream. After the raw structure
 * copy, the BinarySink delegation pointers must be re-aimed at the
 * copy rather than left pointing into the original. */
static void sha512_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    sha512_sw *copy = container_of(hcopy, sha512_sw, hash);
    sha512_sw *orig = container_of(horig, sha512_sw, hash);

    memcpy(copy, orig, sizeof(*copy));
    BinarySink_COPIED(copy);
    /* Fixed: '&copy' had been mangled into the HTML entity '©'. */
    BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
349
/*
 * Destroy a software hash object, wiping its state first in case it
 * contained secret material.
 */
static void sha512_sw_free(ssh_hash *hash)
{
    sha512_sw *ctx = container_of(hash, sha512_sw, hash);

    smemclr(ctx, sizeof(*ctx));
    sfree(ctx);
}
357
/*
 * BinarySink write method: buffer incoming data, running the
 * compression function every time a full 128-byte block accumulates.
 */
static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len)
{
    sha512_sw *ctx = BinarySink_DOWNCAST(bs, sha512_sw);

    while (len > 0) {
        bool block_ready = sha512_block_write(&ctx->blk, &vp, &len);
        if (block_ready)
            sha512_sw_block(ctx->core, ctx->blk.block);
    }
}
366
/*
 * Finalise the hash: apply the standard padding, then emit the state
 * words big-endian. The vtable's hlen (64 for SHA-512, 48 for
 * SHA-384) controls how many words are written out.
 */
static void sha512_sw_digest(ssh_hash *hash, uint8_t *digest)
{
    sha512_sw *ctx = container_of(hash, sha512_sw, hash);
    size_t nwords = hash->vt->hlen / 8;

    sha512_block_pad(&ctx->blk, BinarySink_UPCAST(ctx));
    for (size_t i = 0; i < nwords; i++)
        PUT_64BIT_MSB_FIRST(digest + 8*i, ctx->core[i]);
}
375
/* Vtable for the pure-software SHA-512; 'extra' supplies the IV. */
const ssh_hashalg ssh_sha512_sw = {
    .new = sha512_sw_new,
    .reset = sha512_sw_reset,
    .copyfrom = sha512_sw_copyfrom,
    .digest = sha512_sw_digest,
    .free = sha512_sw_free,
    .hlen = 64,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-512", "unaccelerated"),
    .extra = sha512_initial_state,
};

/* Software SHA-384: identical machinery, different IV, shorter digest. */
const ssh_hashalg ssh_sha384_sw = {
    .new = sha512_sw_new,
    .reset = sha512_sw_reset,
    .copyfrom = sha512_sw_copyfrom,
    .digest = sha512_sw_digest,
    .free = sha512_sw_free,
    .hlen = 48,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-384", "unaccelerated"),
    .extra = sha384_initial_state,
};
399
400 /* ----------------------------------------------------------------------
401 * Hardware-accelerated implementation of SHA-512 using Arm NEON.
402 */
403
404 #if HW_SHA512 == HW_SHA512_NEON
405
406 /*
407 * Manually set the target architecture, if we decided above that we
408 * need to.
409 */
410 #ifdef USE_CLANG_ATTR_TARGET_AARCH64
411 /*
412 * A spot of cheating: redefine some ACLE feature macros before
413 * including arm_neon.h. Otherwise we won't get the SHA intrinsics
414 * defined by that header, because it will be looking at the settings
415 * for the whole translation unit rather than the ones we're going to
416 * put on some particular functions using __attribute__((target)).
417 */
418 #define __ARM_NEON 1
419 #define __ARM_FEATURE_CRYPTO 1
420 #define FUNC_ISA __attribute__ ((target("neon,sha3")))
421 #endif /* USE_CLANG_ATTR_TARGET_AARCH64 */
422
423 #ifndef FUNC_ISA
424 #define FUNC_ISA
425 #endif
426
427 #ifdef USE_ARM64_NEON_H
428 #include <arm64_neon.h>
429 #else
430 #include <arm_neon.h>
431 #endif
432
/* Runtime test for the Arm SHA-512 instructions (compile-time support
 * alone is not enough: the running CPU must also have the extension). */
static bool sha512_hw_available(void)
{
    /*
     * For Arm, we delegate to a per-platform detection function (see
     * explanation in sshaes.c).
     */
    return platform_sha512_hw_available();
}
441
442 #if defined __clang__
443 /*
444 * As of 2020-12-24, I've found that clang doesn't provide the SHA-512
445 * NEON intrinsics. So I define my own set using inline assembler, and
446 * use #define to effectively rename them over the top of the standard
447 * names.
448 *
449 * The aim of that #define technique is that it should avoid a build
450 * failure if these intrinsics _are_ defined in <arm_neon.h>.
451 * Obviously it would be better in that situation to switch back to
452 * using the real intrinsics, but until I see a version of clang that
453 * supports them, I won't know what version number to test in the
454 * ifdef.
455 */
/* Inline-assembler stand-ins for the four SHA-512 NEON intrinsics.
 * Each wrapper issues the corresponding instruction directly; the
 * "+w" constraint makes x both input and output in a vector register. */
static inline FUNC_ISA
uint64x2_t vsha512su0q_u64_asm(uint64x2_t x, uint64x2_t y) {
    __asm__("sha512su0 %0.2D,%1.2D" : "+w" (x) : "w" (y));
    return x;
}
static inline FUNC_ISA
uint64x2_t vsha512su1q_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
    __asm__("sha512su1 %0.2D,%1.2D,%2.2D" : "+w" (x) : "w" (y), "w" (z));
    return x;
}
static inline FUNC_ISA
uint64x2_t vsha512hq_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
    __asm__("sha512h %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z));
    return x;
}
static inline FUNC_ISA
uint64x2_t vsha512h2q_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
    __asm__("sha512h2 %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z));
    return x;
}
476 #undef vsha512su0q_u64
477 #define vsha512su0q_u64 vsha512su0q_u64_asm
478 #undef vsha512su1q_u64
479 #define vsha512su1q_u64 vsha512su1q_u64_asm
480 #undef vsha512hq_u64
481 #define vsha512hq_u64 vsha512hq_u64_asm
482 #undef vsha512h2q_u64
483 #define vsha512h2q_u64 vsha512h2q_u64_asm
484 #endif /* defined __clang__ */
485
typedef struct sha512_neon_core sha512_neon_core;
/* The eight-word hash state held as four 128-bit NEON vectors, two
 * 64-bit state words per vector: (a,b), (c,d), (e,f), (g,h). */
struct sha512_neon_core {
    uint64x2_t ab, cd, ef, gh;
};
490
FUNC_ISA
/* Load 16 bytes of message and byte-reverse each 64-bit lane, turning
 * the big-endian wire format into host-order schedule words. */
static inline uint64x2_t sha512_neon_load_input(const uint8_t *p)
{
    return vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(p)));
}
496
FUNC_ISA
/* Compute the next two 64-bit words of the message schedule from five
 * earlier schedule vectors (named by how far back they sit). */
static inline uint64x2_t sha512_neon_schedule_update(
    uint64x2_t m8, uint64x2_t m7, uint64x2_t m4, uint64x2_t m3, uint64x2_t m1)
{
    /*
     * vsha512su0q_u64() takes words from a long way back in the
     * schedule and performs the sigma_0 half of the computation of
     * the next two 64-bit message-schedule words.
     *
     * vsha512su1q_u64() combines the result of that with the sigma_1
     * steps, to output the finished version of those two words. The
     * total amount of input data it requires fits nicely into three
     * 128-bit vector registers, but one of those registers is
     * misaligned compared to the 128-bit chunks that the message
     * schedule is stored in. So we use vextq_u64 to make one of its
     * input words out of the second half of m4 and the first half of
     * m3.
     */
    return vsha512su1q_u64(vsha512su0q_u64(m8, m7), m1, vextq_u64(m4, m3, 1));
}
517
FUNC_ISA
/* Perform two consecutive SHA-512 rounds on the vectorised hash state.
 * Callers rotate which state vector is passed in which slot, as the
 * software implementation does with its scalar variables. */
static inline void sha512_neon_round2(
    unsigned round_index, uint64x2_t schedule_words,
    uint64x2_t *ab, uint64x2_t *cd, uint64x2_t *ef, uint64x2_t *gh)
{
    /*
     * vsha512hq_u64 performs the Sigma_1 and Ch half of the
     * computation of two rounds of SHA-512 (including feeding back
     * one of the outputs from the first of those half-rounds into the
     * second one).
     *
     * vsha512h2q_u64 combines the result of that with the Sigma_0 and
     * Maj steps, and outputs one 128-bit vector that replaces the gh
     * piece of the input hash state, and a second that updates cd by
     * addition.
     *
     * Similarly to vsha512su1q_u64 above, some of the input registers
     * expected by these instructions are misaligned by 64 bits
     * relative to the chunks we've divided the hash state into, so we
     * have to start by making 'de' and 'fg' words out of our input
     * cd,ef,gh, using vextq_u64.
     *
     * Also, one of the inputs to vsha512hq_u64 is expected to contain
     * the results of summing gh + two round constants + two words of
     * message schedule, but the two words of the message schedule
     * have to be the opposite way round in the vector register from
     * the way that vsha512su1q_u64 output them. Hence, there's
     * another vextq_u64 in here that swaps the two halves of the
     * initial_sum vector register.
     *
     * (This also means that I don't have to prepare a specially
     * reordered version of the sha512_round_constants[] array: as
     * long as I'm unavoidably doing a swap at run time _anyway_, I
     * can load from the normally ordered version of that array, and
     * just take care to fold in that data _before_ the swap rather
     * than after.)
     */

    /* Load two round constants, with the first one in the low half */
    uint64x2_t round_constants = vld1q_u64(
        sha512_round_constants + round_index);

    /* Add schedule words to round constants */
    uint64x2_t initial_sum = vaddq_u64(schedule_words, round_constants);

    /* Swap that sum around so the word used in the first of the two
     * rounds is in the _high_ half of the vector, matching where h
     * lives in the gh vector */
    uint64x2_t swapped_initial_sum = vextq_u64(initial_sum, initial_sum, 1);

    /* Add gh to that, now that they're matching ways round */
    uint64x2_t sum = vaddq_u64(swapped_initial_sum, *gh);

    /* Make the misaligned de and fg words */
    uint64x2_t de = vextq_u64(*cd, *ef, 1);
    uint64x2_t fg = vextq_u64(*ef, *gh, 1);

    /* Now we're ready to put all the pieces together. The output from
     * vsha512h2q_u64 can be used directly as the new gh, and the
     * output from vsha512hq_u64 is simultaneously the intermediate
     * value passed to h2 and the thing you have to add on to cd. */
    uint64x2_t intermed = vsha512hq_u64(sum, fg, de);
    *gh = vsha512h2q_u64(intermed, *cd, *ab);
    *cd = vaddq_u64(*cd, intermed);
}
583
FUNC_ISA
/* Compress one 128-byte block into the NEON hash state: 80 rounds,
 * fully unrolled two rounds at a time, interleaving message-schedule
 * expansion with the round computation. The rotation of the four
 * state vectors through the argument slots of sha512_neon_round2
 * mirrors the scalar implementation's rotation of a..h. */
static inline void sha512_neon_block(sha512_neon_core *core, const uint8_t *p)
{
    uint64x2_t s0, s1, s2, s3, s4, s5, s6, s7;

    uint64x2_t ab = core->ab, cd = core->cd, ef = core->ef, gh = core->gh;

    /* Rounds 0-15: consume the sixteen input words directly. */
    s0 = sha512_neon_load_input(p + 16*0);
    sha512_neon_round2(0, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_load_input(p + 16*1);
    sha512_neon_round2(2, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_load_input(p + 16*2);
    sha512_neon_round2(4, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_load_input(p + 16*3);
    sha512_neon_round2(6, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_load_input(p + 16*4);
    sha512_neon_round2(8, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_load_input(p + 16*5);
    sha512_neon_round2(10, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_load_input(p + 16*6);
    sha512_neon_round2(12, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_load_input(p + 16*7);
    sha512_neon_round2(14, s7, &cd, &ef, &gh, &ab);
    /* Rounds 16-79: expand the schedule two words at a time, reusing
     * s0..s7 as a sliding window over the last sixteen words. */
    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
    sha512_neon_round2(16, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
    sha512_neon_round2(18, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
    sha512_neon_round2(20, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
    sha512_neon_round2(22, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
    sha512_neon_round2(24, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
    sha512_neon_round2(26, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
    sha512_neon_round2(28, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
    sha512_neon_round2(30, s7, &cd, &ef, &gh, &ab);
    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
    sha512_neon_round2(32, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
    sha512_neon_round2(34, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
    sha512_neon_round2(36, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
    sha512_neon_round2(38, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
    sha512_neon_round2(40, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
    sha512_neon_round2(42, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
    sha512_neon_round2(44, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
    sha512_neon_round2(46, s7, &cd, &ef, &gh, &ab);
    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
    sha512_neon_round2(48, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
    sha512_neon_round2(50, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
    sha512_neon_round2(52, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
    sha512_neon_round2(54, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
    sha512_neon_round2(56, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
    sha512_neon_round2(58, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
    sha512_neon_round2(60, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
    sha512_neon_round2(62, s7, &cd, &ef, &gh, &ab);
    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
    sha512_neon_round2(64, s0, &ab, &cd, &ef, &gh);
    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
    sha512_neon_round2(66, s1, &gh, &ab, &cd, &ef);
    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
    sha512_neon_round2(68, s2, &ef, &gh, &ab, &cd);
    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
    sha512_neon_round2(70, s3, &cd, &ef, &gh, &ab);
    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
    sha512_neon_round2(72, s4, &ab, &cd, &ef, &gh);
    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
    sha512_neon_round2(74, s5, &gh, &ab, &cd, &ef);
    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
    sha512_neon_round2(76, s6, &ef, &gh, &ab, &cd);
    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
    sha512_neon_round2(78, s7, &cd, &ef, &gh, &ab);

    /* Feed-forward into the chaining state. */
    core->ab = vaddq_u64(core->ab, ab);
    core->cd = vaddq_u64(core->cd, cd);
    core->ef = vaddq_u64(core->ef, ef);
    core->gh = vaddq_u64(core->gh, gh);
}
677
/* Complete state of a NEON-accelerated SHA-512/384 computation,
 * mirroring sha512_sw but with the vectorised chaining state. */
typedef struct sha512_neon {
    sha512_neon_core core;  /* chaining state as NEON vectors */
    sha512_block blk;       /* partial block + bit count */
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;
} sha512_neon;
684
685 static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len);
686
sha512_neon_new(const ssh_hashalg * alg)687 static ssh_hash *sha512_neon_new(const ssh_hashalg *alg)
688 {
689 if (!sha512_hw_available_cached())
690 return NULL;
691
692 sha512_neon *s = snew(sha512_neon);
693
694 s->hash.vt = alg;
695 BinarySink_INIT(s, sha512_neon_write);
696 BinarySink_DELEGATE_INIT(&s->hash, s);
697 return &s->hash;
698 }
699
/*
 * Begin a new hash computation: load the vtable-selected IV into the
 * four state vectors, two 64-bit words per load, and empty the block
 * buffer.
 */
static void sha512_neon_reset(ssh_hash *hash)
{
    sha512_neon *ctx = container_of(hash, sha512_neon, hash);
    const uint64_t *iv = (const uint64_t *)hash->vt->extra;

    ctx->core.ab = vld1q_u64(iv + 0);
    ctx->core.cd = vld1q_u64(iv + 2);
    ctx->core.ef = vld1q_u64(iv + 4);
    ctx->core.gh = vld1q_u64(iv + 6);

    sha512_block_setup(&ctx->blk);
}
712
/* Clone the state of one NEON hash object into another. After the
 * structure copy, the BinarySink delegation pointers must be re-aimed
 * at the copy rather than left pointing into the original. */
static void sha512_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    sha512_neon *copy = container_of(hcopy, sha512_neon, hash);
    sha512_neon *orig = container_of(horig, sha512_neon, hash);

    *copy = *orig; /* structure copy */

    BinarySink_COPIED(copy);
    /* Fixed: '&copy' had been mangled into the HTML entity '©'. */
    BinarySink_DELEGATE_INIT(&copy->hash, copy);
}
723
/*
 * Destroy a NEON hash object, wiping its state first in case it
 * contained secret material.
 */
static void sha512_neon_free(ssh_hash *hash)
{
    sha512_neon *ctx = container_of(hash, sha512_neon, hash);

    smemclr(ctx, sizeof(*ctx));
    sfree(ctx);
}
730
/*
 * BinarySink write method: buffer incoming data, running the NEON
 * compression function every time a full 128-byte block accumulates.
 */
static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len)
{
    sha512_neon *ctx = BinarySink_DOWNCAST(bs, sha512_neon);

    while (len > 0) {
        bool block_ready = sha512_block_write(&ctx->blk, &vp, &len);
        if (block_ready)
            sha512_neon_block(&ctx->core, ctx->blk.block);
    }
}
739
/* Finalise a SHA-512 computation: apply the standard padding, then
 * store all four state vectors, byte-reversing each 64-bit lane back
 * into big-endian output order. */
static void sha512_neon_digest(ssh_hash *hash, uint8_t *digest)
{
    sha512_neon *s = container_of(hash, sha512_neon, hash);

    sha512_block_pad(&s->blk, BinarySink_UPCAST(s));

    vst1q_u8(digest,    vrev64q_u8(vreinterpretq_u8_u64(s->core.ab)));
    vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd)));
    vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef)));
    vst1q_u8(digest+48, vrev64q_u8(vreinterpretq_u8_u64(s->core.gh)));
}
751
/* SHA-384 finalisation: identical to sha512_neon_digest except that
 * the final state vector (g,h) is omitted, truncating to 48 bytes. */
static void sha384_neon_digest(ssh_hash *hash, uint8_t *digest)
{
    sha512_neon *s = container_of(hash, sha512_neon, hash);

    sha512_block_pad(&s->blk, BinarySink_UPCAST(s));

    vst1q_u8(digest,    vrev64q_u8(vreinterpretq_u8_u64(s->core.ab)));
    vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd)));
    vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef)));
}
762
/* Vtable for NEON-accelerated SHA-512; 'extra' supplies the IV. */
const ssh_hashalg ssh_sha512_hw = {
    .new = sha512_neon_new,
    .reset = sha512_neon_reset,
    .copyfrom = sha512_neon_copyfrom,
    .digest = sha512_neon_digest,
    .free = sha512_neon_free,
    .hlen = 64,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-512", "NEON accelerated"),
    .extra = sha512_initial_state,
};

/* NEON SHA-384: same machinery, different IV and truncating digest. */
const ssh_hashalg ssh_sha384_hw = {
    .new = sha512_neon_new,
    .reset = sha512_neon_reset,
    .copyfrom = sha512_neon_copyfrom,
    .digest = sha384_neon_digest,
    .free = sha512_neon_free,
    .hlen = 48,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-384", "NEON accelerated"),
    .extra = sha384_initial_state,
};
786
787 /* ----------------------------------------------------------------------
788 * Stub functions if we have no hardware-accelerated SHA-512. In this
789 * case, sha512_hw_new returns NULL (though it should also never be
790 * selected by sha512_select, so the only thing that should even be
791 * _able_ to call it is testcrypt). As a result, the remaining vtable
792 * functions should never be called at all.
793 */
794
795 #elif HW_SHA512 == HW_SHA512_NONE
796
/* No hardware support compiled in, so the query trivially fails. */
static bool sha512_hw_available(void)
{
    return false;
}
801
/* Stub constructor: always returns NULL, signalling that the
 * accelerated implementation cannot be instantiated here. */
static ssh_hash *sha512_stub_new(const ssh_hashalg *alg)
{
    return NULL;
}
806
/* Shared body for the remaining stub methods: since sha512_stub_new
 * never returns an object, none of these can legitimately be called. */
#define STUB_BODY { unreachable("Should never be called"); }

static void sha512_stub_reset(ssh_hash *hash) STUB_BODY
static void sha512_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY
static void sha512_stub_free(ssh_hash *hash) STUB_BODY
static void sha512_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY
813
/* Stub 'hw' vtables so ssh_sha512_hw / ssh_sha384_hw still link when
 * no accelerated implementation is compiled in; 'new' always fails. */
const ssh_hashalg ssh_sha512_hw = {
    .new = sha512_stub_new,
    .reset = sha512_stub_reset,
    .copyfrom = sha512_stub_copyfrom,
    .digest = sha512_stub_digest,
    .free = sha512_stub_free,
    .hlen = 64,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-512", "!NONEXISTENT ACCELERATED VERSION!"),
};

const ssh_hashalg ssh_sha384_hw = {
    .new = sha512_stub_new,
    .reset = sha512_stub_reset,
    .copyfrom = sha512_stub_copyfrom,
    .digest = sha512_stub_digest,
    .free = sha512_stub_free,
    .hlen = 48,
    .blocklen = 128,
    HASHALG_NAMES_ANNOTATED("SHA-384", "!NONEXISTENT ACCELERATED VERSION!"),
};
835
836 #endif /* HW_SHA512 */
837