/*
 * SHA-256 algorithm as described at
 *
 *   http://csrc.nist.gov/cryptval/shs.html
 */

#include "ssh.h"
#include <assert.h>

/*
 * Start by deciding whether we can support hardware SHA at all.
 */
#define HW_SHA256_NONE 0
#define HW_SHA256_NI 1
#define HW_SHA256_NEON 2

#ifdef _FORCE_SHA_NI
# define HW_SHA256 HW_SHA256_NI
#elif defined(__clang__)
# if __has_attribute(target) && __has_include(<wmmintrin.h>) && \
     (defined(__x86_64__) || defined(__i386))
#  define HW_SHA256 HW_SHA256_NI
# endif
#elif defined(__GNUC__)
# if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) && \
     (defined(__x86_64__) || defined(__i386))
#  define HW_SHA256 HW_SHA256_NI
# endif
#elif defined (_MSC_VER)
# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729
#  define HW_SHA256 HW_SHA256_NI
# endif
#endif

#ifdef _FORCE_SHA_NEON
# define HW_SHA256 HW_SHA256_NEON
#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
/* Arm can potentially support both endiannesses, but this code
 * hasn't been tested on anything but little. If anyone wants to
 * run big-endian, they'll need to fix it first. */
#elif defined __ARM_FEATURE_CRYPTO
/* If the Arm crypto extension is available already, we can
 * support NEON SHA without having to enable anything by hand */
# define HW_SHA256 HW_SHA256_NEON
#elif defined(__clang__)
# if __has_attribute(target) && __has_include(<arm_neon.h>) && \
     (defined(__aarch64__))
/* clang can enable the crypto extension in AArch64 using
 * __attribute__((target)) */
#  define HW_SHA256 HW_SHA256_NEON
#  define USE_CLANG_ATTR_TARGET_AARCH64
# endif
#elif defined _MSC_VER
/* Visual Studio supports the crypto extension when targeting
 * AArch64, but as of VS2017, the AArch32 header doesn't quite
 * manage it (declaring the shae/shad intrinsics without a round
 * key operand). */
# if defined _M_ARM64
#  define HW_SHA256 HW_SHA256_NEON
#  define USE_ARM64_NEON_H /* unusual header name in this case */
# endif
#endif

#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA256
# undef HW_SHA256
# define HW_SHA256 HW_SHA256_NONE
#endif
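
/*
 * (A note on the _FORCE_* macros above, as a hedged aside: they are
 * meant to be set on the compiler command line to override the
 * automatic detection, e.g.
 *
 *   cc -D_FORCE_SHA_NI ...         force the x86 SHA-NI code path
 *   cc -D_FORCE_SHA_NEON ...       force the Arm NEON code path
 *   cc -D_FORCE_SOFTWARE_SHA ...   disable hardware SHA entirely
 *
 * Forcing a code path that the toolchain or target CPU can't
 * actually support will of course fail at compile or run time.)
 */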

/*
 * The actual query function that asks if hardware acceleration is
 * available.
 */
static bool sha256_hw_available(void);

/*
 * The top-level selection function, caching the results of
 * sha256_hw_available() so it only has to run once.
 */
static bool sha256_hw_available_cached(void)
{
    static bool initialised = false;
    static bool hw_available;
    if (!initialised) {
        hw_available = sha256_hw_available();
        initialised = true;
    }
    return hw_available;
}
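
/*
 * (The lazy initialisation above is not thread-safe as written,
 * which is fine in a single-threaded context; a multi-threaded
 * caller would want to run the detection once up front, or use a
 * thread-safe once primitive.)
 */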

static ssh_hash *sha256_select(const ssh_hashalg *alg)
{
    const ssh_hashalg *real_alg =
        sha256_hw_available_cached() ? &ssh_sha256_hw : &ssh_sha256_sw;

    return ssh_hash_new(real_alg);
}

const ssh_hashalg ssh_sha256 = {
    .new = sha256_select,
    .hlen = 32,
    .blocklen = 64,
    HASHALG_NAMES_ANNOTATED("SHA-256", "dummy selector vtable"),
};
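
/*
 * A minimal usage sketch of the selector vtable above (illustrative
 * only, assuming the ssh_hash_* wrappers and BinarySink macros
 * declared in ssh.h; not part of this file's build):
 *
 *   uint8_t digest[32];
 *   ssh_hash *h = ssh_hash_new(&ssh_sha256);  // picks hw or sw backend
 *   put_data(h, "abc", 3);                    // absorb input
 *   ssh_hash_final(h, digest);                // pad, emit digest, free
 */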
106
107 /* ----------------------------------------------------------------------
108 * Definitions likely to be helpful to multiple implementations.
109 */
110
111 static const uint32_t sha256_initial_state[] = {
112 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
113 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
114 };
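/* (These eight words are the fractional parts of the square roots of
 * the first eight primes, as specified in FIPS 180-4.) */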

static const uint32_t sha256_round_constants[] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};
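/* (And these 64 are the fractional parts of the cube roots of the
 * first 64 primes, again per FIPS 180-4.) */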

#define SHA256_ROUNDS 64

typedef struct sha256_block sha256_block;
struct sha256_block {
    uint8_t block[64];
    size_t used;
    uint64_t len;
};

static inline void sha256_block_setup(sha256_block *blk)
{
    blk->used = 0;
    blk->len = 0;
}

static inline bool sha256_block_write(
    sha256_block *blk, const void **vdata, size_t *len)
{
    size_t blkleft = sizeof(blk->block) - blk->used;
    size_t chunk = *len < blkleft ? *len : blkleft;

    const uint8_t *p = *vdata;
    memcpy(blk->block + blk->used, p, chunk);
    *vdata = p + chunk;
    *len -= chunk;
    blk->used += chunk;
    blk->len += chunk;

    if (blk->used == sizeof(blk->block)) {
        blk->used = 0;
        return true;
    }

    return false;
}

static inline void sha256_block_pad(sha256_block *blk, BinarySink *bs)
{
    uint64_t final_len = blk->len << 3;
    size_t pad = 1 + (63 & (55 - blk->used));

    put_byte(bs, 0x80);
    for (size_t i = 1; i < pad; i++)
        put_byte(bs, 0);
    put_uint64(bs, final_len);

    assert(blk->used == 0 && "Should have exactly hit a block boundary");
}
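
/*
 * Worked example of the padding arithmetic above: the padded message
 * must end 8 bytes short of a block boundary, so that the 64-bit bit
 * count fills the block exactly. If used == 55, then
 * pad = 1 + (63 & 0) = 1, and a single 0x80 byte plus the length
 * lands precisely on 64 bytes. If used == 56, pad = 1 + 63 = 64, and
 * the padding spills into a whole extra block (56 + 64 + 8 = 128).
 */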

/* ----------------------------------------------------------------------
 * Software implementation of SHA-256.
 */

static inline uint32_t ror(uint32_t x, unsigned y)
{
    return (x << (31 & -y)) | (x >> (31 & y));
}

static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0)
{
    return if0 ^ (ctrl & (if1 ^ if0));
}

static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z)
{
    return (x & y) | (z & (x | y));
}
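
/*
 * (Ch and Maj are written in slightly non-obvious forms, rather than
 * the textbook (x & y) ^ (~x & z) and (x & y) ^ (x & z) ^ (y & z),
 * because these compute the same functions with fewer operations:
 * Ch selects bitwise between if1 and if0 under ctrl, and Maj is a
 * bitwise majority vote of its three inputs.)
 */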

static inline uint32_t Sigma_0(uint32_t x)
{
    return ror(x,2) ^ ror(x,13) ^ ror(x,22);
}

static inline uint32_t Sigma_1(uint32_t x)
{
    return ror(x,6) ^ ror(x,11) ^ ror(x,25);
}

static inline uint32_t sigma_0(uint32_t x)
{
    return ror(x,7) ^ ror(x,18) ^ (x >> 3);
}

static inline uint32_t sigma_1(uint32_t x)
{
    return ror(x,17) ^ ror(x,19) ^ (x >> 10);
}

static inline void sha256_sw_round(
    unsigned round_index, const uint32_t *schedule,
    uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
    uint32_t *e, uint32_t *f, uint32_t *g, uint32_t *h)
{
    uint32_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) +
        sha256_round_constants[round_index] + schedule[round_index];

    uint32_t t2 = Sigma_0(*a) + Maj(*a,*b,*c);

    *d += t1;
    *h = t1 + t2;
}

static void sha256_sw_block(uint32_t *core, const uint8_t *block)
{
    uint32_t w[SHA256_ROUNDS];
    uint32_t a,b,c,d,e,f,g,h;

    for (size_t t = 0; t < 16; t++)
        w[t] = GET_32BIT_MSB_FIRST(block + 4*t);

    for (size_t t = 16; t < SHA256_ROUNDS; t++)
        w[t] = sigma_1(w[t-2]) + w[t-7] + sigma_0(w[t-15]) + w[t-16];

    a = core[0]; b = core[1]; c = core[2]; d = core[3];
    e = core[4]; f = core[5]; g = core[6]; h = core[7];

    for (size_t t = 0; t < SHA256_ROUNDS; t += 8) {
        sha256_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h);
        sha256_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g);
        sha256_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f);
        sha256_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e);
        sha256_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d);
        sha256_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c);
        sha256_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b);
        sha256_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a);
    }

    core[0] += a; core[1] += b; core[2] += c; core[3] += d;
    core[4] += e; core[5] += f; core[6] += g; core[7] += h;

    smemclr(w, sizeof(w));
}
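
/*
 * (The 8-way unrolling above implements the spec's rotation of the
 * working variables a..h by permuting the argument list instead of
 * moving any data: after round t+0 each variable's role shifts one
 * place, so round t+1 passes the same eight variables starting from
 * h. After eight rounds the permutation returns to its start, which
 * is why the loop strides by 8.)
 */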

typedef struct sha256_sw {
    uint32_t core[8];
    sha256_block blk;
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;
} sha256_sw;

static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len);

static ssh_hash *sha256_sw_new(const ssh_hashalg *alg)
{
    sha256_sw *s = snew(sha256_sw);

    s->hash.vt = alg;
    BinarySink_INIT(s, sha256_sw_write);
    BinarySink_DELEGATE_INIT(&s->hash, s);
    return &s->hash;
}

static void sha256_sw_reset(ssh_hash *hash)
{
    sha256_sw *s = container_of(hash, sha256_sw, hash);

    memcpy(s->core, sha256_initial_state, sizeof(s->core));
    sha256_block_setup(&s->blk);
}

static void sha256_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    sha256_sw *copy = container_of(hcopy, sha256_sw, hash);
    sha256_sw *orig = container_of(horig, sha256_sw, hash);

    memcpy(copy, orig, sizeof(*copy));
    BinarySink_COPIED(copy);
    BinarySink_DELEGATE_INIT(&copy->hash, copy);
}

static void sha256_sw_free(ssh_hash *hash)
{
    sha256_sw *s = container_of(hash, sha256_sw, hash);

    smemclr(s, sizeof(*s));
    sfree(s);
}

static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len)
{
    sha256_sw *s = BinarySink_DOWNCAST(bs, sha256_sw);

    while (len > 0)
        if (sha256_block_write(&s->blk, &vp, &len))
            sha256_sw_block(s->core, s->blk.block);
}

static void sha256_sw_digest(ssh_hash *hash, uint8_t *digest)
{
    sha256_sw *s = container_of(hash, sha256_sw, hash);

    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
    for (size_t i = 0; i < 8; i++)
        PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]);
}

const ssh_hashalg ssh_sha256_sw = {
    .new = sha256_sw_new,
    .reset = sha256_sw_reset,
    .copyfrom = sha256_sw_copyfrom,
    .digest = sha256_sw_digest,
    .free = sha256_sw_free,
    .hlen = 32,
    .blocklen = 64,
    HASHALG_NAMES_ANNOTATED("SHA-256", "unaccelerated"),
};
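
/*
 * A quick self-test sketch against the FIPS 180-4 "abc" test vector
 * (a hypothetical helper, not built into this file, assuming the
 * ssh_hash_* wrappers from ssh.h; the expected digest is the
 * published one):
 *
 *   static void sha256_smoke_test(void)
 *   {
 *       static const uint8_t expected[32] = {
 *           0xba,0x78,0x16,0xbf,0x8f,0x01,0xcf,0xea,
 *           0x41,0x41,0x40,0xde,0x5d,0xae,0x22,0x23,
 *           0xb0,0x03,0x61,0xa3,0x96,0x17,0x7a,0x9c,
 *           0xb4,0x10,0xff,0x61,0xf2,0x00,0x15,0xad,
 *       };
 *       uint8_t digest[32];
 *       ssh_hash *h = ssh_hash_new(&ssh_sha256_sw);
 *       put_data(h, "abc", 3);
 *       ssh_hash_final(h, digest);
 *       assert(memcmp(digest, expected, 32) == 0);
 *   }
 */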

/* ----------------------------------------------------------------------
 * Hardware-accelerated implementation of SHA-256 using x86 SHA-NI.
 */

#if HW_SHA256 == HW_SHA256_NI

/*
 * Set target architecture for Clang and GCC
 */
#if defined(__clang__) || defined(__GNUC__)
# define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
#if !defined(__clang__)
# pragma GCC target("sha")
# pragma GCC target("sse4.1")
#endif
#else
# define FUNC_ISA
#endif

#include <wmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
#if defined(__clang__) || defined(__GNUC__)
#include <shaintrin.h>
#endif

#if defined(__clang__) || defined(__GNUC__)
#include <cpuid.h>
#define GET_CPU_ID_0(out) \
    __cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3])
#define GET_CPU_ID_7(out) \
    __cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3])
#else
#define GET_CPU_ID_0(out) __cpuid(out, 0)
#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0)
#endif

static bool sha256_hw_available(void)
{
    unsigned int CPUInfo[4];
    GET_CPU_ID_0(CPUInfo);
    if (CPUInfo[0] < 7)
        return false;

    GET_CPU_ID_7(CPUInfo);
    return CPUInfo[1] & (1 << 29); /* SHA extensions flag: bit 29 of EBX
                                    * from CPUID leaf 7, subleaf 0 */
}

/* SHA256 implementation using new instructions
   The code is based on Jeffrey Walton's SHA256 implementation:
   https://github.com/noloader/SHA-Intrinsics
*/
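
/*
 * (Structure of the function below: _mm_sha256rnds2_epu32 performs
 * two rounds at a time, consuming two round inputs from the low 64
 * bits of its message argument. So each four-round group does one
 * sha256rnds2, moves the upper two words down with
 * _mm_shuffle_epi32(MSG, 0x0E), and does a second sha256rnds2. The
 * round constants are folded directly into the message words with
 * _mm_add_epi32, and _mm_sha256msg1/_mm_sha256msg2 compute the
 * message-schedule expansion.)
 */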
FUNC_ISA
static inline void sha256_ni_block(__m128i *core, const uint8_t *p)
{
    __m128i STATE0, STATE1;
    __m128i MSG, TMP;
    __m128i MSG0, MSG1, MSG2, MSG3;
    const __m128i *block = (const __m128i *)p;
    const __m128i MASK = _mm_set_epi64x(
        0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);

    /* Load initial values */
    STATE0 = core[0];
    STATE1 = core[1];

    /* Rounds 0-3 */
    MSG = _mm_loadu_si128(block);
    MSG0 = _mm_shuffle_epi8(MSG, MASK);
    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
                            0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    /* Rounds 4-7 */
    MSG1 = _mm_loadu_si128(block + 1);
    MSG1 = _mm_shuffle_epi8(MSG1, MASK);
    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
                            0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

    /* Rounds 8-11 */
    MSG2 = _mm_loadu_si128(block + 2);
    MSG2 = _mm_shuffle_epi8(MSG2, MASK);
    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
                            0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

    /* Rounds 12-15 */
    MSG3 = _mm_loadu_si128(block + 3);
    MSG3 = _mm_shuffle_epi8(MSG3, MASK);
    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
                            0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
    MSG0 = _mm_add_epi32(MSG0, TMP);
    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

    /* Rounds 16-19 */
    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
                            0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
    MSG1 = _mm_add_epi32(MSG1, TMP);
    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

    /* Rounds 20-23 */
    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
                            0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
    MSG2 = _mm_add_epi32(MSG2, TMP);
    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

    /* Rounds 24-27 */
    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
                            0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
    MSG3 = _mm_add_epi32(MSG3, TMP);
    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

    /* Rounds 28-31 */
    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
                            0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
    MSG0 = _mm_add_epi32(MSG0, TMP);
    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

    /* Rounds 32-35 */
    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
                            0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
    MSG1 = _mm_add_epi32(MSG1, TMP);
    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

    /* Rounds 36-39 */
    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
                            0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
    MSG2 = _mm_add_epi32(MSG2, TMP);
    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

    /* Rounds 40-43 */
    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
                            0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
    MSG3 = _mm_add_epi32(MSG3, TMP);
    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

    /* Rounds 44-47 */
    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
                            0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
    MSG0 = _mm_add_epi32(MSG0, TMP);
    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

    /* Rounds 48-51 */
    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
                            0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
    MSG1 = _mm_add_epi32(MSG1, TMP);
    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

    /* Rounds 52-55 */
    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
                            0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
    MSG2 = _mm_add_epi32(MSG2, TMP);
    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    /* Rounds 56-59 */
    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
                            0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
    MSG3 = _mm_add_epi32(MSG3, TMP);
    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    /* Rounds 60-63 */
    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
                            0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    /* Combine state */
    core[0] = _mm_add_epi32(STATE0, core[0]);
    core[1] = _mm_add_epi32(STATE1, core[1]);
}

typedef struct sha256_ni {
    /*
     * These two vectors store the 8 words of the SHA-256 state, but
     * not in the same order they appear in the spec: the first
     * vector holds A,B,E,F and the second C,D,G,H.
     */
    __m128i core[2];
    sha256_block blk;
    void *pointer_to_free;
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;
} sha256_ni;

static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len);

static sha256_ni *sha256_ni_alloc(void)
{
    /*
     * The __m128i variables in the context structure need to be
     * 16-byte aligned, but not all malloc implementations that this
     * code has to work with will guarantee to return a 16-byte
     * aligned pointer. So we over-allocate, manually realign the
     * pointer ourselves, and store the original one inside the
     * context so we know how to free it later.
     */
    void *allocation = smalloc(sizeof(sha256_ni) + 15);
    uintptr_t alloc_address = (uintptr_t)allocation;
    uintptr_t aligned_address = (alloc_address + 15) & ~15;
    sha256_ni *s = (sha256_ni *)aligned_address;
    s->pointer_to_free = allocation;
    return s;
}
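
/*
 * (A C11 toolchain could achieve the same with aligned_alloc, or
 * POSIX with posix_memalign, but the manual realignment above keeps
 * this working with the project's own allocator and with older
 * compilers.)
 */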

static ssh_hash *sha256_ni_new(const ssh_hashalg *alg)
{
    if (!sha256_hw_available_cached())
        return NULL;

    sha256_ni *s = sha256_ni_alloc();

    s->hash.vt = alg;
    BinarySink_INIT(s, sha256_ni_write);
    BinarySink_DELEGATE_INIT(&s->hash, s);

    return &s->hash;
}

FUNC_ISA static void sha256_ni_reset(ssh_hash *hash)
{
    sha256_ni *s = container_of(hash, sha256_ni, hash);

    /* Initialise the core vectors in their storage order */
    s->core[0] = _mm_set_epi64x(
        0x6a09e667bb67ae85ULL, 0x510e527f9b05688cULL);
    s->core[1] = _mm_set_epi64x(
        0x3c6ef372a54ff53aULL, 0x1f83d9ab5be0cd19ULL);

    sha256_block_setup(&s->blk);
}

static void sha256_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    sha256_ni *copy = container_of(hcopy, sha256_ni, hash);
    sha256_ni *orig = container_of(horig, sha256_ni, hash);

    void *ptf_save = copy->pointer_to_free;
    *copy = *orig; /* structure copy */
    copy->pointer_to_free = ptf_save;

    BinarySink_COPIED(copy);
    BinarySink_DELEGATE_INIT(&copy->hash, copy);
}

static void sha256_ni_free(ssh_hash *hash)
{
    sha256_ni *s = container_of(hash, sha256_ni, hash);

    void *ptf = s->pointer_to_free;
    smemclr(s, sizeof(*s));
    sfree(ptf);
}

static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len)
{
    sha256_ni *s = BinarySink_DOWNCAST(bs, sha256_ni);

    while (len > 0)
        if (sha256_block_write(&s->blk, &vp, &len))
            sha256_ni_block(s->core, s->blk.block);
}

FUNC_ISA static void sha256_ni_digest(ssh_hash *hash, uint8_t *digest)
{
    sha256_ni *s = container_of(hash, sha256_ni, hash);

    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));

    /* Rearrange the words into the output order */
    __m128i feba = _mm_shuffle_epi32(s->core[0], 0x1B);
    __m128i dchg = _mm_shuffle_epi32(s->core[1], 0xB1);
    __m128i dcba = _mm_blend_epi16(feba, dchg, 0xF0);
    __m128i hgfe = _mm_alignr_epi8(dchg, feba, 8);

    /* Byte-swap them into the output endianness */
    const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
    dcba = _mm_shuffle_epi8(dcba, mask);
    hgfe = _mm_shuffle_epi8(hgfe, mask);

    /* And store them */
    __m128i *output = (__m128i *)digest;
    _mm_storeu_si128(output, dcba);
    _mm_storeu_si128(output+1, hgfe);
}

const ssh_hashalg ssh_sha256_hw = {
    .new = sha256_ni_new,
    .reset = sha256_ni_reset,
    .copyfrom = sha256_ni_copyfrom,
    .digest = sha256_ni_digest,
    .free = sha256_ni_free,
    .hlen = 32,
    .blocklen = 64,
    HASHALG_NAMES_ANNOTATED("SHA-256", "SHA-NI accelerated"),
};

/* ----------------------------------------------------------------------
 * Hardware-accelerated implementation of SHA-256 using Arm NEON.
 */

#elif HW_SHA256 == HW_SHA256_NEON

/*
 * Manually set the target architecture, if we decided above that we
 * need to.
 */
#ifdef USE_CLANG_ATTR_TARGET_AARCH64
/*
 * A spot of cheating: redefine some ACLE feature macros before
 * including arm_neon.h. Otherwise we won't get the SHA intrinsics
 * defined by that header, because it will be looking at the settings
 * for the whole translation unit rather than the ones we're going to
 * put on some particular functions using __attribute__((target)).
 */
#define __ARM_NEON 1
#define __ARM_FEATURE_CRYPTO 1
#define __ARM_FEATURE_SHA2 1
#define FUNC_ISA __attribute__ ((target("neon,crypto")))
#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */

#ifndef FUNC_ISA
#define FUNC_ISA
#endif

#ifdef USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif

static bool sha256_hw_available(void)
{
    /*
     * For Arm, we delegate to a per-platform detection function (see
     * explanation in sshaes.c).
     */
    return platform_sha256_hw_available();
}

typedef struct sha256_neon_core sha256_neon_core;
struct sha256_neon_core {
    uint32x4_t abcd, efgh;
};

FUNC_ISA
static inline uint32x4_t sha256_neon_load_input(const uint8_t *p)
{
    return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p)));
}

FUNC_ISA
static inline uint32x4_t sha256_neon_schedule_update(
    uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1)
{
    return vsha256su1q_u32(vsha256su0q_u32(m4, m3), m2, m1);
}

FUNC_ISA
static inline sha256_neon_core sha256_neon_round4(
    sha256_neon_core old, uint32x4_t sched, unsigned round)
{
    sha256_neon_core new;

    uint32x4_t round_input = vaddq_u32(
        sched, vld1q_u32(sha256_round_constants + round));
    new.abcd = vsha256hq_u32 (old.abcd, old.efgh, round_input);
    new.efgh = vsha256h2q_u32(old.efgh, old.abcd, round_input);
    return new;
}

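/*
 * (The NEON flow below mirrors the SHA-NI one: vsha256hq_u32 and
 * vsha256h2q_u32 together perform four rounds, updating the ABCD and
 * EFGH halves of the state respectively, while vsha256su0q_u32 and
 * vsha256su1q_u32 implement the message-schedule expansion four
 * words at a time.)
 */
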
FUNC_ISA
static inline void sha256_neon_block(sha256_neon_core *core, const uint8_t *p)
{
    uint32x4_t s0, s1, s2, s3;
    sha256_neon_core cr = *core;

    s0 = sha256_neon_load_input(p);
    cr = sha256_neon_round4(cr, s0, 0);
    s1 = sha256_neon_load_input(p+16);
    cr = sha256_neon_round4(cr, s1, 4);
    s2 = sha256_neon_load_input(p+32);
    cr = sha256_neon_round4(cr, s2, 8);
    s3 = sha256_neon_load_input(p+48);
    cr = sha256_neon_round4(cr, s3, 12);
    s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
    cr = sha256_neon_round4(cr, s0, 16);
    s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
    cr = sha256_neon_round4(cr, s1, 20);
    s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
    cr = sha256_neon_round4(cr, s2, 24);
    s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
    cr = sha256_neon_round4(cr, s3, 28);
    s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
    cr = sha256_neon_round4(cr, s0, 32);
    s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
    cr = sha256_neon_round4(cr, s1, 36);
    s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
    cr = sha256_neon_round4(cr, s2, 40);
    s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
    cr = sha256_neon_round4(cr, s3, 44);
    s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
    cr = sha256_neon_round4(cr, s0, 48);
    s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
    cr = sha256_neon_round4(cr, s1, 52);
    s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
    cr = sha256_neon_round4(cr, s2, 56);
    s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
    cr = sha256_neon_round4(cr, s3, 60);

    core->abcd = vaddq_u32(core->abcd, cr.abcd);
    core->efgh = vaddq_u32(core->efgh, cr.efgh);
}

typedef struct sha256_neon {
    sha256_neon_core core;
    sha256_block blk;
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;
} sha256_neon;

static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len);

static ssh_hash *sha256_neon_new(const ssh_hashalg *alg)
{
    if (!sha256_hw_available_cached())
        return NULL;

    sha256_neon *s = snew(sha256_neon);

    s->hash.vt = alg;
    BinarySink_INIT(s, sha256_neon_write);
    BinarySink_DELEGATE_INIT(&s->hash, s);
    return &s->hash;
}

static void sha256_neon_reset(ssh_hash *hash)
{
    sha256_neon *s = container_of(hash, sha256_neon, hash);

    s->core.abcd = vld1q_u32(sha256_initial_state);
    s->core.efgh = vld1q_u32(sha256_initial_state + 4);

    sha256_block_setup(&s->blk);
}

static void sha256_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    sha256_neon *copy = container_of(hcopy, sha256_neon, hash);
    sha256_neon *orig = container_of(horig, sha256_neon, hash);

    *copy = *orig; /* structure copy */

    BinarySink_COPIED(copy);
    BinarySink_DELEGATE_INIT(&copy->hash, copy);
}

static void sha256_neon_free(ssh_hash *hash)
{
    sha256_neon *s = container_of(hash, sha256_neon, hash);
    smemclr(s, sizeof(*s));
    sfree(s);
}

static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len)
{
    sha256_neon *s = BinarySink_DOWNCAST(bs, sha256_neon);

    while (len > 0)
        if (sha256_block_write(&s->blk, &vp, &len))
            sha256_neon_block(&s->core, s->blk.block);
}

static void sha256_neon_digest(ssh_hash *hash, uint8_t *digest)
{
    sha256_neon *s = container_of(hash, sha256_neon, hash);

    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
    vst1q_u8(digest, vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd)));
    vst1q_u8(digest + 16, vrev32q_u8(vreinterpretq_u8_u32(s->core.efgh)));
}

const ssh_hashalg ssh_sha256_hw = {
    .new = sha256_neon_new,
    .reset = sha256_neon_reset,
    .copyfrom = sha256_neon_copyfrom,
    .digest = sha256_neon_digest,
    .free = sha256_neon_free,
    .hlen = 32,
    .blocklen = 64,
    HASHALG_NAMES_ANNOTATED("SHA-256", "NEON accelerated"),
};

/* ----------------------------------------------------------------------
 * Stub functions if we have no hardware-accelerated SHA-256. In this
 * case, sha256_stub_new returns NULL (though it should also never be
 * selected by sha256_select, so the only thing that should even be
 * _able_ to call it is testcrypt). As a result, the remaining vtable
 * functions should never be called at all.
 */

#elif HW_SHA256 == HW_SHA256_NONE

static bool sha256_hw_available(void)
{
    return false;
}

static ssh_hash *sha256_stub_new(const ssh_hashalg *alg)
{
    return NULL;
}

#define STUB_BODY { unreachable("Should never be called"); }

static void sha256_stub_reset(ssh_hash *hash) STUB_BODY
static void sha256_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY
static void sha256_stub_free(ssh_hash *hash) STUB_BODY
static void sha256_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY

const ssh_hashalg ssh_sha256_hw = {
    .new = sha256_stub_new,
    .reset = sha256_stub_reset,
    .copyfrom = sha256_stub_copyfrom,
    .digest = sha256_stub_digest,
    .free = sha256_stub_free,
    .hlen = 32,
    .blocklen = 64,
    HASHALG_NAMES_ANNOTATED("SHA-256", "!NONEXISTENT ACCELERATED VERSION!"),
};

#endif /* HW_SHA256 */