/*
 * SHA-256 algorithm as described at
 *
 *   http://csrc.nist.gov/cryptval/shs.html
 */

#include "ssh.h"
#include <assert.h>

/*
 * Start by deciding whether we can support hardware SHA at all.
 */
#define HW_SHA256_NONE 0
#define HW_SHA256_NI 1
#define HW_SHA256_NEON 2

#ifdef _FORCE_SHA_NI
#   define HW_SHA256 HW_SHA256_NI
#elif defined(__clang__)
#   if __has_attribute(target) && __has_include(<wmmintrin.h>) &&       \
    (defined(__x86_64__) || defined(__i386))
#       define HW_SHA256 HW_SHA256_NI
#   endif
#elif defined(__GNUC__)
#    if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) && \
        (defined(__x86_64__) || defined(__i386))
#       define HW_SHA256 HW_SHA256_NI
#    endif
#elif defined (_MSC_VER)
#   if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729
#      define HW_SHA256 HW_SHA256_NI
#   endif
#endif

#ifdef _FORCE_SHA_NEON
#   define HW_SHA256 HW_SHA256_NEON
#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    /* Arm can potentially support both endiannesses, but this code
     * hasn't been tested on anything but little. If anyone wants to
     * run big-endian, they'll need to fix it first. */
#elif defined __ARM_FEATURE_CRYPTO
    /* If the Arm crypto extension is available already, we can
     * support NEON SHA without having to enable anything by hand */
#   define HW_SHA256 HW_SHA256_NEON
#elif defined(__clang__)
#   if __has_attribute(target) && __has_include(<arm_neon.h>) &&       \
    (defined(__aarch64__))
        /* clang can enable the crypto extension in AArch64 using
         * __attribute__((target)) */
#       define HW_SHA256 HW_SHA256_NEON
#       define USE_CLANG_ATTR_TARGET_AARCH64
#   endif
#elif defined _MSC_VER
    /* Visual Studio supports the crypto extension when targeting
     * AArch64, but as of VS2017, the AArch32 header doesn't quite
     * manage it (declaring the shae/shad intrinsics without a round
     * key operand). */
#   if defined _M_ARM64
#       define HW_SHA256 HW_SHA256_NEON
#       define USE_ARM64_NEON_H /* unusual header name in this case */
#   endif
#endif

#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA256
#   undef HW_SHA256
#   define HW_SHA256 HW_SHA256_NONE
#endif

/*
 * The actual query function that asks if hardware acceleration is
 * available.
 */
static bool sha256_hw_available(void);

/*
 * The top-level selection function, caching the results of
 * sha256_hw_available() so it only has to run once.
 */
static bool sha256_hw_available_cached(void)
{
    static bool initialised = false;
    static bool hw_available;
    if (!initialised) {
        hw_available = sha256_hw_available();
        initialised = true;
    }
    return hw_available;
}

static ssh_hash *sha256_select(const ssh_hashalg *alg)
{
    const ssh_hashalg *real_alg =
        sha256_hw_available_cached() ? &ssh_sha256_hw : &ssh_sha256_sw;

    return ssh_hash_new(real_alg);
}

const ssh_hashalg ssh_sha256 = {
    .new = sha256_select,
    .hlen = 32,
    .blocklen = 64,
    HASHALG_NAMES_ANNOTATED("SHA-256", "dummy selector vtable"),
};
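
/*
 * Usage sketch (an illustration only: it assumes the ssh_hash
 * convenience wrappers declared in ssh.h, namely ssh_hash_new,
 * put_data and ssh_hash_final):
 *
 *     unsigned char digest[32];
 *     ssh_hash *h = ssh_hash_new(&ssh_sha256);
 *     put_data(h, data, len);        // absorb input via the BinarySink
 *     ssh_hash_final(h, digest);     // emit 32 bytes and free h
 */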

/* ----------------------------------------------------------------------
 * Definitions likely to be helpful to multiple implementations.
 */

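/* The SHA-256 initial state: the first 32 bits of the fractional
 * parts of the square roots of the first 8 primes. */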
static const uint32_t sha256_initial_state[] = {
    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
};

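/* The round constants: the first 32 bits of the fractional parts of
 * the cube roots of the first 64 primes. */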
static const uint32_t sha256_round_constants[] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};

#define SHA256_ROUNDS 64

typedef struct sha256_block sha256_block;
struct sha256_block {
    uint8_t block[64];
    size_t used;
    uint64_t len;
};

static inline void sha256_block_setup(sha256_block *blk)
{
    blk->used = 0;
    blk->len = 0;
}

static inline bool sha256_block_write(
    sha256_block *blk, const void **vdata, size_t *len)
{
    size_t blkleft = sizeof(blk->block) - blk->used;
    size_t chunk = *len < blkleft ? *len : blkleft;

    const uint8_t *p = *vdata;
    memcpy(blk->block + blk->used, p, chunk);
    *vdata = p + chunk;
    *len -= chunk;
    blk->used += chunk;
    blk->len += chunk;

    if (blk->used == sizeof(blk->block)) {
        blk->used = 0;
        return true;
    }

    return false;
}

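/*
 * Padding, as the spec requires: append a single 0x80 byte, then zero
 * bytes until exactly 8 bytes remain in the final block (i.e. until
 * blk->used == 56 mod 64), then the total message length in bits as a
 * big-endian 64-bit integer. The expression 1 + (63 & (55 - used))
 * computes exactly that many padding bytes: after writing them,
 * used ends up congruent to 56 mod 64, so the 8 length bytes land
 * precisely on a block boundary.
 */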
static inline void sha256_block_pad(sha256_block *blk, BinarySink *bs)
{
    uint64_t final_len = blk->len << 3;
    size_t pad = 1 + (63 & (55 - blk->used));

    put_byte(bs, 0x80);
    for (size_t i = 1; i < pad; i++)
        put_byte(bs, 0);
    put_uint64(bs, final_len);

    assert(blk->used == 0 && "Should have exactly hit a block boundary");
}

/* ----------------------------------------------------------------------
 * Software implementation of SHA-256.
 */

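/*
 * Rotate right by y. Masking both shift counts with 31 keeps them in
 * [0,31], so neither shift has undefined behaviour (and y == 0 is
 * safe: -y masks to 0 and the expression reduces to x | x).
 */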
static inline uint32_t ror(uint32_t x, unsigned y)
{
    return (x << (31 & -y)) | (x >> (31 & y));
}

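/*
 * The Ch and Maj functions from the spec, in branch-free form: Ch
 * selects, bit by bit, if1 where ctrl is 1 and if0 where ctrl is 0;
 * Maj returns the majority value of each bit position of x, y, z.
 */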
static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0)
{
    return if0 ^ (ctrl & (if1 ^ if0));
}

static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z)
{
    return (x & y) | (z & (x | y));
}

static inline uint32_t Sigma_0(uint32_t x)
{
    return ror(x,2) ^ ror(x,13) ^ ror(x,22);
}

static inline uint32_t Sigma_1(uint32_t x)
{
    return ror(x,6) ^ ror(x,11) ^ ror(x,25);
}

static inline uint32_t sigma_0(uint32_t x)
{
    return ror(x,7) ^ ror(x,18) ^ (x >> 3);
}

static inline uint32_t sigma_1(uint32_t x)
{
    return ror(x,17) ^ ror(x,19) ^ (x >> 10);
}

static inline void sha256_sw_round(
    unsigned round_index, const uint32_t *schedule,
    uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
    uint32_t *e, uint32_t *f, uint32_t *g, uint32_t *h)
{
    uint32_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) +
        sha256_round_constants[round_index] + schedule[round_index];

    uint32_t t2 = Sigma_0(*a) + Maj(*a,*b,*c);

    *d += t1;
    *h = t1 + t2;
}

static void sha256_sw_block(uint32_t *core, const uint8_t *block)
{
    uint32_t w[SHA256_ROUNDS];
    uint32_t a,b,c,d,e,f,g,h;

    for (size_t t = 0; t < 16; t++)
        w[t] = GET_32BIT_MSB_FIRST(block + 4*t);

    for (size_t t = 16; t < SHA256_ROUNDS; t++)
        w[t] = sigma_1(w[t-2]) + w[t-7] + sigma_0(w[t-15]) + w[t-16];

    a = core[0]; b = core[1]; c = core[2]; d = core[3];
    e = core[4]; f = core[5]; g = core[6]; h = core[7];

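    /*
     * Instead of physically rotating the eight state variables after
     * every round, each call in this unrolled group of 8 permutes the
     * pointer arguments one position further round, which amounts to
     * the same thing at zero runtime cost.
     */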
    for (size_t t = 0; t < SHA256_ROUNDS; t += 8) {
        sha256_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h);
        sha256_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g);
        sha256_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f);
        sha256_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e);
        sha256_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d);
        sha256_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c);
        sha256_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b);
        sha256_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a);
    }

    core[0] += a; core[1] += b; core[2] += c; core[3] += d;
    core[4] += e; core[5] += f; core[6] += g; core[7] += h;

    smemclr(w, sizeof(w));
}

typedef struct sha256_sw {
    uint32_t core[8];
    sha256_block blk;
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;
} sha256_sw;

static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len);

static ssh_hash *sha256_sw_new(const ssh_hashalg *alg)
{
    sha256_sw *s = snew(sha256_sw);

    s->hash.vt = alg;
    BinarySink_INIT(s, sha256_sw_write);
    BinarySink_DELEGATE_INIT(&s->hash, s);
    return &s->hash;
}

static void sha256_sw_reset(ssh_hash *hash)
{
    sha256_sw *s = container_of(hash, sha256_sw, hash);

    memcpy(s->core, sha256_initial_state, sizeof(s->core));
    sha256_block_setup(&s->blk);
}

static void sha256_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    sha256_sw *copy = container_of(hcopy, sha256_sw, hash);
    sha256_sw *orig = container_of(horig, sha256_sw, hash);

    memcpy(copy, orig, sizeof(*copy));
    BinarySink_COPIED(copy);
    BinarySink_DELEGATE_INIT(&copy->hash, copy);
}

static void sha256_sw_free(ssh_hash *hash)
{
    sha256_sw *s = container_of(hash, sha256_sw, hash);

    smemclr(s, sizeof(*s));
    sfree(s);
}

static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len)
{
    sha256_sw *s = BinarySink_DOWNCAST(bs, sha256_sw);

    while (len > 0)
        if (sha256_block_write(&s->blk, &vp, &len))
            sha256_sw_block(s->core, s->blk.block);
}

static void sha256_sw_digest(ssh_hash *hash, uint8_t *digest)
{
    sha256_sw *s = container_of(hash, sha256_sw, hash);

    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
    for (size_t i = 0; i < 8; i++)
        PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]);
}

const ssh_hashalg ssh_sha256_sw = {
    .new = sha256_sw_new,
    .reset = sha256_sw_reset,
    .copyfrom = sha256_sw_copyfrom,
    .digest = sha256_sw_digest,
    .free = sha256_sw_free,
    .hlen = 32,
    .blocklen = 64,
    HASHALG_NAMES_ANNOTATED("SHA-256", "unaccelerated"),
};

/* ----------------------------------------------------------------------
 * Hardware-accelerated implementation of SHA-256 using x86 SHA-NI.
 */

#if HW_SHA256 == HW_SHA256_NI

/*
 * Set target architecture for Clang and GCC
 */
#if defined(__clang__) || defined(__GNUC__)
#    define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
#if !defined(__clang__)
#    pragma GCC target("sha")
#    pragma GCC target("sse4.1")
#endif
#else
#    define FUNC_ISA
#endif

#include <wmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
#if defined(__clang__) || defined(__GNUC__)
#include <shaintrin.h>
#endif

#if defined(__clang__) || defined(__GNUC__)
#include <cpuid.h>
#define GET_CPU_ID_0(out)                               \
    __cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3])
#define GET_CPU_ID_7(out)                                       \
    __cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3])
#else
#define GET_CPU_ID_0(out) __cpuid(out, 0)
#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0)
#endif

static bool sha256_hw_available(void)
{
    unsigned int CPUInfo[4];
    GET_CPU_ID_0(CPUInfo);
    if (CPUInfo[0] < 7)
        return false;

    GET_CPU_ID_7(CPUInfo);
    return CPUInfo[1] & (1 << 29); /* SHA extensions: leaf 7, EBX bit 29 */
}

/* SHA-256 implementation using the x86 SHA instruction set extensions.
   The code is based on Jeffrey Walton's SHA256 implementation:
   https://github.com/noloader/SHA-Intrinsics
*/
FUNC_ISA
static inline void sha256_ni_block(__m128i *core, const uint8_t *p)
{
    __m128i STATE0, STATE1;
    __m128i MSG, TMP;
    __m128i MSG0, MSG1, MSG2, MSG3;
    const __m128i *block = (const __m128i *)p;
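    /* Shuffle control that byte-swaps each 32-bit lane, converting
     * the message's big-endian words into the CPU's little-endian
     * order as they are loaded. */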
    const __m128i MASK = _mm_set_epi64x(
        0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);

    /* Load initial values */
    STATE0 = core[0];
    STATE1 = core[1];

    /* Rounds 0-3 */
    MSG = _mm_loadu_si128(block);
    MSG0 = _mm_shuffle_epi8(MSG, MASK);
    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
                            0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    /* Rounds 4-7 */
    MSG1 = _mm_loadu_si128(block + 1);
    MSG1 = _mm_shuffle_epi8(MSG1, MASK);
    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
                            0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

    /* Rounds 8-11 */
    MSG2 = _mm_loadu_si128(block + 2);
    MSG2 = _mm_shuffle_epi8(MSG2, MASK);
    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
                            0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

    /* Rounds 12-15 */
    MSG3 = _mm_loadu_si128(block + 3);
    MSG3 = _mm_shuffle_epi8(MSG3, MASK);
    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
                            0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
    MSG0 = _mm_add_epi32(MSG0, TMP);
    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

    /* Rounds 16-19 */
    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
                            0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
    MSG1 = _mm_add_epi32(MSG1, TMP);
    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

    /* Rounds 20-23 */
    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
                            0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
    MSG2 = _mm_add_epi32(MSG2, TMP);
    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

    /* Rounds 24-27 */
    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
                            0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
    MSG3 = _mm_add_epi32(MSG3, TMP);
    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

    /* Rounds 28-31 */
    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
                            0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
    MSG0 = _mm_add_epi32(MSG0, TMP);
    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

    /* Rounds 32-35 */
    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
                            0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
    MSG1 = _mm_add_epi32(MSG1, TMP);
    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

    /* Rounds 36-39 */
    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
                            0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
    MSG2 = _mm_add_epi32(MSG2, TMP);
    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

    /* Rounds 40-43 */
    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
                            0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
    MSG3 = _mm_add_epi32(MSG3, TMP);
    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

    /* Rounds 44-47 */
    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
                            0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
    MSG0 = _mm_add_epi32(MSG0, TMP);
    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

    /* Rounds 48-51 */
    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
                            0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
    MSG1 = _mm_add_epi32(MSG1, TMP);
    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

    /* Rounds 52-55 */
    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
                            0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
    MSG2 = _mm_add_epi32(MSG2, TMP);
    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    /* Rounds 56-59 */
    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
                            0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
    MSG3 = _mm_add_epi32(MSG3, TMP);
    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    /* Rounds 60-63 */
    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
                            0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    /* Combine state */
    core[0] = _mm_add_epi32(STATE0, core[0]);
    core[1] = _mm_add_epi32(STATE1, core[1]);
}

typedef struct sha256_ni {
    /*
     * These two vectors store the 8 words of the SHA-256 state, but
     * not in the same order they appear in the spec: the first
     * vector holds A,B,E,F and the second C,D,G,H.
     */
    __m128i core[2];
    sha256_block blk;
    void *pointer_to_free;
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;
} sha256_ni;

static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len);

static sha256_ni *sha256_ni_alloc(void)
{
    /*
     * The __m128i variables in the context structure need to be
     * 16-byte aligned, but not all malloc implementations that this
     * code has to work with will guarantee to return a 16-byte
     * aligned pointer. So we over-allocate, manually realign the
     * pointer ourselves, and store the original one inside the
     * context so we know how to free it later.
     */
    void *allocation = smalloc(sizeof(sha256_ni) + 15);
    uintptr_t alloc_address = (uintptr_t)allocation;
    uintptr_t aligned_address = (alloc_address + 15) & ~15;
    sha256_ni *s = (sha256_ni *)aligned_address;
    s->pointer_to_free = allocation;
    return s;
}

static ssh_hash *sha256_ni_new(const ssh_hashalg *alg)
{
    if (!sha256_hw_available_cached())
        return NULL;

    sha256_ni *s = sha256_ni_alloc();

    s->hash.vt = alg;
    BinarySink_INIT(s, sha256_ni_write);
    BinarySink_DELEGATE_INIT(&s->hash, s);

    return &s->hash;
}

FUNC_ISA static void sha256_ni_reset(ssh_hash *hash)
{
    sha256_ni *s = container_of(hash, sha256_ni, hash);

    /* Initialise the core vectors in their storage order */
    s->core[0] = _mm_set_epi64x(
        0x6a09e667bb67ae85ULL, 0x510e527f9b05688cULL);
    s->core[1] = _mm_set_epi64x(
        0x3c6ef372a54ff53aULL, 0x1f83d9ab5be0cd19ULL);

    sha256_block_setup(&s->blk);
}

static void sha256_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    sha256_ni *copy = container_of(hcopy, sha256_ni, hash);
    sha256_ni *orig = container_of(horig, sha256_ni, hash);

    void *ptf_save = copy->pointer_to_free;
    *copy = *orig; /* structure copy */
    copy->pointer_to_free = ptf_save;

    BinarySink_COPIED(copy);
    BinarySink_DELEGATE_INIT(&copy->hash, copy);
}

static void sha256_ni_free(ssh_hash *hash)
{
    sha256_ni *s = container_of(hash, sha256_ni, hash);

    void *ptf = s->pointer_to_free;
    smemclr(s, sizeof(*s));
    sfree(ptf);
}

static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len)
{
    sha256_ni *s = BinarySink_DOWNCAST(bs, sha256_ni);

    while (len > 0)
        if (sha256_block_write(&s->blk, &vp, &len))
            sha256_ni_block(s->core, s->blk.block);
}

FUNC_ISA static void sha256_ni_digest(ssh_hash *hash, uint8_t *digest)
{
    sha256_ni *s = container_of(hash, sha256_ni, hash);

    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));

    /* Rearrange the words into the output order */
    __m128i feba = _mm_shuffle_epi32(s->core[0], 0x1B);
    __m128i dchg = _mm_shuffle_epi32(s->core[1], 0xB1);
    __m128i dcba = _mm_blend_epi16(feba, dchg, 0xF0);
    __m128i hgfe = _mm_alignr_epi8(dchg, feba, 8);

    /* Byte-swap them into the output endianness */
    const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
    dcba = _mm_shuffle_epi8(dcba, mask);
    hgfe = _mm_shuffle_epi8(hgfe, mask);

    /* And store them */
    __m128i *output = (__m128i *)digest;
    _mm_storeu_si128(output, dcba);
    _mm_storeu_si128(output+1, hgfe);
}

const ssh_hashalg ssh_sha256_hw = {
    .new = sha256_ni_new,
    .reset = sha256_ni_reset,
    .copyfrom = sha256_ni_copyfrom,
    .digest = sha256_ni_digest,
    .free = sha256_ni_free,
    .hlen = 32,
    .blocklen = 64,
    HASHALG_NAMES_ANNOTATED("SHA-256", "SHA-NI accelerated"),
};

/* ----------------------------------------------------------------------
 * Hardware-accelerated implementation of SHA-256 using Arm NEON.
 */

#elif HW_SHA256 == HW_SHA256_NEON

/*
 * Manually set the target architecture, if we decided above that we
 * need to.
 */
#ifdef USE_CLANG_ATTR_TARGET_AARCH64
/*
 * A spot of cheating: redefine some ACLE feature macros before
 * including arm_neon.h. Otherwise we won't get the SHA intrinsics
 * defined by that header, because it will be looking at the settings
 * for the whole translation unit rather than the ones we're going to
 * put on some particular functions using __attribute__((target)).
 */
#define __ARM_NEON 1
#define __ARM_FEATURE_CRYPTO 1
#define __ARM_FEATURE_SHA2 1
#define FUNC_ISA __attribute__ ((target("neon,crypto")))
#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */

#ifndef FUNC_ISA
#define FUNC_ISA
#endif

#ifdef USE_ARM64_NEON_H
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif

static bool sha256_hw_available(void)
{
    /*
     * For Arm, we delegate to a per-platform detection function (see
     * explanation in sshaes.c).
     */
    return platform_sha256_hw_available();
}

typedef struct sha256_neon_core sha256_neon_core;
struct sha256_neon_core {
    uint32x4_t abcd, efgh;
};

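/* Load 16 bytes of message and byte-swap each 32-bit lane, so the
 * big-endian input words arrive in host order. */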
FUNC_ISA
static inline uint32x4_t sha256_neon_load_input(const uint8_t *p)
{
    return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p)));
}

FUNC_ISA
static inline uint32x4_t sha256_neon_schedule_update(
    uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1)
{
    return vsha256su1q_u32(vsha256su0q_u32(m4, m3), m2, m1);
}

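/* Perform four rounds of the compression function: vsha256hq_u32 and
 * vsha256h2q_u32 each consume the same four (schedule + constant)
 * words and produce the updated ABCD and EFGH halves respectively. */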
FUNC_ISA
static inline sha256_neon_core sha256_neon_round4(
    sha256_neon_core old, uint32x4_t sched, unsigned round)
{
    sha256_neon_core new;

    uint32x4_t round_input = vaddq_u32(
        sched, vld1q_u32(sha256_round_constants + round));
    new.abcd = vsha256hq_u32 (old.abcd, old.efgh, round_input);
    new.efgh = vsha256h2q_u32(old.efgh, old.abcd, round_input);
    return new;
}

FUNC_ISA
static inline void sha256_neon_block(sha256_neon_core *core, const uint8_t *p)
{
    uint32x4_t s0, s1, s2, s3;
    sha256_neon_core cr = *core;

    s0 = sha256_neon_load_input(p);
    cr = sha256_neon_round4(cr, s0, 0);
    s1 = sha256_neon_load_input(p+16);
    cr = sha256_neon_round4(cr, s1, 4);
    s2 = sha256_neon_load_input(p+32);
    cr = sha256_neon_round4(cr, s2, 8);
    s3 = sha256_neon_load_input(p+48);
    cr = sha256_neon_round4(cr, s3, 12);
    s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
    cr = sha256_neon_round4(cr, s0, 16);
    s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
    cr = sha256_neon_round4(cr, s1, 20);
    s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
    cr = sha256_neon_round4(cr, s2, 24);
    s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
    cr = sha256_neon_round4(cr, s3, 28);
    s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
    cr = sha256_neon_round4(cr, s0, 32);
    s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
    cr = sha256_neon_round4(cr, s1, 36);
    s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
    cr = sha256_neon_round4(cr, s2, 40);
    s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
    cr = sha256_neon_round4(cr, s3, 44);
    s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
    cr = sha256_neon_round4(cr, s0, 48);
    s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
    cr = sha256_neon_round4(cr, s1, 52);
    s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
    cr = sha256_neon_round4(cr, s2, 56);
    s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
    cr = sha256_neon_round4(cr, s3, 60);

    core->abcd = vaddq_u32(core->abcd, cr.abcd);
    core->efgh = vaddq_u32(core->efgh, cr.efgh);
}

typedef struct sha256_neon {
    sha256_neon_core core;
    sha256_block blk;
    BinarySink_IMPLEMENTATION;
    ssh_hash hash;
} sha256_neon;

static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len);

static ssh_hash *sha256_neon_new(const ssh_hashalg *alg)
{
    if (!sha256_hw_available_cached())
        return NULL;

    sha256_neon *s = snew(sha256_neon);

    s->hash.vt = alg;
    BinarySink_INIT(s, sha256_neon_write);
    BinarySink_DELEGATE_INIT(&s->hash, s);
    return &s->hash;
}

static void sha256_neon_reset(ssh_hash *hash)
{
    sha256_neon *s = container_of(hash, sha256_neon, hash);

    s->core.abcd = vld1q_u32(sha256_initial_state);
    s->core.efgh = vld1q_u32(sha256_initial_state + 4);

    sha256_block_setup(&s->blk);
}

static void sha256_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
{
    sha256_neon *copy = container_of(hcopy, sha256_neon, hash);
    sha256_neon *orig = container_of(horig, sha256_neon, hash);

    *copy = *orig; /* structure copy */

    BinarySink_COPIED(copy);
    BinarySink_DELEGATE_INIT(&copy->hash, copy);
}

static void sha256_neon_free(ssh_hash *hash)
{
    sha256_neon *s = container_of(hash, sha256_neon, hash);
    smemclr(s, sizeof(*s));
    sfree(s);
}

static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len)
{
    sha256_neon *s = BinarySink_DOWNCAST(bs, sha256_neon);

    while (len > 0)
        if (sha256_block_write(&s->blk, &vp, &len))
            sha256_neon_block(&s->core, s->blk.block);
}

static void sha256_neon_digest(ssh_hash *hash, uint8_t *digest)
{
    sha256_neon *s = container_of(hash, sha256_neon, hash);

    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
    vst1q_u8(digest,      vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd)));
    vst1q_u8(digest + 16, vrev32q_u8(vreinterpretq_u8_u32(s->core.efgh)));
}

const ssh_hashalg ssh_sha256_hw = {
    .new = sha256_neon_new,
    .reset = sha256_neon_reset,
    .copyfrom = sha256_neon_copyfrom,
    .digest = sha256_neon_digest,
    .free = sha256_neon_free,
    .hlen = 32,
    .blocklen = 64,
    HASHALG_NAMES_ANNOTATED("SHA-256", "NEON accelerated"),
};

/* ----------------------------------------------------------------------
 * Stub functions if we have no hardware-accelerated SHA-256. In this
 * case, sha256_stub_new returns NULL (though it should also never be
 * selected by sha256_select, so the only thing that should even be
 * _able_ to call it is testcrypt). As a result, the remaining vtable
 * functions should never be called at all.
 */

#elif HW_SHA256 == HW_SHA256_NONE

static bool sha256_hw_available(void)
{
    return false;
}

static ssh_hash *sha256_stub_new(const ssh_hashalg *alg)
{
    return NULL;
}

#define STUB_BODY { unreachable("Should never be called"); }

static void sha256_stub_reset(ssh_hash *hash) STUB_BODY
static void sha256_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY
static void sha256_stub_free(ssh_hash *hash) STUB_BODY
static void sha256_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY

const ssh_hashalg ssh_sha256_hw = {
    .new = sha256_stub_new,
    .reset = sha256_stub_reset,
    .copyfrom = sha256_stub_copyfrom,
    .digest = sha256_stub_digest,
    .free = sha256_stub_free,
    .hlen = 32,
    .blocklen = 64,
    HASHALG_NAMES_ANNOTATED("SHA-256", "!NONEXISTENT ACCELERATED VERSION!"),
};

#endif /* HW_SHA256 */