1 /* This Source Code Form is subject to the terms of the Mozilla Public
2  * License, v. 2.0. If a copy of the MPL was not distributed with this
3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4 
5 #ifdef USE_HW_SHA2
6 
7 #include <immintrin.h>
8 
9 #ifdef FREEBL_NO_DEPEND
10 #include "stubs.h"
11 #endif
12 
13 #include "blapii.h"
14 #include "prcpucfg.h"
15 #include "prtypes.h" /* for PRUintXX */
16 #include "prlong.h"
17 #include "blapi.h"
18 #include "sha256.h"
19 
20 /* SHA-256 constants, K256. */
21 pre_align static const PRUint32 K256[64] post_align = {
22     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
23     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
24     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
25     0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
26     0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
27     0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
28     0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
29     0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
30     0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
31     0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
32     0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
33     0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
34     0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
35     0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
36     0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
37     0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
38 };
39 
/*
 * ROUND(n, a, b, c, d): process one group of four message words.
 *
 *   n          literal group index 0..15; pasted onto "k" to select the
 *              round-constant vector k<n> from the caller's scope.
 *   a          the four message words consumed by this group; for n < 12 it
 *              is overwritten with the schedule words needed four groups on.
 *   b, c, d    the following three message-word vectors (read only).
 *
 * The caller must have w0/w1 (working state) and k0..k15 in scope.  Each
 * _mm_sha256rnds2_epu32 uses the low two lanes of its third operand, so the
 * upper half of the word+constant sum is moved down (shuffle 0x0e) before the
 * second call.  The expansion is a bare brace block because call sites are
 * written without a trailing semicolon.
 */
#define ROUND(n, a, b, c, d)                                             \
    {                                                                    \
        __m128i wk = _mm_add_epi32((a), k##n);                           \
        w1 = _mm_sha256rnds2_epu32(w1, w0, wk);                          \
        w0 = _mm_sha256rnds2_epu32(w0, w1, _mm_shuffle_epi32(wk, 0x0e)); \
        if ((n) < 12) {                                                  \
            a = _mm_sha256msg2_epu32(                                    \
                _mm_add_epi32(_mm_sha256msg1_epu32((a), (b)),            \
                              _mm_alignr_epi8((d), (c), 4)),             \
                (d));                                                    \
        }                                                                \
    }
52 
53 void
SHA256_Compress_Native(SHA256Context * ctx)54 SHA256_Compress_Native(SHA256Context *ctx)
55 {
56     __m128i h0, h1, th;
57     __m128i a, b, c, d;
58     __m128i w0, w1;
59     const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
60 
61     const __m128i *K = (__m128i *)K256;
62     const __m128i k0 = _mm_load_si128(K);
63     const __m128i k1 = _mm_load_si128(K + 1);
64     const __m128i k2 = _mm_load_si128(K + 2);
65     const __m128i k3 = _mm_load_si128(K + 3);
66     const __m128i k4 = _mm_load_si128(K + 4);
67     const __m128i k5 = _mm_load_si128(K + 5);
68     const __m128i k6 = _mm_load_si128(K + 6);
69     const __m128i k7 = _mm_load_si128(K + 7);
70     const __m128i k8 = _mm_load_si128(K + 8);
71     const __m128i k9 = _mm_load_si128(K + 9);
72     const __m128i k10 = _mm_load_si128(K + 10);
73     const __m128i k11 = _mm_load_si128(K + 11);
74     const __m128i k12 = _mm_load_si128(K + 12);
75     const __m128i k13 = _mm_load_si128(K + 13);
76     const __m128i k14 = _mm_load_si128(K + 14);
77     const __m128i k15 = _mm_load_si128(K + 15);
78 
79     const __m128i *input = (__m128i *)ctx->u.b;
80 
81     h0 = _mm_loadu_si128((__m128i *)(ctx->h));
82     h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));
83 
84     /* H0123:4567 -> H01256:H2367 */
85     th = _mm_shuffle_epi32(h0, 0xb1);
86     h1 = _mm_shuffle_epi32(h1, 0x1b);
87     h0 = _mm_alignr_epi8(th, h1, 8);
88     h1 = _mm_blend_epi16(h1, th, 0xf0);
89 
90     a = _mm_shuffle_epi8(_mm_loadu_si128(input), shuffle);
91     b = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), shuffle);
92     c = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), shuffle);
93     d = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), shuffle);
94 
95     w0 = h0;
96     w1 = h1;
97 
98     ROUND(0, a, b, c, d)
99     ROUND(1, b, c, d, a)
100     ROUND(2, c, d, a, b)
101     ROUND(3, d, a, b, c)
102     ROUND(4, a, b, c, d)
103     ROUND(5, b, c, d, a)
104     ROUND(6, c, d, a, b)
105     ROUND(7, d, a, b, c)
106     ROUND(8, a, b, c, d)
107     ROUND(9, b, c, d, a)
108     ROUND(10, c, d, a, b)
109     ROUND(11, d, a, b, c)
110     ROUND(12, a, b, c, d)
111     ROUND(13, b, c, d, a)
112     ROUND(14, c, d, a, b)
113     ROUND(15, d, a, b, c)
114 
115     h0 = _mm_add_epi32(h0, w0);
116     h1 = _mm_add_epi32(h1, w1);
117 
118     /* H0145:2367 -> H0123:4567 */
119     th = _mm_shuffle_epi32(h0, 0x1b);
120     h1 = _mm_shuffle_epi32(h1, 0xb1);
121     h0 = _mm_blend_epi16(th, h1, 0xf0);
122     h1 = _mm_alignr_epi8(h1, th, 8);
123 
124     _mm_storeu_si128((__m128i *)ctx->h, h0);
125     _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
126 }
127 
128 void
SHA256_Update_Native(SHA256Context * ctx,const unsigned char * input,unsigned int inputLen)129 SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input,
130                      unsigned int inputLen)
131 {
132     __m128i h0, h1, th;
133     const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
134 
135     const __m128i *K = (__m128i *)K256;
136     const __m128i k0 = _mm_load_si128(K);
137     const __m128i k1 = _mm_load_si128(K + 1);
138     const __m128i k2 = _mm_load_si128(K + 2);
139     const __m128i k3 = _mm_load_si128(K + 3);
140     const __m128i k4 = _mm_load_si128(K + 4);
141     const __m128i k5 = _mm_load_si128(K + 5);
142     const __m128i k6 = _mm_load_si128(K + 6);
143     const __m128i k7 = _mm_load_si128(K + 7);
144     const __m128i k8 = _mm_load_si128(K + 8);
145     const __m128i k9 = _mm_load_si128(K + 9);
146     const __m128i k10 = _mm_load_si128(K + 10);
147     const __m128i k11 = _mm_load_si128(K + 11);
148     const __m128i k12 = _mm_load_si128(K + 12);
149     const __m128i k13 = _mm_load_si128(K + 13);
150     const __m128i k14 = _mm_load_si128(K + 14);
151     const __m128i k15 = _mm_load_si128(K + 15);
152 
153     unsigned int inBuf = ctx->sizeLo & 0x3f;
154     if (!inputLen) {
155         return;
156     }
157 
158     /* Add inputLen into the count of bytes processed, before processing */
159     if ((ctx->sizeLo += inputLen) < inputLen) {
160         ctx->sizeHi++;
161     }
162 
163     /* if data already in buffer, attempt to fill rest of buffer */
164     if (inBuf) {
165         unsigned int todo = SHA256_BLOCK_LENGTH - inBuf;
166         if (inputLen < todo) {
167             todo = inputLen;
168         }
169         memcpy(ctx->u.b + inBuf, input, todo);
170         input += todo;
171         inputLen -= todo;
172         if (inBuf + todo == SHA256_BLOCK_LENGTH) {
173             SHA256_Compress_Native(ctx);
174         }
175     }
176 
177     h0 = _mm_loadu_si128((__m128i *)(ctx->h));
178     h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));
179 
180     /* H0123:4567 -> H01256:H2367 */
181     th = _mm_shuffle_epi32(h0, 0xb1);
182     h1 = _mm_shuffle_epi32(h1, 0x1b);
183     h0 = _mm_alignr_epi8(th, h1, 8);
184     h1 = _mm_blend_epi16(h1, th, 0xf0);
185 
186     /* if enough data to fill one or more whole buffers, process them. */
187     while (inputLen >= SHA256_BLOCK_LENGTH) {
188         __m128i a, b, c, d;
189         __m128i w0, w1;
190         a = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)input), shuffle);
191         b = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 16)), shuffle);
192         c = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 32)), shuffle);
193         d = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 48)), shuffle);
194         input += SHA256_BLOCK_LENGTH;
195         inputLen -= SHA256_BLOCK_LENGTH;
196 
197         w0 = h0;
198         w1 = h1;
199 
200         ROUND(0, a, b, c, d)
201         ROUND(1, b, c, d, a)
202         ROUND(2, c, d, a, b)
203         ROUND(3, d, a, b, c)
204         ROUND(4, a, b, c, d)
205         ROUND(5, b, c, d, a)
206         ROUND(6, c, d, a, b)
207         ROUND(7, d, a, b, c)
208         ROUND(8, a, b, c, d)
209         ROUND(9, b, c, d, a)
210         ROUND(10, c, d, a, b)
211         ROUND(11, d, a, b, c)
212         ROUND(12, a, b, c, d)
213         ROUND(13, b, c, d, a)
214         ROUND(14, c, d, a, b)
215         ROUND(15, d, a, b, c)
216 
217         h0 = _mm_add_epi32(h0, w0);
218         h1 = _mm_add_epi32(h1, w1);
219     }
220 
221     // H01234567 -> H01256 and H2367
222     th = _mm_shuffle_epi32(h0, 0x1b);
223     h1 = _mm_shuffle_epi32(h1, 0xb1);
224     h0 = _mm_blend_epi16(th, h1, 0xf0);
225     h1 = _mm_alignr_epi8(h1, th, 8);
226 
227     _mm_storeu_si128((__m128i *)ctx->h, h0);
228     _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
229 
230     /* if data left over, fill it into buffer */
231     if (inputLen) {
232         memcpy(ctx->u.b, input, inputLen);
233     }
234 }
235 
236 #endif /* USE_HW_SHA2 */
237