1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5 #ifdef USE_HW_SHA2
6
7 #include <immintrin.h>
8
9 #ifdef FREEBL_NO_DEPEND
10 #include "stubs.h"
11 #endif
12
13 #include "blapii.h"
14 #include "prcpucfg.h"
15 #include "prtypes.h" /* for PRUintXX */
16 #include "prlong.h"
17 #include "blapi.h"
18 #include "sha256.h"
19
20 /* SHA-256 constants, K256. */
21 pre_align static const PRUint32 K256[64] post_align = {
22 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
23 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
24 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
25 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
26 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
27 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
28 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
29 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
30 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
31 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
32 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
33 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
34 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
35 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
36 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
37 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
38 };
39
/*
 * ROUND(n, a, b, c, d): perform SHA-256 rounds 4n..4n+3 with SHA-NI.
 *
 * `a` holds message-schedule words W[4n..4n+3]; k##n (a local __m128i at
 * every call site) holds K[4n..4n+3].  Each sha256rnds2 executes two
 * rounds, consuming first the low then (after the 0x0e shuffle) the high
 * 64 bits of W+K, alternating which of the captured state registers
 * w0/w1 is updated.  For n < 12 the macro also advances the message
 * schedule: sha256msg1/sha256msg2 plus the W[t-7] terms (the alignr over
 * c:d) compute W[4n+16..4n+19] into `a` for use four ROUNDs later.
 *
 * Deliberately brace-form (not do/while(0)): call sites invoke it with
 * no trailing semicolon.
 */
#define ROUND(n, a, b, c, d) \
    { \
        __m128i t = _mm_add_epi32(a, k##n); \
        w1 = _mm_sha256rnds2_epu32(w1, w0, t); \
        t = _mm_shuffle_epi32(t, 0x0e); \
        w0 = _mm_sha256rnds2_epu32(w0, w1, t); \
        if (n < 12) { \
            a = _mm_sha256msg1_epu32(a, b); \
            a = _mm_add_epi32(a, _mm_alignr_epi8(d, c, 4)); \
            a = _mm_sha256msg2_epu32(a, d); \
        } \
    }
52
/*
 * SHA256_Compress_Native: run the SHA-256 compression function once over
 * the 64-byte block buffered in ctx->u.b, using the Intel SHA extensions
 * (sha256rnds2/sha256msg1/sha256msg2), and add the result into ctx->h.
 */
void
SHA256_Compress_Native(SHA256Context *ctx)
{
    __m128i h0, h1, th;
    __m128i a, b, c, d; /* message schedule, four 32-bit words each */
    __m128i w0, w1;     /* working state, in sha256rnds2 lane order */
    /* pshufb mask: byte-swap each 32-bit word (message words are big-endian) */
    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);

    /* Round constants; K256 is pre_align'ed, so aligned loads are valid. */
    const __m128i *K = (__m128i *)K256;
    const __m128i k0 = _mm_load_si128(K);
    const __m128i k1 = _mm_load_si128(K + 1);
    const __m128i k2 = _mm_load_si128(K + 2);
    const __m128i k3 = _mm_load_si128(K + 3);
    const __m128i k4 = _mm_load_si128(K + 4);
    const __m128i k5 = _mm_load_si128(K + 5);
    const __m128i k6 = _mm_load_si128(K + 6);
    const __m128i k7 = _mm_load_si128(K + 7);
    const __m128i k8 = _mm_load_si128(K + 8);
    const __m128i k9 = _mm_load_si128(K + 9);
    const __m128i k10 = _mm_load_si128(K + 10);
    const __m128i k11 = _mm_load_si128(K + 11);
    const __m128i k12 = _mm_load_si128(K + 12);
    const __m128i k13 = _mm_load_si128(K + 13);
    const __m128i k14 = _mm_load_si128(K + 14);
    const __m128i k15 = _mm_load_si128(K + 15);

    const __m128i *input = (__m128i *)ctx->u.b;

    h0 = _mm_loadu_si128((__m128i *)(ctx->h));
    h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));

    /* Repack {A,B,C,D}:{E,F,G,H} -> {A,B,E,F}:{C,D,G,H}, the two-register
     * lane order that sha256rnds2 operates on. */
    th = _mm_shuffle_epi32(h0, 0xb1);
    h1 = _mm_shuffle_epi32(h1, 0x1b);
    h0 = _mm_alignr_epi8(th, h1, 8);
    h1 = _mm_blend_epi16(h1, th, 0xf0);

    /* Load the 64-byte block and byte-swap into host word order. */
    a = _mm_shuffle_epi8(_mm_loadu_si128(input), shuffle);
    b = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), shuffle);
    c = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), shuffle);
    d = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), shuffle);

    w0 = h0;
    w1 = h1;

    /* 64 rounds, 4 per ROUND invocation (no trailing ';' by design). */
    ROUND(0, a, b, c, d)
    ROUND(1, b, c, d, a)
    ROUND(2, c, d, a, b)
    ROUND(3, d, a, b, c)
    ROUND(4, a, b, c, d)
    ROUND(5, b, c, d, a)
    ROUND(6, c, d, a, b)
    ROUND(7, d, a, b, c)
    ROUND(8, a, b, c, d)
    ROUND(9, b, c, d, a)
    ROUND(10, c, d, a, b)
    ROUND(11, d, a, b, c)
    ROUND(12, a, b, c, d)
    ROUND(13, b, c, d, a)
    ROUND(14, c, d, a, b)
    ROUND(15, d, a, b, c)

    /* Add the compressed block into the incoming chaining value. */
    h0 = _mm_add_epi32(h0, w0);
    h1 = _mm_add_epi32(h1, w1);

    /* Repack {A,B,E,F}:{C,D,G,H} back to {A,B,C,D}:{E,F,G,H}. */
    th = _mm_shuffle_epi32(h0, 0x1b);
    h1 = _mm_shuffle_epi32(h1, 0xb1);
    h0 = _mm_blend_epi16(th, h1, 0xf0);
    h1 = _mm_alignr_epi8(h1, th, 8);

    _mm_storeu_si128((__m128i *)ctx->h, h0);
    _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
}
127
/*
 * SHA256_Update_Native: absorb inputLen bytes of `input` into the hash
 * state using the Intel SHA extensions.  First tops up any partial block
 * buffered in ctx->u.b, then processes whole 64-byte blocks straight from
 * `input` — keeping the repacked state in registers across blocks rather
 * than round-tripping through ctx->h — and finally buffers any trailing
 * partial block for the next call.
 */
void
SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input,
                     unsigned int inputLen)
{
    __m128i h0, h1, th;
    /* pshufb mask: byte-swap each 32-bit word (message words are big-endian) */
    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);

    /* Round constants; K256 is pre_align'ed, so aligned loads are valid. */
    const __m128i *K = (__m128i *)K256;
    const __m128i k0 = _mm_load_si128(K);
    const __m128i k1 = _mm_load_si128(K + 1);
    const __m128i k2 = _mm_load_si128(K + 2);
    const __m128i k3 = _mm_load_si128(K + 3);
    const __m128i k4 = _mm_load_si128(K + 4);
    const __m128i k5 = _mm_load_si128(K + 5);
    const __m128i k6 = _mm_load_si128(K + 6);
    const __m128i k7 = _mm_load_si128(K + 7);
    const __m128i k8 = _mm_load_si128(K + 8);
    const __m128i k9 = _mm_load_si128(K + 9);
    const __m128i k10 = _mm_load_si128(K + 10);
    const __m128i k11 = _mm_load_si128(K + 11);
    const __m128i k12 = _mm_load_si128(K + 12);
    const __m128i k13 = _mm_load_si128(K + 13);
    const __m128i k14 = _mm_load_si128(K + 14);
    const __m128i k15 = _mm_load_si128(K + 15);

    unsigned int inBuf = ctx->sizeLo & 0x3f; /* bytes already buffered (mod 64) */
    if (!inputLen) {
        return;
    }

    /* Add inputLen into the count of bytes processed, before processing */
    if ((ctx->sizeLo += inputLen) < inputLen) {
        ctx->sizeHi++; /* carry into the high word of the 64-bit byte count */
    }

    /* if data already in buffer, attempt to fill rest of buffer */
    if (inBuf) {
        unsigned int todo = SHA256_BLOCK_LENGTH - inBuf;
        if (inputLen < todo) {
            todo = inputLen;
        }
        memcpy(ctx->u.b + inBuf, input, todo);
        input += todo;
        inputLen -= todo;
        /* Compress only if the buffer was filled completely. */
        if (inBuf + todo == SHA256_BLOCK_LENGTH) {
            SHA256_Compress_Native(ctx);
        }
    }

    h0 = _mm_loadu_si128((__m128i *)(ctx->h));
    h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));

    /* Repack {A,B,C,D}:{E,F,G,H} -> {A,B,E,F}:{C,D,G,H}, the two-register
     * lane order that sha256rnds2 operates on. */
    th = _mm_shuffle_epi32(h0, 0xb1);
    h1 = _mm_shuffle_epi32(h1, 0x1b);
    h0 = _mm_alignr_epi8(th, h1, 8);
    h1 = _mm_blend_epi16(h1, th, 0xf0);

    /* if enough data to fill one or more whole buffers, process them. */
    while (inputLen >= SHA256_BLOCK_LENGTH) {
        __m128i a, b, c, d; /* message schedule, four 32-bit words each */
        __m128i w0, w1;     /* working state, in sha256rnds2 lane order */
        /* Load the next 64-byte block and byte-swap into host word order. */
        a = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)input), shuffle);
        b = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 16)), shuffle);
        c = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 32)), shuffle);
        d = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 48)), shuffle);
        input += SHA256_BLOCK_LENGTH;
        inputLen -= SHA256_BLOCK_LENGTH;

        w0 = h0;
        w1 = h1;

        /* 64 rounds, 4 per ROUND invocation (no trailing ';' by design). */
        ROUND(0, a, b, c, d)
        ROUND(1, b, c, d, a)
        ROUND(2, c, d, a, b)
        ROUND(3, d, a, b, c)
        ROUND(4, a, b, c, d)
        ROUND(5, b, c, d, a)
        ROUND(6, c, d, a, b)
        ROUND(7, d, a, b, c)
        ROUND(8, a, b, c, d)
        ROUND(9, b, c, d, a)
        ROUND(10, c, d, a, b)
        ROUND(11, d, a, b, c)
        ROUND(12, a, b, c, d)
        ROUND(13, b, c, d, a)
        ROUND(14, c, d, a, b)
        ROUND(15, d, a, b, c)

        /* Add the compressed block into the chaining value. */
        h0 = _mm_add_epi32(h0, w0);
        h1 = _mm_add_epi32(h1, w1);
    }

    /* Repack {A,B,E,F}:{C,D,G,H} back to {A,B,C,D}:{E,F,G,H}. */
    th = _mm_shuffle_epi32(h0, 0x1b);
    h1 = _mm_shuffle_epi32(h1, 0xb1);
    h0 = _mm_blend_epi16(th, h1, 0xf0);
    h1 = _mm_alignr_epi8(h1, th, 8);

    _mm_storeu_si128((__m128i *)ctx->h, h0);
    _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);

    /* if data left over, fill it into buffer */
    if (inputLen) {
        memcpy(ctx->u.b, input, inputLen);
    }
}
235
236 #endif /* USE_HW_SHA2 */
237