1 /* $Id: skein.c 254 2011-06-07 19:38:58Z tp $ */
2 /*
3  * Skein implementation.
4  *
5  * ==========================(LICENSE BEGIN)============================
6  *
7  * Copyright (c) 2007-2010  Projet RNRT SAPHIR
8  *
9  * Permission is hereby granted, free of charge, to any person obtaining
10  * a copy of this software and associated documentation files (the
11  * "Software"), to deal in the Software without restriction, including
12  * without limitation the rights to use, copy, modify, merge, publish,
13  * distribute, sublicense, and/or sell copies of the Software, and to
14  * permit persons to whom the Software is furnished to do so, subject to
15  * the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be
18  * included in all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27  *
28  * ===========================(LICENSE END)=============================
29  *
30  * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
31  */
32 
33 #include <stddef.h>
34 #include <string.h>
35 
36 #include "sph_skein.h"
37 
38 
39 #if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SKEIN
40 #define SPH_SMALL_FOOTPRINT_SKEIN   1
41 #endif
42 
43 #ifdef _MSC_VER
44 #pragma warning (disable: 4146)
45 #endif
46 
47 #if SPH_64
48 
49 #if 0
50 /* obsolete */
51 /*
52  * M5_ ## s ## _ ## i  evaluates to s+i mod 5 (0 <= s <= 18, 0 <= i <= 3).
53  */
54 
55 #define M5_0_0    0
56 #define M5_0_1    1
57 #define M5_0_2    2
58 #define M5_0_3    3
59 
60 #define M5_1_0    1
61 #define M5_1_1    2
62 #define M5_1_2    3
63 #define M5_1_3    4
64 
65 #define M5_2_0    2
66 #define M5_2_1    3
67 #define M5_2_2    4
68 #define M5_2_3    0
69 
70 #define M5_3_0    3
71 #define M5_3_1    4
72 #define M5_3_2    0
73 #define M5_3_3    1
74 
75 #define M5_4_0    4
76 #define M5_4_1    0
77 #define M5_4_2    1
78 #define M5_4_3    2
79 
80 #define M5_5_0    0
81 #define M5_5_1    1
82 #define M5_5_2    2
83 #define M5_5_3    3
84 
85 #define M5_6_0    1
86 #define M5_6_1    2
87 #define M5_6_2    3
88 #define M5_6_3    4
89 
90 #define M5_7_0    2
91 #define M5_7_1    3
92 #define M5_7_2    4
93 #define M5_7_3    0
94 
95 #define M5_8_0    3
96 #define M5_8_1    4
97 #define M5_8_2    0
98 #define M5_8_3    1
99 
100 #define M5_9_0    4
101 #define M5_9_1    0
102 #define M5_9_2    1
103 #define M5_9_3    2
104 
105 #define M5_10_0   0
106 #define M5_10_1   1
107 #define M5_10_2   2
108 #define M5_10_3   3
109 
110 #define M5_11_0   1
111 #define M5_11_1   2
112 #define M5_11_2   3
113 #define M5_11_3   4
114 
115 #define M5_12_0   2
116 #define M5_12_1   3
117 #define M5_12_2   4
118 #define M5_12_3   0
119 
120 #define M5_13_0   3
121 #define M5_13_1   4
122 #define M5_13_2   0
123 #define M5_13_3   1
124 
125 #define M5_14_0   4
126 #define M5_14_1   0
127 #define M5_14_2   1
128 #define M5_14_3   2
129 
130 #define M5_15_0   0
131 #define M5_15_1   1
132 #define M5_15_2   2
133 #define M5_15_3   3
134 
135 #define M5_16_0   1
136 #define M5_16_1   2
137 #define M5_16_2   3
138 #define M5_16_3   4
139 
140 #define M5_17_0   2
141 #define M5_17_1   3
142 #define M5_17_2   4
143 #define M5_17_3   0
144 
145 #define M5_18_0   3
146 #define M5_18_1   4
147 #define M5_18_2   0
148 #define M5_18_3   1
149 #endif
150 
/*
 * M9_ ## s ## _ ## i  evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
 *
 * The values are written out as literal tokens, one macro per (s, i) pair,
 * because SKBI() pastes the result onto a variable-name prefix with ##;
 * an arithmetic expression such as ((s + i) % 9) could not be pasted.
 */

#define M9_0_0    0
#define M9_0_1    1
#define M9_0_2    2
#define M9_0_3    3
#define M9_0_4    4
#define M9_0_5    5
#define M9_0_6    6
#define M9_0_7    7

#define M9_1_0    1
#define M9_1_1    2
#define M9_1_2    3
#define M9_1_3    4
#define M9_1_4    5
#define M9_1_5    6
#define M9_1_6    7
#define M9_1_7    8

#define M9_2_0    2
#define M9_2_1    3
#define M9_2_2    4
#define M9_2_3    5
#define M9_2_4    6
#define M9_2_5    7
#define M9_2_6    8
#define M9_2_7    0

#define M9_3_0    3
#define M9_3_1    4
#define M9_3_2    5
#define M9_3_3    6
#define M9_3_4    7
#define M9_3_5    8
#define M9_3_6    0
#define M9_3_7    1

#define M9_4_0    4
#define M9_4_1    5
#define M9_4_2    6
#define M9_4_3    7
#define M9_4_4    8
#define M9_4_5    0
#define M9_4_6    1
#define M9_4_7    2

#define M9_5_0    5
#define M9_5_1    6
#define M9_5_2    7
#define M9_5_3    8
#define M9_5_4    0
#define M9_5_5    1
#define M9_5_6    2
#define M9_5_7    3

#define M9_6_0    6
#define M9_6_1    7
#define M9_6_2    8
#define M9_6_3    0
#define M9_6_4    1
#define M9_6_5    2
#define M9_6_6    3
#define M9_6_7    4

#define M9_7_0    7
#define M9_7_1    8
#define M9_7_2    0
#define M9_7_3    1
#define M9_7_4    2
#define M9_7_5    3
#define M9_7_6    4
#define M9_7_7    5

#define M9_8_0    8
#define M9_8_1    0
#define M9_8_2    1
#define M9_8_3    2
#define M9_8_4    3
#define M9_8_5    4
#define M9_8_6    5
#define M9_8_7    6

#define M9_9_0    0
#define M9_9_1    1
#define M9_9_2    2
#define M9_9_3    3
#define M9_9_4    4
#define M9_9_5    5
#define M9_9_6    6
#define M9_9_7    7

#define M9_10_0   1
#define M9_10_1   2
#define M9_10_2   3
#define M9_10_3   4
#define M9_10_4   5
#define M9_10_5   6
#define M9_10_6   7
#define M9_10_7   8

#define M9_11_0   2
#define M9_11_1   3
#define M9_11_2   4
#define M9_11_3   5
#define M9_11_4   6
#define M9_11_5   7
#define M9_11_6   8
#define M9_11_7   0

#define M9_12_0   3
#define M9_12_1   4
#define M9_12_2   5
#define M9_12_3   6
#define M9_12_4   7
#define M9_12_5   8
#define M9_12_6   0
#define M9_12_7   1

#define M9_13_0   4
#define M9_13_1   5
#define M9_13_2   6
#define M9_13_3   7
#define M9_13_4   8
#define M9_13_5   0
#define M9_13_6   1
#define M9_13_7   2

#define M9_14_0   5
#define M9_14_1   6
#define M9_14_2   7
#define M9_14_3   8
#define M9_14_4   0
#define M9_14_5   1
#define M9_14_6   2
#define M9_14_7   3

#define M9_15_0   6
#define M9_15_1   7
#define M9_15_2   8
#define M9_15_3   0
#define M9_15_4   1
#define M9_15_5   2
#define M9_15_6   3
#define M9_15_7   4

#define M9_16_0   7
#define M9_16_1   8
#define M9_16_2   0
#define M9_16_3   1
#define M9_16_4   2
#define M9_16_5   3
#define M9_16_6   4
#define M9_16_7   5

#define M9_17_0   8
#define M9_17_1   0
#define M9_17_2   1
#define M9_17_3   2
#define M9_17_4   3
#define M9_17_5   4
#define M9_17_6   5
#define M9_17_7   6

#define M9_18_0   0
#define M9_18_1   1
#define M9_18_2   2
#define M9_18_3   3
#define M9_18_4   4
#define M9_18_5   5
#define M9_18_6   6
#define M9_18_7   7
325 
/*
 * M3_ ## s ## _ ## i  evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1).
 *
 * As with the M9_ table, each value is a literal token so that SKBT()
 * can paste it onto a variable-name prefix.
 */

#define M3_0_0    0
#define M3_0_1    1
#define M3_1_0    1
#define M3_1_1    2
#define M3_2_0    2
#define M3_2_1    0
#define M3_3_0    0
#define M3_3_1    1
#define M3_4_0    1
#define M3_4_1    2
#define M3_5_0    2
#define M3_5_1    0
#define M3_6_0    0
#define M3_6_1    1
#define M3_7_0    1
#define M3_7_1    2
#define M3_8_0    2
#define M3_8_1    0
#define M3_9_0    0
#define M3_9_1    1
#define M3_10_0   1
#define M3_10_1   2
#define M3_11_0   2
#define M3_11_1   0
#define M3_12_0   0
#define M3_12_1   1
#define M3_13_0   1
#define M3_13_1   2
#define M3_14_0   2
#define M3_14_1   0
#define M3_15_0   0
#define M3_15_1   1
#define M3_16_0   1
#define M3_16_1   2
#define M3_17_0   2
#define M3_17_1   0
#define M3_18_0   0
#define M3_18_1   1
368 
/*
 * XCAT(x, y) pastes x and y into a single token AFTER macro-expanding
 * both arguments; the extra level of indirection through XCAT_ is what
 * makes that expansion happen before ## is applied.
 */
#define XCAT(x, y)     XCAT_(x, y)
#define XCAT_(x, y)    x ## y
371 
372 #if 0
373 /* obsolete */
374 #define SKSI(k, s, i)   XCAT(k, XCAT(XCAT(XCAT(M5_, s), _), i))
375 #define SKST(t, s, v)   XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))
376 #endif
377 
/*
 * SKBI(k, s, i) names the variable  k ## ((s + i) mod 9)  and
 * SKBT(t, s, v) names the variable  t ## ((s + v) mod 3).
 * s, i and v must be literal numbers covered by the M9_ and M3_ tables,
 * since the index is looked up by token pasting, not computed.
 */
#define SKBI(k, s, i)   XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))
#define SKBT(t, s, v)   XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))
380 
381 #if 0
382 /* obsolete */
383 #define TFSMALL_KINIT(k0, k1, k2, k3, k4, t0, t1, t2)   do { \
384 		k4 = (k0 ^ k1) ^ (k2 ^ k3) ^ SPH_C64(0x1BD11BDAA9FC1A22); \
385 		t2 = t0 ^ t1; \
386 	} while (0)
387 #endif
388 
/*
 * TFBIG_KINIT derives the two extra schedule words from the eight key
 * words k0..k7 and the two tweak words t0, t1:
 *   k8 = k0 ^ k1 ^ ... ^ k7 ^ 0x1BD11BDAA9FC1A22
 *   t2 = t0 ^ t1
 * k8 and t2 are assigned; all other arguments are only read.
 */
#define TFBIG_KINIT(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2)   do { \
		k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \
			^ SPH_C64(0x1BD11BDAA9FC1A22); \
		t2 = t0 ^ t1; \
	} while (0)
394 
395 #if 0
396 /* obsolete */
397 #define TFSMALL_ADDKEY(w0, w1, w2, w3, k, t, s)   do { \
398 		w0 = SPH_T64(w0 + SKSI(k, s, 0)); \
399 		w1 = SPH_T64(w1 + SKSI(k, s, 1) + SKST(t, s, 0)); \
400 		w2 = SPH_T64(w2 + SKSI(k, s, 2) + SKST(t, s, 1)); \
401 		w3 = SPH_T64(w3 + SKSI(k, s, 3) + (sph_u64)s); \
402 	} while (0)
403 #endif
404 
/*
 * TFBIG_ADDKEY adds subkey number s to the eight state words.
 * Words 0..4 receive a key word only, word 5 and word 6 additionally
 * receive a tweak word, and word 7 additionally receives s itself.
 *
 * Small-footprint form: operates on the p0..p7 and h[] names found in
 * the expansion scope; key words are read as h[s + i], and the caller
 * supplies the two tweak words to use as tt0 and tt1.
 *
 * Unrolled form: the state words, the key prefix k and the tweak prefix
 * t are explicit; s must be a literal number because SKBI()/SKBT()
 * select the key/tweak variable by token pasting.
 *
 * Macro arguments are parenthesised where they are used as values so
 * that an expression argument such as "s + 1" is cast and added as a
 * whole.
 */
#if SPH_SMALL_FOOTPRINT_SKEIN

#define TFBIG_ADDKEY(s, tt0, tt1)   do { \
		p0 = SPH_T64(p0 + h[(s) + 0]); \
		p1 = SPH_T64(p1 + h[(s) + 1]); \
		p2 = SPH_T64(p2 + h[(s) + 2]); \
		p3 = SPH_T64(p3 + h[(s) + 3]); \
		p4 = SPH_T64(p4 + h[(s) + 4]); \
		p5 = SPH_T64(p5 + h[(s) + 5] + (tt0)); \
		p6 = SPH_T64(p6 + h[(s) + 6] + (tt1)); \
		p7 = SPH_T64(p7 + h[(s) + 7] + (sph_u64)(s)); \
	} while (0)

#else

#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s)   do { \
		w0 = SPH_T64(w0 + SKBI(k, s, 0)); \
		w1 = SPH_T64(w1 + SKBI(k, s, 1)); \
		w2 = SPH_T64(w2 + SKBI(k, s, 2)); \
		w3 = SPH_T64(w3 + SKBI(k, s, 3)); \
		w4 = SPH_T64(w4 + SKBI(k, s, 4)); \
		w5 = SPH_T64(w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \
		w6 = SPH_T64(w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \
		w7 = SPH_T64(w7 + SKBI(k, s, 7) + (sph_u64)(s)); \
	} while (0)

#endif
432 
433 #if 0
434 /* obsolete */
435 #define TFSMALL_MIX(x0, x1, rc)   do { \
436 		x0 = SPH_T64(x0 + x1); \
437 		x1 = SPH_ROTL64(x1, rc) ^ x0; \
438 	} while (0)
439 #endif
440 
/*
 * TFBIG_MIX(x0, x1, rc): one add/rotate/xor step on a pair of words.
 *   x0 <- x0 + x1
 *   x1 <- (x1 rotated left by rc) ^ (new x0)
 * Both x0 and x1 must be modifiable lvalues. SPH_T64 and SPH_ROTL64 are
 * defined outside this file (presumably 64-bit truncation and 64-bit
 * left rotation).
 */
#define TFBIG_MIX(x0, x1, rc)   do { \
		x0 = SPH_T64(x0 + x1); \
		x1 = SPH_ROTL64(x1, rc) ^ x0; \
	} while (0)
445 
446 #if 0
447 /* obsolete */
448 #define TFSMALL_MIX4(w0, w1, w2, w3, rc0, rc1)  do { \
449 		TFSMALL_MIX(w0, w1, rc0); \
450 		TFSMALL_MIX(w2, w3, rc1); \
451 	} while (0)
452 #endif
453 
/*
 * TFBIG_MIX8 applies TFBIG_MIX to four word pairs, (w0,w1), (w2,w3),
 * (w4,w5) and (w6,w7), with rotation counts rc0..rc3 respectively.
 * Callers pass the state words in a permuted order to select which
 * words are paired in a given round.
 */
#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3)  do { \
		TFBIG_MIX(w0, w1, rc0); \
		TFBIG_MIX(w2, w3, rc1); \
		TFBIG_MIX(w4, w5, rc2); \
		TFBIG_MIX(w6, w7, rc3); \
	} while (0)
460 
461 #if 0
462 /* obsolete */
463 #define TFSMALL_4e(s)   do { \
464 		TFSMALL_ADDKEY(p0, p1, p2, p3, h, t, s); \
465 		TFSMALL_MIX4(p0, p1, p2, p3, 14, 16); \
466 		TFSMALL_MIX4(p0, p3, p2, p1, 52, 57); \
467 		TFSMALL_MIX4(p0, p1, p2, p3, 23, 40); \
468 		TFSMALL_MIX4(p0, p3, p2, p1,  5, 37); \
469 	} while (0)
470 
471 #define TFSMALL_4o(s)   do { \
472 		TFSMALL_ADDKEY(p0, p1, p2, p3, h, t, s); \
473 		TFSMALL_MIX4(p0, p1, p2, p3, 25, 33); \
474 		TFSMALL_MIX4(p0, p3, p2, p1, 46, 12); \
475 		TFSMALL_MIX4(p0, p1, p2, p3, 58, 22); \
476 		TFSMALL_MIX4(p0, p3, p2, p1, 32, 32); \
477 	} while (0)
478 #endif
479 
/*
 * TFBIG_4e(s) / TFBIG_4o(s): inject subkey s, then run four MIX8 rounds
 * on p0..p7. UBI_BIG invokes the "e" form for even s and the "o" form
 * for odd s; the two forms differ only in their rotation constants (and,
 * in the small-footprint build, in which tweak words they pass to
 * TFBIG_ADDKEY). The word permutation between rounds is expressed by
 * the order in which p0..p7 are passed to TFBIG_MIX8.
 *
 * Both forms rely on names from the expansion scope: p0..p7, plus
 * h[] and t0/t1/t2 (small-footprint) or the h0..h8 / t0..t2 variables
 * selected through the h and t prefixes (unrolled).
 */
#if SPH_SMALL_FOOTPRINT_SKEIN

#define TFBIG_4e(s)   do { \
		TFBIG_ADDKEY(s, t0, t1); \
		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44,  9, 54, 56); \
	} while (0)

#define TFBIG_4o(s)   do { \
		TFBIG_ADDKEY(s, t1, t2); \
		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3,  8, 35, 56, 22); \
	} while (0)

#else

#define TFBIG_4e(s)   do { \
		TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44,  9, 54, 56); \
	} while (0)

#define TFBIG_4o(s)   do { \
		TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3,  8, 35, 56, 22); \
	} while (0)

#endif
517 
518 #if 0
519 /* obsolete */
520 #define UBI_SMALL(etype, extra)  do { \
521 		sph_u64 h4, t0, t1, t2; \
522 		sph_u64 m0 = sph_dec64le(buf +  0); \
523 		sph_u64 m1 = sph_dec64le(buf +  8); \
524 		sph_u64 m2 = sph_dec64le(buf + 16); \
525 		sph_u64 m3 = sph_dec64le(buf + 24); \
526 		sph_u64 p0 = m0; \
527 		sph_u64 p1 = m1; \
528 		sph_u64 p2 = m2; \
529 		sph_u64 p3 = m3; \
530 		t0 = SPH_T64(bcount << 5) + (sph_u64)(extra); \
531 		t1 = (bcount >> 59) + ((sph_u64)(etype) << 55); \
532 		TFSMALL_KINIT(h0, h1, h2, h3, h4, t0, t1, t2); \
533 		TFSMALL_4e(0); \
534 		TFSMALL_4o(1); \
535 		TFSMALL_4e(2); \
536 		TFSMALL_4o(3); \
537 		TFSMALL_4e(4); \
538 		TFSMALL_4o(5); \
539 		TFSMALL_4e(6); \
540 		TFSMALL_4o(7); \
541 		TFSMALL_4e(8); \
542 		TFSMALL_4o(9); \
543 		TFSMALL_4e(10); \
544 		TFSMALL_4o(11); \
545 		TFSMALL_4e(12); \
546 		TFSMALL_4o(13); \
547 		TFSMALL_4e(14); \
548 		TFSMALL_4o(15); \
549 		TFSMALL_4e(16); \
550 		TFSMALL_4o(17); \
551 		TFSMALL_ADDKEY(p0, p1, p2, p3, h, t, 18); \
552 		h0 = m0 ^ p0; \
553 		h1 = m1 ^ p1; \
554 		h2 = m2 ^ p2; \
555 		h3 = m3 ^ p3; \
556 	} while (0)
557 #endif
558 
/*
 * UBI_BIG(etype, extra) processes the 64-byte block at buf and chains
 * the result into the hash state.
 *
 * Names taken from the expansion scope: buf (the block), bcount (block
 * counter) and the state declared by DECL_STATE_BIG.
 *
 * Steps:
 *  - the block is decoded into eight 64-bit words m0..m7 with
 *    sph_dec64le_aligned() (defined outside this file), which also
 *    seed the working words p0..p7;
 *  - the tweak is built as
 *        t0 = (bcount << 6) + extra
 *        t1 = (bcount >> 58) + (etype << 55)
 *    so etype lands in bits 55 and above of t1;
 *  - TFBIG_KINIT derives the ninth key word and the third tweak word;
 *  - subkeys 0..17 are each followed by four rounds (TFBIG_4e for even,
 *    TFBIG_4o for odd), and subkey 18 is added at the end;
 *  - the new state is  m ^ p  for each of the eight words.
 *
 * Small-footprint form: the state is the array h[]. After h[8] is
 * computed, h[0..8] is replicated into h[9..26] so that TFBIG_ADDKEY can
 * read h[s + i] without a modulo, and the three tweak words are rotated
 * (t0 <- t2, t1 <- t0, t2 <- t1) after every pair of subkeys instead of
 * being indexed.
 *
 * Unrolled form: the state is h0..h7 plus a local h8, and every subkey
 * index is a literal resolved at preprocessing time.
 */
#if SPH_SMALL_FOOTPRINT_SKEIN

#define UBI_BIG(etype, extra)  do { \
		sph_u64 t0, t1, t2; \
		unsigned u; \
		sph_u64 m0 = sph_dec64le_aligned(buf +  0); \
		sph_u64 m1 = sph_dec64le_aligned(buf +  8); \
		sph_u64 m2 = sph_dec64le_aligned(buf + 16); \
		sph_u64 m3 = sph_dec64le_aligned(buf + 24); \
		sph_u64 m4 = sph_dec64le_aligned(buf + 32); \
		sph_u64 m5 = sph_dec64le_aligned(buf + 40); \
		sph_u64 m6 = sph_dec64le_aligned(buf + 48); \
		sph_u64 m7 = sph_dec64le_aligned(buf + 56); \
		sph_u64 p0 = m0; \
		sph_u64 p1 = m1; \
		sph_u64 p2 = m2; \
		sph_u64 p3 = m3; \
		sph_u64 p4 = m4; \
		sph_u64 p5 = m5; \
		sph_u64 p6 = m6; \
		sph_u64 p7 = m7; \
		t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \
		t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \
		TFBIG_KINIT(h[0], h[1], h[2], h[3], h[4], h[5], \
			h[6], h[7], h[8], t0, t1, t2); \
		for (u = 0; u <= 15; u += 3) { \
			h[u +  9] = h[u + 0]; \
			h[u + 10] = h[u + 1]; \
			h[u + 11] = h[u + 2]; \
		} \
		for (u = 0; u < 9; u ++) { \
			sph_u64 s = u << 1; \
			sph_u64 tmp; \
			TFBIG_4e(s); \
			TFBIG_4o(s + 1); \
			tmp = t2; \
			t2 = t1; \
			t1 = t0; \
			t0 = tmp; \
		} \
		TFBIG_ADDKEY(18, t0, t1); \
		h[0] = m0 ^ p0; \
		h[1] = m1 ^ p1; \
		h[2] = m2 ^ p2; \
		h[3] = m3 ^ p3; \
		h[4] = m4 ^ p4; \
		h[5] = m5 ^ p5; \
		h[6] = m6 ^ p6; \
		h[7] = m7 ^ p7; \
	} while (0)

#else

#define UBI_BIG(etype, extra)  do { \
		sph_u64 h8, t0, t1, t2; \
		sph_u64 m0 = sph_dec64le_aligned(buf +  0); \
		sph_u64 m1 = sph_dec64le_aligned(buf +  8); \
		sph_u64 m2 = sph_dec64le_aligned(buf + 16); \
		sph_u64 m3 = sph_dec64le_aligned(buf + 24); \
		sph_u64 m4 = sph_dec64le_aligned(buf + 32); \
		sph_u64 m5 = sph_dec64le_aligned(buf + 40); \
		sph_u64 m6 = sph_dec64le_aligned(buf + 48); \
		sph_u64 m7 = sph_dec64le_aligned(buf + 56); \
		sph_u64 p0 = m0; \
		sph_u64 p1 = m1; \
		sph_u64 p2 = m2; \
		sph_u64 p3 = m3; \
		sph_u64 p4 = m4; \
		sph_u64 p5 = m5; \
		sph_u64 p6 = m6; \
		sph_u64 p7 = m7; \
		t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \
		t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \
		TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \
		TFBIG_4e(0); \
		TFBIG_4o(1); \
		TFBIG_4e(2); \
		TFBIG_4o(3); \
		TFBIG_4e(4); \
		TFBIG_4o(5); \
		TFBIG_4e(6); \
		TFBIG_4o(7); \
		TFBIG_4e(8); \
		TFBIG_4o(9); \
		TFBIG_4e(10); \
		TFBIG_4o(11); \
		TFBIG_4e(12); \
		TFBIG_4o(13); \
		TFBIG_4e(14); \
		TFBIG_4o(15); \
		TFBIG_4e(16); \
		TFBIG_4o(17); \
		TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \
		h0 = m0 ^ p0; \
		h1 = m1 ^ p1; \
		h2 = m2 ^ p2; \
		h3 = m3 ^ p3; \
		h4 = m4 ^ p4; \
		h5 = m5 ^ p5; \
		h6 = m6 ^ p6; \
		h7 = m7 ^ p7; \
	} while (0)

#endif
663 
664 #if 0
665 /* obsolete */
666 #define DECL_STATE_SMALL \
667 	sph_u64 h0, h1, h2, h3; \
668 	sph_u64 bcount;
669 
670 #define READ_STATE_SMALL(sc)   do { \
671 		h0 = (sc)->h0; \
672 		h1 = (sc)->h1; \
673 		h2 = (sc)->h2; \
674 		h3 = (sc)->h3; \
675 		bcount = sc->bcount; \
676 	} while (0)
677 
678 #define WRITE_STATE_SMALL(sc)   do { \
679 		(sc)->h0 = h0; \
680 		(sc)->h1 = h1; \
681 		(sc)->h2 = h2; \
682 		(sc)->h3 = h3; \
683 		sc->bcount = bcount; \
684 	} while (0)
685 #endif
686 
/*
 * Local-state helpers for the "big" (512-bit state) functions.
 *
 *  DECL_STATE_BIG      declares the local copy of the chaining value
 *                      and the block counter; it supplies its own
 *                      trailing semicolon, so it is used without one.
 *  READ_STATE_BIG(sc)  loads h0..h7 and bcount from the context.
 *  WRITE_STATE_BIG(sc) stores them back into the context.
 *
 * In the small-footprint build the local state is an array of 27 words,
 * of which only h[0..7] are loaded/stored here; the remaining entries
 * are filled in by UBI_BIG. Otherwise it is eight separate variables.
 *
 * Every use of the sc argument is parenthesised so that an expression
 * argument (for instance "&ctx") expands correctly for all members,
 * bcount included.
 */
#if SPH_SMALL_FOOTPRINT_SKEIN

#define DECL_STATE_BIG \
	sph_u64 h[27]; \
	sph_u64 bcount;

#define READ_STATE_BIG(sc)   do { \
		h[0] = (sc)->h0; \
		h[1] = (sc)->h1; \
		h[2] = (sc)->h2; \
		h[3] = (sc)->h3; \
		h[4] = (sc)->h4; \
		h[5] = (sc)->h5; \
		h[6] = (sc)->h6; \
		h[7] = (sc)->h7; \
		bcount = (sc)->bcount; \
	} while (0)

#define WRITE_STATE_BIG(sc)   do { \
		(sc)->h0 = h[0]; \
		(sc)->h1 = h[1]; \
		(sc)->h2 = h[2]; \
		(sc)->h3 = h[3]; \
		(sc)->h4 = h[4]; \
		(sc)->h5 = h[5]; \
		(sc)->h6 = h[6]; \
		(sc)->h7 = h[7]; \
		(sc)->bcount = bcount; \
	} while (0)

#else

#define DECL_STATE_BIG \
	sph_u64 h0, h1, h2, h3, h4, h5, h6, h7; \
	sph_u64 bcount;

#define READ_STATE_BIG(sc)   do { \
		h0 = (sc)->h0; \
		h1 = (sc)->h1; \
		h2 = (sc)->h2; \
		h3 = (sc)->h3; \
		h4 = (sc)->h4; \
		h5 = (sc)->h5; \
		h6 = (sc)->h6; \
		h7 = (sc)->h7; \
		bcount = (sc)->bcount; \
	} while (0)

#define WRITE_STATE_BIG(sc)   do { \
		(sc)->h0 = h0; \
		(sc)->h1 = h1; \
		(sc)->h2 = h2; \
		(sc)->h3 = h3; \
		(sc)->h4 = h4; \
		(sc)->h5 = h5; \
		(sc)->h6 = h6; \
		(sc)->h7 = h7; \
		(sc)->bcount = bcount; \
	} while (0)

#endif
748 
749 #if 0
750 /* obsolete */
751 static void
752 skein_small_init(sph_skein_small_context *sc, const sph_u64 *iv)
753 {
754 	sc->h0 = iv[0];
755 	sc->h1 = iv[1];
756 	sc->h2 = iv[2];
757 	sc->h3 = iv[3];
758 	sc->bcount = 0;
759 	sc->ptr = 0;
760 }
761 #endif
762 
763 static void
skein_big_init(sph_skein_big_context * sc,const sph_u64 * iv)764 skein_big_init(sph_skein_big_context *sc, const sph_u64 *iv)
765 {
766 	sc->h0 = iv[0];
767 	sc->h1 = iv[1];
768 	sc->h2 = iv[2];
769 	sc->h3 = iv[3];
770 	sc->h4 = iv[4];
771 	sc->h5 = iv[5];
772 	sc->h6 = iv[6];
773 	sc->h7 = iv[7];
774 	sc->bcount = 0;
775 	sc->ptr = 0;
776 }
777 
778 #if 0
779 /* obsolete */
780 static void
781 skein_small_core(sph_skein_small_context *sc, const void *data, size_t len)
782 {
783 	unsigned char *buf;
784 	size_t ptr, clen;
785 	unsigned first;
786 	DECL_STATE_SMALL
787 
788 	buf = sc->buf;
789 	ptr = sc->ptr;
790 	clen = (sizeof sc->buf) - ptr;
791 	if (len <= clen) {
792 		memcpy(buf + ptr, data, len);
793 		sc->ptr = ptr + len;
794 		return;
795 	}
796 	if (clen != 0) {
797 		memcpy(buf + ptr, data, clen);
798 		data = (const unsigned char *)data + clen;
799 		len -= clen;
800 	}
801 
802 #if SPH_SMALL_FOOTPRINT_SKEIN
803 
804 	READ_STATE_SMALL(sc);
805 	first = (bcount == 0) << 7;
806 	for (;;) {
807 		bcount ++;
808 		UBI_SMALL(96 + first, 0);
809 		if (len <= sizeof sc->buf)
810 			break;
811 		first = 0;
812 		memcpy(buf, data, sizeof sc->buf);
813 		data = (const unsigned char *)data + sizeof sc->buf;
814 		len -= sizeof sc->buf;
815 	}
816 	WRITE_STATE_SMALL(sc);
817 	sc->ptr = len;
818 	memcpy(buf, data, len);
819 
820 #else
821 
822 	/*
823 	 * Unrolling the loop yields a slight performance boost, while
824 	 * keeping the code size around 24 kB on 32-bit x86.
825 	 */
826 	READ_STATE_SMALL(sc);
827 	first = (bcount == 0) << 7;
828 	for (;;) {
829 		bcount ++;
830 		UBI_SMALL(96 + first, 0);
831 		if (len <= sizeof sc->buf)
832 			break;
833 		buf = (unsigned char *)data;
834 		bcount ++;
835 		UBI_SMALL(96, 0);
836 		if (len <= 2 * sizeof sc->buf) {
837 			data = buf + sizeof sc->buf;
838 			len -= sizeof sc->buf;
839 			break;
840 		}
841 		buf += sizeof sc->buf;
842 		data = buf + sizeof sc->buf;
843 		first = 0;
844 		len -= 2 * sizeof sc->buf;
845 	}
846 	WRITE_STATE_SMALL(sc);
847 	sc->ptr = len;
848 	memcpy(sc->buf, data, len);
849 
850 #endif
851 }
852 #endif
853 
854 static void
skein_big_core(sph_skein_big_context * sc,const void * data,size_t len)855 skein_big_core(sph_skein_big_context *sc, const void *data, size_t len)
856 {
857 	/*
858 	 * The Skein "final bit" in the tweak is troublesome here,
859 	 * because if the input has a length which is a multiple of the
860 	 * block size (512 bits) then that bit must be set for the
861 	 * final block, which is full of message bits (padding in
862 	 * Skein can be reduced to no extra bit at all). However, this
863 	 * function cannot know whether it processes the last chunks of
864 	 * the message or not. Hence we may keep a full block of buffered
865 	 * data (64 bytes).
866 	 */
867 	unsigned char *buf;
868 	size_t ptr;
869 	unsigned first;
870 	DECL_STATE_BIG
871 
872 	buf = sc->buf;
873 	ptr = sc->ptr;
874 	if (len <= (sizeof sc->buf) - ptr) {
875 		memcpy(buf + ptr, data, len);
876 		ptr += len;
877 		sc->ptr = ptr;
878 		return;
879 	}
880 
881 	READ_STATE_BIG(sc);
882 	first = (bcount == 0) << 7;
883 	do {
884 		size_t clen;
885 
886 		if (ptr == sizeof sc->buf) {
887 			bcount ++;
888 			UBI_BIG(96 + first, 0);
889 			first = 0;
890 			ptr = 0;
891 		}
892 		clen = (sizeof sc->buf) - ptr;
893 		if (clen > len)
894 			clen = len;
895 		memcpy(buf + ptr, data, clen);
896 		ptr += clen;
897 		data = (const unsigned char *)data + clen;
898 		len -= clen;
899 	} while (len > 0);
900 	WRITE_STATE_BIG(sc);
901 	sc->ptr = ptr;
902 }
903 
904 #if 0
905 /* obsolete */
906 static void
907 skein_small_close(sph_skein_small_context *sc, unsigned ub, unsigned n,
908 	void *dst, size_t out_len)
909 {
910 	unsigned char *buf;
911 	size_t ptr;
912 	unsigned et;
913 	int i;
914 	DECL_STATE_SMALL
915 
916 	if (n != 0) {
917 		unsigned z;
918 		unsigned char x;
919 
920 		z = 0x80 >> n;
921 		x = ((ub & -z) | z) & 0xFF;
922 		skein_small_core(sc, &x, 1);
923 	}
924 
925 	buf = sc->buf;
926 	ptr = sc->ptr;
927 	READ_STATE_SMALL(sc);
928 	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
929 	et = 352 + ((bcount == 0) << 7) + (n != 0);
930 	for (i = 0; i < 2; i ++) {
931 		UBI_SMALL(et, ptr);
932 		if (i == 0) {
933 			memset(buf, 0, sizeof sc->buf);
934 			bcount = 0;
935 			et = 510;
936 			ptr = 8;
937 		}
938 	}
939 
940 	sph_enc64le_aligned(buf +  0, h0);
941 	sph_enc64le_aligned(buf +  8, h1);
942 	sph_enc64le_aligned(buf + 16, h2);
943 	sph_enc64le_aligned(buf + 24, h3);
944 	memcpy(dst, buf, out_len);
945 }
946 #endif
947 
948 static void
skein_big_close(sph_skein_big_context * sc,unsigned ub,unsigned n,void * dst,size_t out_len)949 skein_big_close(sph_skein_big_context *sc, unsigned ub, unsigned n,
950 	void *dst, size_t out_len)
951 {
952 	unsigned char *buf;
953 	size_t ptr;
954 	unsigned et;
955 	int i;
956 #if SPH_SMALL_FOOTPRINT_SKEIN
957 	size_t u;
958 #endif
959 	DECL_STATE_BIG
960 
961 	/*
962 	 * Add bit padding if necessary.
963 	 */
964 	if (n != 0) {
965 		unsigned z;
966 		unsigned char x;
967 
968 		z = 0x80 >> n;
969 		x = ((ub & -z) | z) & 0xFF;
970 		skein_big_core(sc, &x, 1);
971 	}
972 
973 	buf = sc->buf;
974 	ptr = sc->ptr;
975 
976 	/*
977 	 * At that point, if ptr == 0, then the message was empty;
978 	 * otherwise, there is between 1 and 64 bytes (inclusive) which
979 	 * are yet to be processed. Either way, we complete the buffer
980 	 * to a full block with zeros (the Skein specification mandates
981 	 * that an empty message is padded so that there is at least
982 	 * one block to process).
983 	 *
984 	 * Once this block has been processed, we do it again, with
985 	 * a block full of zeros, for the output (that block contains
986 	 * the encoding of "0", over 8 bytes, then padded with zeros).
987 	 */
988 	READ_STATE_BIG(sc);
989 	memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
990 	et = 352 + ((bcount == 0) << 7) + (n != 0);
991 	for (i = 0; i < 2; i ++) {
992 		UBI_BIG(et, ptr);
993 		if (i == 0) {
994 			memset(buf, 0, sizeof sc->buf);
995 			bcount = 0;
996 			et = 510;
997 			ptr = 8;
998 		}
999 	}
1000 
1001 #if SPH_SMALL_FOOTPRINT_SKEIN
1002 
1003 	/*
1004 	 * We use a temporary buffer because we must support the case
1005 	 * where output size is not a multiple of 64 (namely, a 224-bit
1006 	 * output).
1007 	 */
1008 	for (u = 0; u < out_len; u += 8)
1009 		sph_enc64le_aligned(buf + u, h[u >> 3]);
1010 	memcpy(dst, buf, out_len);
1011 
1012 #else
1013 
1014 	sph_enc64le_aligned(buf +  0, h0);
1015 	sph_enc64le_aligned(buf +  8, h1);
1016 	sph_enc64le_aligned(buf + 16, h2);
1017 	sph_enc64le_aligned(buf + 24, h3);
1018 	sph_enc64le_aligned(buf + 32, h4);
1019 	sph_enc64le_aligned(buf + 40, h5);
1020 	sph_enc64le_aligned(buf + 48, h6);
1021 	sph_enc64le_aligned(buf + 56, h7);
1022 	memcpy(dst, buf, out_len);
1023 
1024 #endif
1025 }
1026 
1027 #if 0
1028 /* obsolete */
1029 static const sph_u64 IV224[] = {
1030 	SPH_C64(0xC6098A8C9AE5EA0B), SPH_C64(0x876D568608C5191C),
1031 	SPH_C64(0x99CB88D7D7F53884), SPH_C64(0x384BDDB1AEDDB5DE)
1032 };
1033 
1034 static const sph_u64 IV256[] = {
1035 	SPH_C64(0xFC9DA860D048B449), SPH_C64(0x2FCA66479FA7D833),
1036 	SPH_C64(0xB33BC3896656840F), SPH_C64(0x6A54E920FDE8DA69)
1037 };
1038 #endif
1039 
1040 static const sph_u64 IV224[] = {
1041 	SPH_C64(0xCCD0616248677224), SPH_C64(0xCBA65CF3A92339EF),
1042 	SPH_C64(0x8CCD69D652FF4B64), SPH_C64(0x398AED7B3AB890B4),
1043 	SPH_C64(0x0F59D1B1457D2BD0), SPH_C64(0x6776FE6575D4EB3D),
1044 	SPH_C64(0x99FBC70E997413E9), SPH_C64(0x9E2CFCCFE1C41EF7)
1045 };
1046 
1047 static const sph_u64 IV256[] = {
1048 	SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
1049 	SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
1050 	SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
1051 	SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
1052 };
1053 
1054 static const sph_u64 IV384[] = {
1055 	SPH_C64(0xA3F6C6BF3A75EF5F), SPH_C64(0xB0FEF9CCFD84FAA4),
1056 	SPH_C64(0x9D77DD663D770CFE), SPH_C64(0xD798CBF3B468FDDA),
1057 	SPH_C64(0x1BC4A6668A0E4465), SPH_C64(0x7ED7D434E5807407),
1058 	SPH_C64(0x548FC1ACD4EC44D6), SPH_C64(0x266E17546AA18FF8)
1059 };
1060 
1061 static const sph_u64 IV512[] = {
1062 	SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
1063 	SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
1064 	SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
1065 	SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
1066 };
1067 
1068 #if 0
1069 /* obsolete */
1070 /* see sph_skein.h */
1071 void
1072 sph_skein224_init(void *cc)
1073 {
1074 	skein_small_init(cc, IV224);
1075 }
1076 
1077 /* see sph_skein.h */
1078 void
1079 sph_skein224(void *cc, const void *data, size_t len)
1080 {
1081 	skein_small_core(cc, data, len);
1082 }
1083 
1084 /* see sph_skein.h */
1085 void
1086 sph_skein224_close(void *cc, void *dst)
1087 {
1088 	sph_skein224_addbits_and_close(cc, 0, 0, dst);
1089 }
1090 
1091 /* see sph_skein.h */
1092 void
1093 sph_skein224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
1094 {
1095 	skein_small_close(cc, ub, n, dst, 28);
1096 	sph_skein224_init(cc);
1097 }
1098 
1099 /* see sph_skein.h */
1100 void
1101 sph_skein256_init(void *cc)
1102 {
1103 	skein_small_init(cc, IV256);
1104 }
1105 
1106 /* see sph_skein.h */
1107 void
1108 sph_skein256(void *cc, const void *data, size_t len)
1109 {
1110 	skein_small_core(cc, data, len);
1111 }
1112 
1113 /* see sph_skein.h */
1114 void
1115 sph_skein256_close(void *cc, void *dst)
1116 {
1117 	sph_skein256_addbits_and_close(cc, 0, 0, dst);
1118 }
1119 
1120 /* see sph_skein.h */
1121 void
1122 sph_skein256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
1123 {
1124 	skein_small_close(cc, ub, n, dst, 32);
1125 	sph_skein256_init(cc);
1126 }
1127 #endif
1128 
1129 /* see sph_skein.h */
1130 void
sph_skein224_init(void * cc)1131 sph_skein224_init(void *cc)
1132 {
1133 	skein_big_init(cc, IV224);
1134 }
1135 
/*
 * Feed len bytes from data into the 224-bit computation held in cc.
 * See sph_skein.h.
 */
void
sph_skein224(void *cc, const void *data, size_t len)
{
	skein_big_core(cc, data, len);
}
1142 
/*
 * Finish the 224-bit computation with no extra bits and write the
 * digest to dst; equivalent to sph_skein224_addbits_and_close() with
 * ub = 0 and n = 0.
 * See sph_skein.h.
 */
void
sph_skein224_close(void *cc, void *dst)
{
	sph_skein224_addbits_and_close(cc, 0, 0, dst);
}
1149 
/*
 * Append the n top bits of ub, finish the computation and write the
 * 28-byte digest to dst. The context is then re-initialized for a new
 * 224-bit computation.
 * See sph_skein.h.
 */
void
sph_skein224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	skein_big_close(cc, ub, n, dst, 28);
	sph_skein224_init(cc);
}
1157 
1158 /* see sph_skein.h */
1159 void
sph_skein256_init(void * cc)1160 sph_skein256_init(void *cc)
1161 {
1162 	skein_big_init(cc, IV256);
1163 }
1164 
/*
 * Feed len bytes from data into the 256-bit computation held in cc.
 * See sph_skein.h.
 */
void
sph_skein256(void *cc, const void *data, size_t len)
{
	skein_big_core(cc, data, len);
}
1171 
/*
 * Finish the 256-bit computation with no extra bits and write the
 * digest to dst; equivalent to sph_skein256_addbits_and_close() with
 * ub = 0 and n = 0.
 * See sph_skein.h.
 */
void
sph_skein256_close(void *cc, void *dst)
{
	sph_skein256_addbits_and_close(cc, 0, 0, dst);
}
1178 
/*
 * Append the n top bits of ub, finish the computation and write the
 * 32-byte digest to dst. The context is then re-initialized for a new
 * 256-bit computation.
 * See sph_skein.h.
 */
void
sph_skein256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	skein_big_close(cc, ub, n, dst, 32);
	sph_skein256_init(cc);
}
1186 
1187 /* see sph_skein.h */
1188 void
sph_skein384_init(void * cc)1189 sph_skein384_init(void *cc)
1190 {
1191 	skein_big_init(cc, IV384);
1192 }
1193 
/*
 * Feed len bytes from data into the 384-bit computation held in cc.
 * See sph_skein.h.
 */
void
sph_skein384(void *cc, const void *data, size_t len)
{
	skein_big_core(cc, data, len);
}
1200 
/*
 * Finish the 384-bit computation with no extra bits and write the
 * digest to dst; equivalent to sph_skein384_addbits_and_close() with
 * ub = 0 and n = 0.
 * See sph_skein.h.
 */
void
sph_skein384_close(void *cc, void *dst)
{
	sph_skein384_addbits_and_close(cc, 0, 0, dst);
}
1207 
/*
 * Append the n top bits of ub, finish the computation and write the
 * 48-byte digest to dst. The context is then re-initialized for a new
 * 384-bit computation.
 * See sph_skein.h.
 */
void
sph_skein384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	skein_big_close(cc, ub, n, dst, 48);
	sph_skein384_init(cc);
}
1215 
1216 /* see sph_skein.h */
1217 void
sph_skein512_init(void * cc)1218 sph_skein512_init(void *cc)
1219 {
1220 	skein_big_init(cc, IV512);
1221 }
1222 
/*
 * Feed len bytes from data into the 512-bit computation held in cc.
 * See sph_skein.h.
 */
void
sph_skein512(void *cc, const void *data, size_t len)
{
	skein_big_core(cc, data, len);
}
1229 
/*
 * Finish the 512-bit computation with no extra bits and write the
 * digest to dst; equivalent to sph_skein512_addbits_and_close() with
 * ub = 0 and n = 0.
 * See sph_skein.h.
 */
void
sph_skein512_close(void *cc, void *dst)
{
	sph_skein512_addbits_and_close(cc, 0, 0, dst);
}
1236 
/*
 * Append the n top bits of ub, finish the computation and write the
 * 64-byte digest to dst. The context is then re-initialized for a new
 * 512-bit computation.
 * See sph_skein.h.
 */
void
sph_skein512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	skein_big_close(cc, ub, n, dst, 64);
	sph_skein512_init(cc);
}
1244 
1245 #endif
1246