1 /***********************************************************************
2 **
3 ** Implementation of the Skein block functions.
4 **
5 ** Source code author: Doug Whiting, 2008.
6 **
7 ** This algorithm and source code is released to the public domain.
8 **
9 ** Compile-time switches:
10 **
11 ** SKEIN_USE_ASM -- set bits (256/512/1024) to select which
12 ** versions use ASM code for block processing
13 ** [default: use C for all block sizes]
14 **
15 ************************************************************************/
16
17 #include <string.h>
18 #include <dieharder/skein.h>
19
/*
 * SKEIN_USE_ASM: bit mask of block sizes (256/512/1024) whose block
 * function is supplied by assembly instead of the C code in this file.
 */
#ifndef SKEIN_USE_ASM
#define SKEIN_USE_ASM   (0)                     /* default is all C code (no ASM) */
#endif

/*
 * SKEIN_LOOP: unroll control. The 512-bit code below reads the tens
 * digit, ((SKEIN_LOOP)/10)%10; a digit of 0 means "fully unrolled".
 * Beware: a literal written with leading zeros is an OCTAL constant in C
 * (001 == 1, but 010 == 8), so only the default spelling is safe here.
 */
#ifndef SKEIN_LOOP
#define SKEIN_LOOP 001                          /* default: unroll 256 and 512, but not 1024 */
#endif

/* Helpers that expect a local array kw[] and a constant WCNT in scope. */
#define BLK_BITS        (WCNT*64)               /* block size in bits */
#define KW_TWK_BASE     (0)                     /* tweak words start at kw[0] */
#define KW_KEY_BASE     (3)                     /* key words start at kw[3]   */
#define ks              (kw + KW_KEY_BASE)      /* key-schedule view of kw[]  */
#define ts              (kw + KW_TWK_BASE)      /* tweak-schedule view of kw[] */

/* Debug builds copy the working tweak words back into the context. */
#ifdef SKEIN_DEBUG
#define DebugSaveTweak(ctx) { (ctx)->h.T[0] = ts[0]; (ctx)->h.T[1] = ts[1]; }
#else
#define DebugSaveTweak(ctx)
#endif
39
40
41 /***************************** Skein_512 ******************************/
42 #if !(SKEIN_USE_ASM & 512)
/*
 * Threefish_512_Process_Blocks64 -- run the Threefish-512 rounds over
 * blkCnt consecutive input blocks, writing one output block per input
 * block.
 *
 *   ctx     supplies the key words (ctx->Key[0..7]) and the tweak words
 *           (ctx->T[0..1]); the code visible here only reads them.
 *   input   blkCnt blocks of SKEIN_512_BLOCK_BYTES bytes each; every
 *           block is loaded with Skein_Get64_LSB_First().
 *   output  receives eight u64b_t words per block, stored through a
 *           (u64b_t *) cast in native word order.
 *           NOTE(review): that cast presumes output is suitably aligned
 *           for u64b_t -- confirm against callers.
 *   blkCnt  number of blocks to process; must be non-zero.
 *
 * The same key and tweak are applied to every block.
 */
void Threefish_512_Process_Blocks64(Threefish_512_Ctxt_t *ctx, const u08b_t *input,
                                    void *output, size_t blkCnt) {
    enum { WCNT = SKEIN_512_STATE_WORDS };
#undef  RCNT
#define RCNT  (SKEIN_512_ROUNDS_TOTAL/8)

#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
#define SKEIN_UNROLL_512 (((SKEIN_LOOP)/10)%10)
#else
#define SKEIN_UNROLL_512 (0)
#endif

#if SKEIN_UNROLL_512
#if (RCNT % SKEIN_UNROLL_512)
#error "Invalid SKEIN_UNROLL_512"               /* sanity check on unroll count */
#endif
    size_t  r;
    u64b_t  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
#else
    u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
#endif
    u64b_t  X0,X1,X2,X3,X4,X5,X6,X7;            /* local copy of vars, for speed */
    u64b_t  w [WCNT];                           /* local copy of input block */
#ifdef SKEIN_DEBUG
    const u64b_t *Xptr[8];                      /* use for debugging (help compiler put Xn in registers) */
    Xptr[0] = &X0;  Xptr[1] = &X1;  Xptr[2] = &X2;  Xptr[3] = &X3;
    Xptr[4] = &X4;  Xptr[5] = &X5;  Xptr[6] = &X6;  Xptr[7] = &X7;
#endif

    Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */

    do {
        /*
         * Build the key schedule for this block.
         *
         * This is done per block rather than once up front: in the
         * looping build (SKEIN_UNROLL_512 != 0) I512 rotates the schedule
         * in place, and because ts and ks are views of the same kw[]
         * array three words apart, the ts rotation overwrites ks[0],
         * ks[1], ...  A schedule built only once would therefore be
         * corrupt for every block after the first.
         */
        ts[0] = ctx->T[0];
        ts[1] = ctx->T[1];
        ts[2] = ts[0] ^ ts[1];

        ks[0] = ctx->Key[0];
        ks[1] = ctx->Key[1];
        ks[2] = ctx->Key[2];
        ks[3] = ctx->Key[3];
        ks[4] = ctx->Key[4];
        ks[5] = ctx->Key[5];
        ks[6] = ctx->Key[6];
        ks[7] = ctx->Key[7];
        ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
                ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;

        Skein_Get64_LSB_First(w,input,WCNT);    /* get input block in little-endian format */

        X0 = w[0] + ks[0];                      /* do the first full key injection */
        X1 = w[1] + ks[1];
        X2 = w[2] + ks[2];
        X3 = w[3] + ks[3];
        X4 = w[4] + ks[4];
        X5 = w[5] + ks[5] + ts[0];
        X6 = w[6] + ks[6] + ts[1];
        X7 = w[7] + ks[7];

        input += SKEIN_512_BLOCK_BYTES;

        Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);
        /* run the rounds */
        /* One round: four add / rotate-left / xor mixes on word pairs. */
#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                  \
    X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
    X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
    X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \
    X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6;

#if SKEIN_UNROLL_512 == 0
#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)      /* unrolled */  \
    Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);

        /* Unrolled injection: schedule indices are compile-time constants. */
#define I512(R)                                                     \
    X0   += ks[((R)+1) % 9];   /* inject the key schedule value */  \
    X1   += ks[((R)+2) % 9];                                        \
    X2   += ks[((R)+3) % 9];                                        \
    X3   += ks[((R)+4) % 9];                                        \
    X4   += ks[((R)+5) % 9];                                        \
    X5   += ks[((R)+6) % 9] + ts[((R)+1) % 3];                      \
    X6   += ks[((R)+7) % 9] + ts[((R)+2) % 3];                      \
    X7   += ks[((R)+8) % 9] +     (R)+1;                            \
    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
#else                                       /* looping version */
#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
    Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr);

        /* Looping injection: indices depend on r, schedule rotates in place. */
#define I512(R)                                                     \
    X0   += ks[r+(R)+0];        /* inject the key schedule value */ \
    X1   += ks[r+(R)+1];                                            \
    X2   += ks[r+(R)+2];                                            \
    X3   += ks[r+(R)+3];                                            \
    X4   += ks[r+(R)+4];                                            \
    X5   += ks[r+(R)+5] + ts[r+(R)+0];                              \
    X6   += ks[r+(R)+6] + ts[r+(R)+1];                              \
    X7   += ks[r+(R)+7] +    r+(R)   ;                              \
    ks[r +       (R)+8] = ks[r+(R)-1];  /* rotate key schedule */   \
    ts[r +       (R)+2] = ts[r+(R)-1];                              \
    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);

        for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_512)   /* loop thru it */
#endif                         /* end of looped code definitions */
        {
#define R512_8_rounds(R)  /* do 8 full rounds */  \
        R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1);   \
        R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2);   \
        R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3);   \
        R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4);   \
        I512(2*(R));                              \
        R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5);   \
        R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6);   \
        R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7);   \
        R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8);   \
        I512(2*(R)+1);        /* and key injection */

            R512_8_rounds( 0);

            /* True when 8-round group NN must be emitted as straight-line code. */
#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_512 > (NN)))

#if   R512_Unroll_R( 1)
            R512_8_rounds( 1);
#endif
#if   R512_Unroll_R( 2)
            R512_8_rounds( 2);
#endif
#if   R512_Unroll_R( 3)
            R512_8_rounds( 3);
#endif
#if   R512_Unroll_R( 4)
            R512_8_rounds( 4);
#endif
#if   R512_Unroll_R( 5)
            R512_8_rounds( 5);
#endif
#if   R512_Unroll_R( 6)
            R512_8_rounds( 6);
#endif
#if   R512_Unroll_R( 7)
            R512_8_rounds( 7);
#endif
#if   R512_Unroll_R( 8)
            R512_8_rounds( 8);
#endif
#if   R512_Unroll_R( 9)
            R512_8_rounds( 9);
#endif
#if   R512_Unroll_R(10)
            R512_8_rounds(10);
#endif
#if   R512_Unroll_R(11)
            R512_8_rounds(11);
#endif
#if   R512_Unroll_R(12)
            R512_8_rounds(12);
#endif
#if   R512_Unroll_R(13)
            R512_8_rounds(13);
#endif
#if   R512_Unroll_R(14)
            R512_8_rounds(14);
#endif
#if  (SKEIN_UNROLL_512 > 14)
#error  "need more unrolling in Skein_512_Process_Block"
#endif
        }

        /* Store the eight result words for this block. */
        ((u64b_t *) output)[0] = X0;
        ((u64b_t *) output)[1] = X1;
        ((u64b_t *) output)[2] = X2;
        ((u64b_t *) output)[3] = X3;
        ((u64b_t *) output)[4] = X4;
        ((u64b_t *) output)[5] = X5;
        ((u64b_t *) output)[6] = X6;
        ((u64b_t *) output)[7] = X7;

        /*
         * Advance to the next output block. Arithmetic on void * is not
         * standard C, so step through a byte pointer instead of
         * round-tripping the address through an integer type.
         */
        output = (u08b_t *) output + SKEIN_512_BLOCK_BYTES;
    } while (--blkCnt);
}
231
232 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
Skein_512_Process_Block_CodeSize(void)233 size_t Skein_512_Process_Block_CodeSize(void)
234 {
235 return ((u08b_t *) Skein_512_Process_Block_CodeSize) -
236 ((u08b_t *) Skein_512_Process_Block);
237 }
Skein_512_Unroll_Cnt(void)238 uint_t Skein_512_Unroll_Cnt(void)
239 {
240 return SKEIN_UNROLL_512;
241 }
242 #endif
243 #endif
244