1 /*
2 * Copyright (c) 2014-2015 Vincent Hanquez <vincent@snarc.org>
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. Neither the name of the author nor the names of his contributors
15 * may be used to endorse or promote products derived from this software
16 * without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 #include <stdint.h>
32 #include <string.h>
33 #include <stdio.h>
34 #include "cryptonite_salsa.h"
35 #include "cryptonite_bitfn.h"
36 #include "cryptonite_align.h"
37
/* Salsa constant words, stored as raw bytes without a terminating NUL.
 * sigma is selected for 32-byte keys, tau for 16-byte keys. */
static const uint8_t sigma[16] = {
	'e', 'x', 'p', 'a', 'n', 'd', ' ', '3',
	'2', '-', 'b', 'y', 't', 'e', ' ', 'k'
};
static const uint8_t tau[16] = {
	'e', 'x', 'p', 'a', 'n', 'd', ' ', '1',
	'6', '-', 'b', 'y', 't', 'e', ' ', 'k'
};
40
/*
 * Quarter-round on four 32-bit lvalues, updated in place in the order
 * b, c, d, a. Each step XORs one word with a left-rotation (by 7, 9, 13
 * and 18 bits) of the sum of two others; every step uses the words
 * already updated by the previous steps.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and stays correct under an unbraced if/else. Arguments
 * are evaluated several times and must be side-effect-free lvalues.
 */
#define QR(a,b,c,d) \
	do { \
		(b) ^= rol32((a) + (d), 7); \
		(c) ^= rol32((b) + (a), 9); \
		(d) ^= rol32((c) + (b), 13); \
		(a) ^= rol32((d) + (c), 18); \
	} while (0)
46
/* Non-zero when the address held in PTR is a multiple of 8 bytes. */
#define ALIGNED64(PTR) \
	((((uintptr_t)(const void *)(PTR)) & (uintptr_t)7) == 0)
49
/*
 * Round loop shared by the core functions. The expansion site must have
 * in scope: `rounds` (number of rounds), an int `i` used as the countdown,
 * and the sixteen working words x0..x15.
 *
 * Each iteration consumes two rounds. Viewing x0..x15 as a 4x4 matrix
 * stored row by row, the first four QR calls work down the columns and
 * the last four work along the rows.
 */
#define SALSA_CORE_LOOP \
	for (i = rounds; i > 0; i -= 2) { \
		/* column half of the double round */ \
		QR(x0,  x4,  x8,  x12); \
		QR(x5,  x9,  x13, x1); \
		QR(x10, x14, x2,  x6); \
		QR(x15, x3,  x7,  x11); \
		/* row half of the double round */ \
		QR(x0,  x1,  x2,  x3); \
		QR(x5,  x6,  x7,  x4); \
		QR(x10, x11, x8,  x9); \
		QR(x15, x12, x13, x14); \
	}
61
/*
 * Produce one output block from the 16-word state `in`.
 *
 * The state words are copied into locals, mixed by SALSA_CORE_LOOP
 * (which counts `rounds` down two at a time), then the original state
 * words are added back. Each resulting word goes through cpu_to_le32
 * before being stored in out->d[]. `in` itself is only read.
 */
static void salsa_core(int rounds, block *out, const cryptonite_salsa_state *in)
{
	uint32_t x0, x1, x2, x3, x4, x5, x6, x7;
	uint32_t x8, x9, x10, x11, x12, x13, x14, x15;
	int i;

	/* working copy of the input state */
#define FETCH(n) x##n = in->d[n]
	FETCH(0);  FETCH(1);  FETCH(2);  FETCH(3);
	FETCH(4);  FETCH(5);  FETCH(6);  FETCH(7);
	FETCH(8);  FETCH(9);  FETCH(10); FETCH(11);
	FETCH(12); FETCH(13); FETCH(14); FETCH(15);
#undef FETCH

	SALSA_CORE_LOOP;

	/* add the untouched input state onto the mixed words */
#define FEED_FORWARD(n) x##n += in->d[n]
	FEED_FORWARD(0);  FEED_FORWARD(1);  FEED_FORWARD(2);  FEED_FORWARD(3);
	FEED_FORWARD(4);  FEED_FORWARD(5);  FEED_FORWARD(6);  FEED_FORWARD(7);
	FEED_FORWARD(8);  FEED_FORWARD(9);  FEED_FORWARD(10); FEED_FORWARD(11);
	FEED_FORWARD(12); FEED_FORWARD(13); FEED_FORWARD(14); FEED_FORWARD(15);
#undef FEED_FORWARD

	/* write the block out, one converted word at a time */
#define EMIT(n) out->d[n] = cpu_to_le32(x##n)
	EMIT(0);  EMIT(1);  EMIT(2);  EMIT(3);
	EMIT(4);  EMIT(5);  EMIT(6);  EMIT(7);
	EMIT(8);  EMIT(9);  EMIT(10); EMIT(11);
	EMIT(12); EMIT(13); EMIT(14); EMIT(15);
#undef EMIT
}
96
/*
 * XOR `in` into `out` word by word, run the round loop on the result and
 * add the mixed words back onto `out`:
 *
 *   out ^= in;  x = out;  mix(x, rounds);  out += x;
 *
 * Words are used in host byte order; no endianness conversion is applied
 * here. `in` is only read.
 */
void cryptonite_salsa_core_xor(int rounds, block *out, block *in)
{
	uint32_t x0, x1, x2, x3, x4, x5, x6, x7;
	uint32_t x8, x9, x10, x11, x12, x13, x14, x15;
	int i;

	/* fold the input into out and keep a working copy of each word */
#define ABSORB(n) do { out->d[n] ^= in->d[n]; x##n = out->d[n]; } while (0)
	ABSORB(0);  ABSORB(1);  ABSORB(2);  ABSORB(3);
	ABSORB(4);  ABSORB(5);  ABSORB(6);  ABSORB(7);
	ABSORB(8);  ABSORB(9);  ABSORB(10); ABSORB(11);
	ABSORB(12); ABSORB(13); ABSORB(14); ABSORB(15);
#undef ABSORB

	SALSA_CORE_LOOP;

	/* feed the mixed words forward onto the xored block */
#define ADD_BACK(n) out->d[n] += x##n
	ADD_BACK(0);  ADD_BACK(1);  ADD_BACK(2);  ADD_BACK(3);
	ADD_BACK(4);  ADD_BACK(5);  ADD_BACK(6);  ADD_BACK(7);
	ADD_BACK(8);  ADD_BACK(9);  ADD_BACK(10); ADD_BACK(11);
	ADD_BACK(12); ADD_BACK(13); ADD_BACK(14); ADD_BACK(15);
#undef ADD_BACK
}
116
117 /* only 2 valid values for keylen are 256 (32) and 128 (16) */
cryptonite_salsa_init_core(cryptonite_salsa_state * st,uint32_t keylen,const uint8_t * key,uint32_t ivlen,const uint8_t * iv)118 void cryptonite_salsa_init_core(cryptonite_salsa_state *st,
119 uint32_t keylen, const uint8_t *key,
120 uint32_t ivlen, const uint8_t *iv)
121 {
122 const uint8_t *constants = (keylen == 32) ? sigma : tau;
123
124 st->d[0] = load_le32_aligned(constants + 0);
125 st->d[5] = load_le32_aligned(constants + 4);
126 st->d[10] = load_le32_aligned(constants + 8);
127 st->d[15] = load_le32_aligned(constants + 12);
128
129 st->d[1] = load_le32(key + 0);
130 st->d[2] = load_le32(key + 4);
131 st->d[3] = load_le32(key + 8);
132 st->d[4] = load_le32(key + 12);
133 /* we repeat the key on 128 bits */
134 if (keylen == 32)
135 key += 16;
136 st->d[11] = load_le32(key + 0);
137 st->d[12] = load_le32(key + 4);
138 st->d[13] = load_le32(key + 8);
139 st->d[14] = load_le32(key + 12);
140
141 st->d[9] = 0;
142 switch (ivlen) {
143 case 8:
144 st->d[6] = load_le32(iv + 0);
145 st->d[7] = load_le32(iv + 4);
146 st->d[8] = 0;
147 break;
148 case 12:
149 st->d[6] = load_le32(iv + 0);
150 st->d[7] = load_le32(iv + 4);
151 st->d[8] = load_le32(iv + 8);
152 default:
153 return;
154 }
155 }
156
/*
 * Reset a cipher context and set it up for a key/nonce pair.
 *
 * The whole context is zeroed first, which also discards any buffered
 * keystream (prev, prev_ofs, prev_len); then the round count is recorded
 * and the state is filled by cryptonite_salsa_init_core(). See that
 * function for the accepted keylen/ivlen values.
 */
void cryptonite_salsa_init(cryptonite_salsa_context *ctx, uint8_t nb_rounds,
                           uint32_t keylen, const uint8_t *key,
                           uint32_t ivlen, const uint8_t *iv)
{
	/* start from an all-zero context */
	memset(ctx, 0, sizeof *ctx);

	ctx->nb_rounds = nb_rounds;
	cryptonite_salsa_init_core(&ctx->st, keylen, key, ivlen, iv);
}
165
/*
 * XOR `bytes` bytes of `src` with keystream and store the result in `dst`.
 *
 * Keystream is produced 64 bytes at a time by salsa_core(). When a call
 * ends in the middle of a block, the unused tail of that block is kept in
 * ctx->prev (prev_ofs = start of the unused part, prev_len = its length)
 * and is consumed first by the next call. Consumed bytes of ctx->prev are
 * zeroed. After every generated block, state word 8 is incremented, with
 * the carry going into word 9.
 */
void cryptonite_salsa_combine(uint8_t *dst, cryptonite_salsa_context *ctx, const uint8_t *src, uint32_t bytes)
{
	block ks;
	cryptonite_salsa_state *state;
	int k;

	if (bytes == 0)
		return;

	/* start with whatever keystream the previous call left behind */
	if (ctx->prev_len > 0) {
		int avail = (ctx->prev_len < bytes) ? ctx->prev_len : bytes;
		const uint8_t *pending = ctx->prev + ctx->prev_ofs;

		for (k = 0; k < avail; k++)
			dst[k] = src[k] ^ pending[k];

		/* wipe the keystream bytes that were just used */
		memset(ctx->prev + ctx->prev_ofs, 0, avail);
		ctx->prev_ofs += avail;
		ctx->prev_len -= avail;

		dst += avail;
		src += avail;
		bytes -= avail;
	}

	if (bytes == 0)
		return;

	state = &ctx->st;

	/* whole 64-byte blocks */
	while (bytes >= 64) {
		salsa_core(ctx->nb_rounds, &ks, state);
		/* advance the block counter, carrying into the next word */
		if (++state->d[8] == 0)
			state->d[9]++;

		for (k = 0; k < 64; k++)
			dst[k] = src[k] ^ ks.b[k];

		bytes -= 64;
		src += 64;
		dst += 64;
	}

	/* final partial block: use the head, keep the tail for later */
	if (bytes > 0) {
		uint32_t n;

		salsa_core(ctx->nb_rounds, &ks, state);
		if (++state->d[8] == 0)
			state->d[9]++;

		for (n = 0; n < bytes; n++)
			dst[n] = src[n] ^ ks.b[n];

		ctx->prev_len = 64 - bytes;
		ctx->prev_ofs = bytes;
		memcpy(ctx->prev + bytes, ks.b + bytes, 64 - bytes);
	}
}
224
/*
 * Write `bytes` bytes of raw keystream to `dst` (nothing is XORed in).
 *
 * Buffering works as in cryptonite_salsa_combine(): the unused tail of a
 * partially consumed block is held in ctx->prev (prev_ofs / prev_len),
 * served first on the next call, and zeroed once handed out. After every
 * generated block, state word 8 is incremented, with the carry going into
 * word 9.
 *
 * When `dst` is 8-byte aligned (ALIGNED64), full blocks are generated
 * directly into the destination; otherwise they go through a local block
 * and are copied out.
 */
void cryptonite_salsa_generate(uint8_t *dst, cryptonite_salsa_context *ctx, uint32_t bytes)
{
	cryptonite_salsa_state *state;
	block ks;
	int k;

	if (bytes == 0)
		return;

	/* hand out keystream left over from the previous call first */
	if (ctx->prev_len > 0) {
		int avail = (ctx->prev_len < bytes) ? ctx->prev_len : bytes;
		const uint8_t *pending = ctx->prev + ctx->prev_ofs;

		for (k = 0; k < avail; k++)
			dst[k] = pending[k];

		/* wipe the keystream bytes that were just handed out */
		memset(ctx->prev + ctx->prev_ofs, 0, avail);
		ctx->prev_ofs += avail;
		ctx->prev_len -= avail;

		dst += avail;
		bytes -= avail;
	}

	if (bytes == 0)
		return;

	state = &ctx->st;

	if (ALIGNED64(dst)) {
		/* aligned destination: let the core write straight into it */
		while (bytes >= 64) {
			salsa_core(ctx->nb_rounds, (block *) dst, state);
			if (++state->d[8] == 0)
				state->d[9]++;

			bytes -= 64;
			dst += 64;
		}
	} else {
		/* unaligned destination: generate locally, then copy */
		while (bytes >= 64) {
			salsa_core(ctx->nb_rounds, &ks, state);
			if (++state->d[8] == 0)
				state->d[9]++;

			memcpy(dst, ks.b, 64);

			bytes -= 64;
			dst += 64;
		}
	}

	/* final partial block: output the head, keep the tail for later */
	if (bytes > 0) {
		salsa_core(ctx->nb_rounds, &ks, state);
		if (++state->d[8] == 0)
			state->d[9]++;

		memcpy(dst, ks.b, bytes);

		ctx->prev_len = 64 - bytes;
		ctx->prev_ofs = bytes;
		memcpy(ctx->prev + bytes, ks.b + bytes, 64 - bytes);
	}
}
292
293