1 // Monocypher version 3.1.2
2 //
3 // This file is dual-licensed. Choose whichever licence you want from
4 // the two licences listed below.
5 //
6 // The first licence is a regular 2-clause BSD licence. The second licence
7 // is the CC-0 from Creative Commons. It is intended to release Monocypher
8 // to the public domain. The BSD licence serves as a fallback option.
9 //
10 // SPDX-License-Identifier: BSD-2-Clause OR CC0-1.0
11 //
12 // ------------------------------------------------------------------------
13 //
14 // Copyright (c) 2017-2020, Loup Vaillant
15 // All rights reserved.
16 //
17 //
18 // Redistribution and use in source and binary forms, with or without
19 // modification, are permitted provided that the following conditions are
20 // met:
21 //
22 // 1. Redistributions of source code must retain the above copyright
23 // notice, this list of conditions and the following disclaimer.
24 //
25 // 2. Redistributions in binary form must reproduce the above copyright
26 // notice, this list of conditions and the following disclaimer in the
27 // documentation and/or other materials provided with the
28 // distribution.
29 //
30 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
31 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
32 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
33 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
34 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
35 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
36 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
37 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
38 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
39 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
40 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 //
42 // ------------------------------------------------------------------------
43 //
44 // Written in 2017-2020 by Loup Vaillant
45 //
46 // To the extent possible under law, the author(s) have dedicated all copyright
47 // and related neighboring rights to this software to the public domain
48 // worldwide. This software is distributed without any warranty.
49 //
50 // You should have received a copy of the CC0 Public Domain Dedication along
51 // with this software. If not, see
52 // <https://creativecommons.org/publicdomain/zero/1.0/>
53
54 #include "monocypher.h"
55
56 /////////////////
57 /// Utilities ///
58 /////////////////
59 #define FOR_T(type, i, start, end) for (type i = (start); i < (end); i++)
60 #define FOR(i, start, end) FOR_T(size_t, i, start, end)
61 #define COPY(dst, src, size) FOR(i, 0, size) (dst)[i] = (src)[i]
62 #define ZERO(buf, size) FOR(i, 0, size) (buf)[i] = 0
63 #define WIPE_CTX(ctx) crypto_wipe(ctx , sizeof(*(ctx)))
64 #define WIPE_BUFFER(buffer) crypto_wipe(buffer, sizeof(buffer))
65 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
66 #define MAX(a, b) ((a) >= (b) ? (a) : (b))
67
68 typedef int8_t i8;
69 typedef uint8_t u8;
70 typedef int16_t i16;
71 typedef uint32_t u32;
72 typedef int32_t i32;
73 typedef int64_t i64;
74 typedef uint64_t u64;
75
76 static const u8 zero[128] = {0};
77
78 // returns the smallest positive integer y such that
79 // (x + y) % pow_2 == 0
80 // Basically, it's how many bytes we need to add to "align" x.
81 // Only works when pow_2 is a power of 2.
82 // Note: we use ~x+1 instead of -x to avoid compiler warnings
83 static size_t align(size_t x, size_t pow_2)
84 {
85 return (~x + 1) & (pow_2 - 1);
86 }
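
// Example (illustrative): align(13, 16) == 3, because 3 is the number
// of bytes needed to reach the next multiple of 16; when x is already
// a multiple of pow_2, (~x + 1) & (pow_2 - 1) is 0.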
87
88 static u32 load24_le(const u8 s[3])
89 {
90 return (u32)s[0]
91 | ((u32)s[1] << 8)
92 | ((u32)s[2] << 16);
93 }
94
95 static u32 load32_le(const u8 s[4])
96 {
97 return (u32)s[0]
98 | ((u32)s[1] << 8)
99 | ((u32)s[2] << 16)
100 | ((u32)s[3] << 24);
101 }
102
103 static u64 load64_le(const u8 s[8])
104 {
105 return load32_le(s) | ((u64)load32_le(s+4) << 32);
106 }
107
108 static void store32_le(u8 out[4], u32 in)
109 {
110 out[0] = in & 0xff;
111 out[1] = (in >> 8) & 0xff;
112 out[2] = (in >> 16) & 0xff;
113 out[3] = (in >> 24) & 0xff;
114 }
115
116 static void store64_le(u8 out[8], u64 in)
117 {
118 store32_le(out , (u32)in );
119 store32_le(out + 4, in >> 32);
120 }
121
122 static void load32_le_buf (u32 *dst, const u8 *src, size_t size) {
123 FOR(i, 0, size) { dst[i] = load32_le(src + i*4); }
124 }
125 static void load64_le_buf (u64 *dst, const u8 *src, size_t size) {
126 FOR(i, 0, size) { dst[i] = load64_le(src + i*8); }
127 }
128 static void store32_le_buf(u8 *dst, const u32 *src, size_t size) {
129 FOR(i, 0, size) { store32_le(dst + i*4, src[i]); }
130 }
131 static void store64_le_buf(u8 *dst, const u64 *src, size_t size) {
132 FOR(i, 0, size) { store64_le(dst + i*8, src[i]); }
133 }
134
135 static u64 rotr64(u64 x, u64 n) { return (x >> n) ^ (x << (64 - n)); }
136 static u32 rotl32(u32 x, u32 n) { return (x << n) ^ (x >> (32 - n)); }
137
138 static int neq0(u64 diff)
139 { // constant time comparison to zero
140 // return diff != 0 ? -1 : 0
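// How it works: "half" folds the two 32-bit halves of diff together,
// so half == 0 exactly when diff == 0, and half always fits in 32 bits.
// If half == 0, half - 1 wraps to 2^64 - 1, the shift yields 1, and we
// return 0. If half != 0, half - 1 stays below 2^32, the shift yields
// 0, and we return -1.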
141 u64 half = (diff >> 32) | ((u32)diff);
142 return (1 & ((half - 1) >> 32)) - 1;
143 }
144
145 static u64 x16(const u8 a[16], const u8 b[16])
146 {
147 return (load64_le(a + 0) ^ load64_le(b + 0))
148 | (load64_le(a + 8) ^ load64_le(b + 8));
149 }
150 static u64 x32(const u8 a[32],const u8 b[32]){return x16(a,b)| x16(a+16, b+16);}
151 static u64 x64(const u8 a[64],const u8 b[64]){return x32(a,b)| x32(a+32, b+32);}
152 int crypto_verify16(const u8 a[16], const u8 b[16]){ return neq0(x16(a, b)); }
153 int crypto_verify32(const u8 a[32], const u8 b[32]){ return neq0(x32(a, b)); }
154 int crypto_verify64(const u8 a[64], const u8 b[64]){ return neq0(x64(a, b)); }
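
// The crypto_verify* functions above compare buffers in constant time:
// they return 0 when the inputs are equal and -1 otherwise (see neq0()).
// A typical caller-side check (illustrative, expected_mac is a placeholder):
// if (crypto_verify16(mac, expected_mac) != 0) { /* reject the message */ }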
155
156 void crypto_wipe(void *secret, size_t size)
157 {
158 volatile u8 *v_secret = (u8*)secret;
159 ZERO(v_secret, size);
160 }
161
162 /////////////////
163 /// Chacha 20 ///
164 /////////////////
165 #define QUARTERROUND(a, b, c, d) \
166 a += b; d = rotl32(d ^ a, 16); \
167 c += d; b = rotl32(b ^ c, 12); \
168 a += b; d = rotl32(d ^ a, 8); \
169 c += d; b = rotl32(b ^ c, 7)
170
171 static void chacha20_rounds(u32 out[16], const u32 in[16])
172 {
173 // The temporary variables make Chacha20 10% faster.
174 u32 t0 = in[ 0]; u32 t1 = in[ 1]; u32 t2 = in[ 2]; u32 t3 = in[ 3];
175 u32 t4 = in[ 4]; u32 t5 = in[ 5]; u32 t6 = in[ 6]; u32 t7 = in[ 7];
176 u32 t8 = in[ 8]; u32 t9 = in[ 9]; u32 t10 = in[10]; u32 t11 = in[11];
177 u32 t12 = in[12]; u32 t13 = in[13]; u32 t14 = in[14]; u32 t15 = in[15];
178
179 FOR (i, 0, 10) { // 20 rounds, 2 rounds per loop.
180 QUARTERROUND(t0, t4, t8 , t12); // column 0
181 QUARTERROUND(t1, t5, t9 , t13); // column 1
182 QUARTERROUND(t2, t6, t10, t14); // column 2
183 QUARTERROUND(t3, t7, t11, t15); // column 3
184 QUARTERROUND(t0, t5, t10, t15); // diagonal 0
185 QUARTERROUND(t1, t6, t11, t12); // diagonal 1
186 QUARTERROUND(t2, t7, t8 , t13); // diagonal 2
187 QUARTERROUND(t3, t4, t9 , t14); // diagonal 3
188 }
189 out[ 0] = t0; out[ 1] = t1; out[ 2] = t2; out[ 3] = t3;
190 out[ 4] = t4; out[ 5] = t5; out[ 6] = t6; out[ 7] = t7;
191 out[ 8] = t8; out[ 9] = t9; out[10] = t10; out[11] = t11;
192 out[12] = t12; out[13] = t13; out[14] = t14; out[15] = t15;
193 }
194
195 static void chacha20_init_key(u32 block[16], const u8 key[32])
196 {
197 load32_le_buf(block , (const u8*)"expand 32-byte k", 4); // constant
198 load32_le_buf(block+4, key , 8); // key
199 }
200
201 void crypto_hchacha20(u8 out[32], const u8 key[32], const u8 in [16])
202 {
203 u32 block[16];
204 chacha20_init_key(block, key);
205 // input
206 load32_le_buf(block + 12, in, 4);
207 chacha20_rounds(block, block);
208 // prevent reversal of the rounds by revealing only half of the buffer.
209 store32_le_buf(out , block , 4); // constant
210 store32_le_buf(out+16, block+12, 4); // counter and nonce
211 WIPE_BUFFER(block);
212 }
213
214 u64 crypto_chacha20_ctr(u8 *cipher_text, const u8 *plain_text,
215 size_t text_size, const u8 key[32], const u8 nonce[8],
216 u64 ctr)
217 {
218 u32 input[16];
219 chacha20_init_key(input, key);
220 input[12] = (u32) ctr;
221 input[13] = (u32)(ctr >> 32);
222 load32_le_buf(input+14, nonce, 2);
223
224 // Whole blocks
225 u32 pool[16];
226 size_t nb_blocks = text_size >> 6;
227 FOR (i, 0, nb_blocks) {
228 chacha20_rounds(pool, input);
229 if (plain_text != 0) {
230 FOR (j, 0, 16) {
231 u32 p = pool[j] + input[j];
232 store32_le(cipher_text, p ^ load32_le(plain_text));
233 cipher_text += 4;
234 plain_text += 4;
235 }
236 } else {
237 FOR (j, 0, 16) {
238 u32 p = pool[j] + input[j];
239 store32_le(cipher_text, p);
240 cipher_text += 4;
241 }
242 }
243 input[12]++;
244 if (input[12] == 0) {
245 input[13]++;
246 }
247 }
248 text_size &= 63;
249
250 // Last (incomplete) block
251 if (text_size > 0) {
252 if (plain_text == 0) {
253 plain_text = zero;
254 }
255 chacha20_rounds(pool, input);
256 u8 tmp[64];
257 FOR (i, 0, 16) {
258 store32_le(tmp + i*4, pool[i] + input[i]);
259 }
260 FOR (i, 0, text_size) {
261 cipher_text[i] = tmp[i] ^ plain_text[i];
262 }
263 WIPE_BUFFER(tmp);
264 }
265 ctr = input[12] + ((u64)input[13] << 32) + (text_size > 0);
266
267 WIPE_BUFFER(pool);
268 WIPE_BUFFER(input);
269 return ctr;
270 }
271
272 u32 crypto_ietf_chacha20_ctr(u8 *cipher_text, const u8 *plain_text,
273 size_t text_size,
274 const u8 key[32], const u8 nonce[12], u32 ctr)
275 {
276 u64 big_ctr = ctr + ((u64)load32_le(nonce) << 32);
277 return (u32)crypto_chacha20_ctr(cipher_text, plain_text, text_size,
278 key, nonce + 4, big_ctr);
279 }
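
// Note: the IETF variant (96-bit nonce, 32-bit counter) is layered on
// the original variant above by folding the first 4 nonce bytes into
// the high half of the 64-bit block counter.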
280
281 u64 crypto_xchacha20_ctr(u8 *cipher_text, const u8 *plain_text,
282 size_t text_size,
283 const u8 key[32], const u8 nonce[24], u64 ctr)
284 {
285 u8 sub_key[32];
286 crypto_hchacha20(sub_key, key, nonce);
287 ctr = crypto_chacha20_ctr(cipher_text, plain_text, text_size,
288 sub_key, nonce+16, ctr);
289 WIPE_BUFFER(sub_key);
290 return ctr;
291 }
292
293 void crypto_chacha20(u8 *cipher_text, const u8 *plain_text, size_t text_size,
294 const u8 key[32], const u8 nonce[8])
295 {
296 crypto_chacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
297
298 }
299 void crypto_ietf_chacha20(u8 *cipher_text, const u8 *plain_text,
300 size_t text_size,
301 const u8 key[32], const u8 nonce[12])
302 {
303 crypto_ietf_chacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
304 }
305
306 void crypto_xchacha20(u8 *cipher_text, const u8 *plain_text, size_t text_size,
307 const u8 key[32], const u8 nonce[24])
308 {
309 crypto_xchacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
310 }
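
// Usage sketch (illustrative, not part of the library): encrypting a
// buffer in place with XChaCha20. Buffer sizes follow the prototypes
// above; passing a null plain_text would produce the raw keystream
// instead, as handled in crypto_chacha20_ctr().
//
// u8 key  [32];  // secret, uniformly random
// u8 nonce[24];  // must not repeat for a given key
// u8 text [500]; // message, encrypted in place
// crypto_xchacha20(text, text, sizeof(text), key, nonce);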
311
312 /////////////////
313 /// Poly 1305 ///
314 /////////////////
315
316 // h = (h + c) * r
317 // preconditions:
318 // ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
319 // ctx->c <= 1_ffffffff_ffffffff_ffffffff_ffffffff
320 // ctx->r <= 0ffffffc_0ffffffc_0ffffffc_0fffffff
321 // Postcondition:
322 // ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
323 static void poly_block(crypto_poly1305_ctx *ctx)
324 {
325 // s = h + c, without carry propagation
326 const u64 s0 = ctx->h[0] + (u64)ctx->c[0]; // s0 <= 1_fffffffe
327 const u64 s1 = ctx->h[1] + (u64)ctx->c[1]; // s1 <= 1_fffffffe
328 const u64 s2 = ctx->h[2] + (u64)ctx->c[2]; // s2 <= 1_fffffffe
329 const u64 s3 = ctx->h[3] + (u64)ctx->c[3]; // s3 <= 1_fffffffe
330 const u32 s4 = ctx->h[4] + ctx->c[4]; // s4 <= 5
331
332 // Local all the things!
333 const u32 r0 = ctx->r[0]; // r0 <= 0fffffff
334 const u32 r1 = ctx->r[1]; // r1 <= 0ffffffc
335 const u32 r2 = ctx->r[2]; // r2 <= 0ffffffc
336 const u32 r3 = ctx->r[3]; // r3 <= 0ffffffc
337 const u32 rr0 = (r0 >> 2) * 5; // rr0 <= 13fffffb // lose 2 bits...
338 const u32 rr1 = (r1 >> 2) + r1; // rr1 <= 13fffffb // rr1 == (r1 >> 2) * 5
339 const u32 rr2 = (r2 >> 2) + r2; // rr2 <= 13fffffb // rr2 == (r2 >> 2) * 5
340 const u32 rr3 = (r3 >> 2) + r3; // rr3 <= 13fffffb // rr3 == (r3 >> 2) * 5
341
342 // (h + c) * r, without carry propagation
343 const u64 x0 = s0*r0+ s1*rr3+ s2*rr2+ s3*rr1+ s4*rr0; // <= 97ffffe007fffff8
344 const u64 x1 = s0*r1+ s1*r0 + s2*rr3+ s3*rr2+ s4*rr1; // <= 8fffffe20ffffff6
345 const u64 x2 = s0*r2+ s1*r1 + s2*r0 + s3*rr3+ s4*rr2; // <= 87ffffe417fffff4
346 const u64 x3 = s0*r3+ s1*r2 + s2*r1 + s3*r0 + s4*rr3; // <= 7fffffe61ffffff2
347 const u32 x4 = s4 * (r0 & 3); // ...recover 2 bits // <= f
348
349 // partial reduction modulo 2^130 - 5
350 const u32 u5 = x4 + (x3 >> 32); // u5 <= 7ffffff5
351 const u64 u0 = (u5 >> 2) * 5 + (x0 & 0xffffffff);
352 const u64 u1 = (u0 >> 32) + (x1 & 0xffffffff) + (x0 >> 32);
353 const u64 u2 = (u1 >> 32) + (x2 & 0xffffffff) + (x1 >> 32);
354 const u64 u3 = (u2 >> 32) + (x3 & 0xffffffff) + (x2 >> 32);
355 const u64 u4 = (u3 >> 32) + (u5 & 3);
356
357 // Update the hash
358 ctx->h[0] = (u32)u0; // u0 <= 1_9ffffff0
359 ctx->h[1] = (u32)u1; // u1 <= 1_97ffffe0
360 ctx->h[2] = (u32)u2; // u2 <= 1_8fffffe2
361 ctx->h[3] = (u32)u3; // u3 <= 1_87ffffe4
362 ctx->h[4] = (u32)u4; // u4 <= 4
363 }
364
365 // (re-)initialises the input counter and input buffer
366 static void poly_clear_c(crypto_poly1305_ctx *ctx)
367 {
368 ZERO(ctx->c, 4);
369 ctx->c_idx = 0;
370 }
371
372 static void poly_take_input(crypto_poly1305_ctx *ctx, u8 input)
373 {
374 size_t word = ctx->c_idx >> 2;
375 size_t byte = ctx->c_idx & 3;
376 ctx->c[word] |= (u32)input << (byte * 8);
377 ctx->c_idx++;
378 }
379
380 static void poly_update(crypto_poly1305_ctx *ctx,
381 const u8 *message, size_t message_size)
382 {
383 FOR (i, 0, message_size) {
384 poly_take_input(ctx, message[i]);
385 if (ctx->c_idx == 16) {
386 poly_block(ctx);
387 poly_clear_c(ctx);
388 }
389 }
390 }
391
392 void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32])
393 {
394 // Initial hash is zero
395 ZERO(ctx->h, 5);
396 // add 2^130 to every input block
397 ctx->c[4] = 1;
398 poly_clear_c(ctx);
399 // load r and pad (r has some of its bits cleared)
400 load32_le_buf(ctx->r , key , 4);
401 load32_le_buf(ctx->pad, key+16, 4);
402 FOR (i, 0, 1) { ctx->r[i] &= 0x0fffffff; }
403 FOR (i, 1, 4) { ctx->r[i] &= 0x0ffffffc; }
404 }
405
406 void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
407 const u8 *message, size_t message_size)
408 {
409 if (message_size == 0) {
410 return;
411 }
412 // Align ourselves with block boundaries
413 size_t aligned = MIN(align(ctx->c_idx, 16), message_size);
414 poly_update(ctx, message, aligned);
415 message += aligned;
416 message_size -= aligned;
417
418 // Process the message block by block
419 size_t nb_blocks = message_size >> 4;
420 FOR (i, 0, nb_blocks) {
421 load32_le_buf(ctx->c, message, 4);
422 poly_block(ctx);
423 message += 16;
424 }
425 if (nb_blocks > 0) {
426 poly_clear_c(ctx);
427 }
428 message_size &= 15;
429
430 // remaining bytes
431 poly_update(ctx, message, message_size);
432 }
433
434 void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16])
435 {
436 // Process the last block (if any)
437 if (ctx->c_idx != 0) {
438 // move the final 1 according to remaining input length
439 // (We may add less than 2^130 to the last input block)
440 ctx->c[4] = 0;
441 poly_take_input(ctx, 1);
442 // one last hash update
443 poly_block(ctx);
444 }
445
446 // check if we should subtract 2^130-5 by performing the
447 // corresponding carry propagation.
448 u64 c = 5;
449 FOR (i, 0, 4) {
450 c += ctx->h[i];
451 c >>= 32;
452 }
453 c += ctx->h[4];
454 c = (c >> 2) * 5; // shift the carry back to the beginning
455 // c now indicates how many times we should subtract 2^130-5 (0 or 1)
456 FOR (i, 0, 4) {
457 c += (u64)ctx->h[i] + ctx->pad[i];
458 store32_le(mac + i*4, (u32)c);
459 c = c >> 32;
460 }
461 WIPE_CTX(ctx);
462 }
463
464 void crypto_poly1305(u8 mac[16], const u8 *message,
465 size_t message_size, const u8 key[32])
466 {
467 crypto_poly1305_ctx ctx;
468 crypto_poly1305_init (&ctx, key);
469 crypto_poly1305_update(&ctx, message, message_size);
470 crypto_poly1305_final (&ctx, mac);
471 }
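
// Reminder (general Poly1305 property, not specific to this file): the
// 32-byte key is a one-time key. It must authenticate a single message;
// reusing it for a second message allows forgeries. Minimal sketch,
// with msg and one_time_key as placeholders:
//
// u8 mac[16];
// crypto_poly1305(mac, msg, msg_size, one_time_key);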
472
473 ////////////////
474 /// Blake2 b ///
475 ////////////////
476 static const u64 iv[8] = {
477 0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
478 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
479 0x510e527fade682d1, 0x9b05688c2b3e6c1f,
480 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
481 };
482
483 // increment the input offset
484 static void blake2b_incr(crypto_blake2b_ctx *ctx)
485 {
486 u64 *x = ctx->input_offset;
487 size_t y = ctx->input_idx;
488 x[0] += y;
489 if (x[0] < y) {
490 x[1]++;
491 }
492 }
493
494 static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block)
495 {
496 static const u8 sigma[12][16] = {
497 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
498 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
499 { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
500 { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
501 { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
502 { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
503 { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
504 { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
505 { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
506 { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
507 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
508 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
509 };
510
511 // init work vector
512 u64 v0 = ctx->hash[0]; u64 v8 = iv[0];
513 u64 v1 = ctx->hash[1]; u64 v9 = iv[1];
514 u64 v2 = ctx->hash[2]; u64 v10 = iv[2];
515 u64 v3 = ctx->hash[3]; u64 v11 = iv[3];
516 u64 v4 = ctx->hash[4]; u64 v12 = iv[4] ^ ctx->input_offset[0];
517 u64 v5 = ctx->hash[5]; u64 v13 = iv[5] ^ ctx->input_offset[1];
518 u64 v6 = ctx->hash[6]; u64 v14 = iv[6] ^ (u64)~(is_last_block - 1);
519 u64 v7 = ctx->hash[7]; u64 v15 = iv[7];
520
521 // mangle work vector
522 u64 *input = ctx->input;
523 #define BLAKE2_G(a, b, c, d, x, y) \
524 a += b + x; d = rotr64(d ^ a, 32); \
525 c += d; b = rotr64(b ^ c, 24); \
526 a += b + y; d = rotr64(d ^ a, 16); \
527 c += d; b = rotr64(b ^ c, 63)
528 #define BLAKE2_ROUND(i) \
529 BLAKE2_G(v0, v4, v8 , v12, input[sigma[i][ 0]], input[sigma[i][ 1]]); \
530 BLAKE2_G(v1, v5, v9 , v13, input[sigma[i][ 2]], input[sigma[i][ 3]]); \
531 BLAKE2_G(v2, v6, v10, v14, input[sigma[i][ 4]], input[sigma[i][ 5]]); \
532 BLAKE2_G(v3, v7, v11, v15, input[sigma[i][ 6]], input[sigma[i][ 7]]); \
533 BLAKE2_G(v0, v5, v10, v15, input[sigma[i][ 8]], input[sigma[i][ 9]]); \
534 BLAKE2_G(v1, v6, v11, v12, input[sigma[i][10]], input[sigma[i][11]]); \
535 BLAKE2_G(v2, v7, v8 , v13, input[sigma[i][12]], input[sigma[i][13]]); \
536 BLAKE2_G(v3, v4, v9 , v14, input[sigma[i][14]], input[sigma[i][15]])
537
538 #ifdef BLAKE2_NO_UNROLLING
539 FOR (i, 0, 12) {
540 BLAKE2_ROUND(i);
541 }
542 #else
543 BLAKE2_ROUND(0); BLAKE2_ROUND(1); BLAKE2_ROUND(2); BLAKE2_ROUND(3);
544 BLAKE2_ROUND(4); BLAKE2_ROUND(5); BLAKE2_ROUND(6); BLAKE2_ROUND(7);
545 BLAKE2_ROUND(8); BLAKE2_ROUND(9); BLAKE2_ROUND(10); BLAKE2_ROUND(11);
546 #endif
547
548 // update hash
549 ctx->hash[0] ^= v0 ^ v8; ctx->hash[1] ^= v1 ^ v9;
550 ctx->hash[2] ^= v2 ^ v10; ctx->hash[3] ^= v3 ^ v11;
551 ctx->hash[4] ^= v4 ^ v12; ctx->hash[5] ^= v5 ^ v13;
552 ctx->hash[6] ^= v6 ^ v14; ctx->hash[7] ^= v7 ^ v15;
553 }
554
555 static void blake2b_set_input(crypto_blake2b_ctx *ctx, u8 input, size_t index)
556 {
557 if (index == 0) {
558 ZERO(ctx->input, 16);
559 }
560 size_t word = index >> 3;
561 size_t byte = index & 7;
562 ctx->input[word] |= (u64)input << (byte << 3);
563
564 }
565
566 static void blake2b_end_block(crypto_blake2b_ctx *ctx)
567 {
568 if (ctx->input_idx == 128) { // If buffer is full,
569 blake2b_incr(ctx); // update the input offset
570 blake2b_compress(ctx, 0); // and compress the (not last) block
571 ctx->input_idx = 0;
572 }
573 }
574
575 static void blake2b_update(crypto_blake2b_ctx *ctx,
576 const u8 *message, size_t message_size)
577 {
578 FOR (i, 0, message_size) {
579 blake2b_end_block(ctx);
580 blake2b_set_input(ctx, message[i], ctx->input_idx);
581 ctx->input_idx++;
582 }
583 }
584
585 void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t hash_size,
586 const u8 *key, size_t key_size)
587 {
588 // initial hash
589 COPY(ctx->hash, iv, 8);
590 ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ hash_size;
591
592 ctx->input_offset[0] = 0; // beginning of the input, no offset
593 ctx->input_offset[1] = 0; // beginning of the input, no offset
594 ctx->hash_size = hash_size; // remember the hash size we want
595 ctx->input_idx = 0;
596
597 // if there is a key, the first block is that key (padded with zeroes)
598 if (key_size > 0) {
599 u8 key_block[128] = {0};
600 COPY(key_block, key, key_size);
601 // same as calling crypto_blake2b_update(ctx, key_block , 128)
602 load64_le_buf(ctx->input, key_block, 16);
603 ctx->input_idx = 128;
604 }
605 }
606
607 void crypto_blake2b_init(crypto_blake2b_ctx *ctx)
608 {
609 crypto_blake2b_general_init(ctx, 64, 0, 0);
610 }
611
612 void crypto_blake2b_update(crypto_blake2b_ctx *ctx,
613 const u8 *message, size_t message_size)
614 {
615 if (message_size == 0) {
616 return;
617 }
618 // Align ourselves with block boundaries
619 size_t aligned = MIN(align(ctx->input_idx, 128), message_size);
620 blake2b_update(ctx, message, aligned);
621 message += aligned;
622 message_size -= aligned;
623
624 // Process the message block by block
625 FOR (i, 0, message_size >> 7) { // number of blocks
626 blake2b_end_block(ctx);
627 load64_le_buf(ctx->input, message, 16);
628 message += 128;
629 ctx->input_idx = 128;
630 }
631 message_size &= 127;
632
633 // remaining bytes
634 blake2b_update(ctx, message, message_size);
635 }
636
637 void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *hash)
638 {
639 // Pad the end of the block with zeroes
640 FOR (i, ctx->input_idx, 128) {
641 blake2b_set_input(ctx, 0, i);
642 }
643 blake2b_incr(ctx); // update the input offset
644 blake2b_compress(ctx, 1); // compress the last block
645 size_t nb_words = ctx->hash_size >> 3;
646 store64_le_buf(hash, ctx->hash, nb_words);
647 FOR (i, nb_words << 3, ctx->hash_size) {
648 hash[i] = (ctx->hash[i >> 3] >> (8 * (i & 7))) & 0xff;
649 }
650 WIPE_CTX(ctx);
651 }
652
653 void crypto_blake2b_general(u8 *hash , size_t hash_size,
654 const u8 *key , size_t key_size,
655 const u8 *message, size_t message_size)
656 {
657 crypto_blake2b_ctx ctx;
658 crypto_blake2b_general_init(&ctx, hash_size, key, key_size);
659 crypto_blake2b_update(&ctx, message, message_size);
660 crypto_blake2b_final(&ctx, hash);
661 }
662
663 void crypto_blake2b(u8 hash[64], const u8 *message, size_t message_size)
664 {
665 crypto_blake2b_general(hash, 64, 0, 0, message, message_size);
666 }
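
// Incremental use (illustrative sketch): the streaming interface above
// yields the same 64-byte digest as the one-shot crypto_blake2b().
// part1 and part2 stand in for arbitrary message fragments.
//
// crypto_blake2b_ctx ctx;
// crypto_blake2b_init  (&ctx);
// crypto_blake2b_update(&ctx, part1, part1_size);
// crypto_blake2b_update(&ctx, part2, part2_size);
// crypto_blake2b_final (&ctx, hash); // hash holds 64 bytes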
667
668 static void blake2b_vtable_init(void *ctx) {
669 crypto_blake2b_init(&((crypto_sign_ctx*)ctx)->hash);
670 }
671 static void blake2b_vtable_update(void *ctx, const u8 *m, size_t s) {
672 crypto_blake2b_update(&((crypto_sign_ctx*)ctx)->hash, m, s);
673 }
674 static void blake2b_vtable_final(void *ctx, u8 *h) {
675 crypto_blake2b_final(&((crypto_sign_ctx*)ctx)->hash, h);
676 }
677 const crypto_sign_vtable crypto_blake2b_vtable = {
678 crypto_blake2b,
679 blake2b_vtable_init,
680 blake2b_vtable_update,
681 blake2b_vtable_final,
682 sizeof(crypto_sign_ctx),
683 };
684
685 ////////////////
686 /// Argon2 i ///
687 ////////////////
688 // references to R, Z, Q etc. come from the spec
689
690 // Argon2 operates on 1024 byte blocks.
691 typedef struct { u64 a[128]; } block;
692
693 static void wipe_block(block *b)
694 {
695 volatile u64* a = b->a;
696 ZERO(a, 128);
697 }
698
699 // updates a Blake2 hash with a 32 bit word, little endian.
700 static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input)
701 {
702 u8 buf[4];
703 store32_le(buf, input);
704 crypto_blake2b_update(ctx, buf, 4);
705 WIPE_BUFFER(buf);
706 }
707
708 static void load_block(block *b, const u8 bytes[1024])
709 {
710 load64_le_buf(b->a, bytes, 128);
711 }
712
713 static void store_block(u8 bytes[1024], const block *b)
714 {
715 store64_le_buf(bytes, b->a, 128);
716 }
717
718 static void copy_block(block *o,const block*in){FOR(i,0,128)o->a[i] = in->a[i];}
719 static void xor_block(block *o,const block*in){FOR(i,0,128)o->a[i]^= in->a[i];}
720
721 // Hash with a virtually unlimited digest size.
722 // Doesn't extract more entropy than the base hash function.
723 // Mainly used for filling a whole kilobyte block with pseudo-random bytes.
724 // (One could use a stream cipher with a seed hash as the key, but
725 // this would introduce another dependency —and point of failure.)
726 static void extended_hash(u8 *digest, u32 digest_size,
727 const u8 *input , u32 input_size)
728 {
729 crypto_blake2b_ctx ctx;
730 crypto_blake2b_general_init(&ctx, MIN(digest_size, 64), 0, 0);
731 blake_update_32 (&ctx, digest_size);
732 crypto_blake2b_update (&ctx, input, input_size);
733 crypto_blake2b_final (&ctx, digest);
734
735 if (digest_size > 64) {
736 // the conversion to u64 avoids integer overflow on
737 // ludicrously big hash sizes.
738 u32 r = (u32)(((u64)digest_size + 31) >> 5) - 2;
739 u32 i = 1;
740 u32 in = 0;
741 u32 out = 32;
742 while (i < r) {
743 // Input and output overlap. This is intentional
744 crypto_blake2b(digest + out, digest + in, 64);
745 i += 1;
746 in += 32;
747 out += 32;
748 }
749 crypto_blake2b_general(digest + out, digest_size - (32 * r),
750 0, 0, // no key
751 digest + in , 64);
752 }
753 }
754
755 #define LSB(x) ((x) & 0xffffffff)
756 #define G(a, b, c, d) \
757 a += b + 2 * LSB(a) * LSB(b); d ^= a; d = rotr64(d, 32); \
758 c += d + 2 * LSB(c) * LSB(d); b ^= c; b = rotr64(b, 24); \
759 a += b + 2 * LSB(a) * LSB(b); d ^= a; d = rotr64(d, 16); \
760 c += d + 2 * LSB(c) * LSB(d); b ^= c; b = rotr64(b, 63)
761 #define ROUND(v0, v1, v2, v3, v4, v5, v6, v7, \
762 v8, v9, v10, v11, v12, v13, v14, v15) \
763 G(v0, v4, v8, v12); G(v1, v5, v9, v13); \
764 G(v2, v6, v10, v14); G(v3, v7, v11, v15); \
765 G(v0, v5, v10, v15); G(v1, v6, v11, v12); \
766 G(v2, v7, v8, v13); G(v3, v4, v9, v14)
767
768 // Core of the compression function G. Computes Z from R in place.
769 static void g_rounds(block *work_block)
770 {
771 // column rounds (work_block = Q)
772 for (int i = 0; i < 128; i += 16) {
773 ROUND(work_block->a[i ], work_block->a[i + 1],
774 work_block->a[i + 2], work_block->a[i + 3],
775 work_block->a[i + 4], work_block->a[i + 5],
776 work_block->a[i + 6], work_block->a[i + 7],
777 work_block->a[i + 8], work_block->a[i + 9],
778 work_block->a[i + 10], work_block->a[i + 11],
779 work_block->a[i + 12], work_block->a[i + 13],
780 work_block->a[i + 14], work_block->a[i + 15]);
781 }
782 // row rounds (work_block = Z)
783 for (int i = 0; i < 16; i += 2) {
784 ROUND(work_block->a[i ], work_block->a[i + 1],
785 work_block->a[i + 16], work_block->a[i + 17],
786 work_block->a[i + 32], work_block->a[i + 33],
787 work_block->a[i + 48], work_block->a[i + 49],
788 work_block->a[i + 64], work_block->a[i + 65],
789 work_block->a[i + 80], work_block->a[i + 81],
790 work_block->a[i + 96], work_block->a[i + 97],
791 work_block->a[i + 112], work_block->a[i + 113]);
792 }
793 }
794
795 // The compression function G (copy version for the first pass)
796 static void g_copy(block *result, const block *x, const block *y, block* tmp)
797 {
798 copy_block(tmp , x ); // tmp = X
799 xor_block (tmp , y ); // tmp = X ^ Y = R
800 copy_block(result, tmp); // result = R (only difference with g_xor)
801 g_rounds (tmp); // tmp = Z
802 xor_block (result, tmp); // result = R ^ Z
803 }
804
805 // The compression function G (xor version for subsequent passes)
806 static void g_xor(block *result, const block *x, const block *y, block *tmp)
807 {
808 copy_block(tmp , x ); // tmp = X
809 xor_block (tmp , y ); // tmp = X ^ Y = R
810 xor_block (result, tmp); // result = R ^ old (only difference with g_copy)
811 g_rounds (tmp); // tmp = Z
812 xor_block (result, tmp); // result = R ^ old ^ Z
813 }
814
815 // Unary version of the compression function.
816 // The missing argument is implied zero.
817 // Does the transformation in place.
818 static void unary_g(block *work_block, block *tmp)
819 {
820 // work_block == R
821 copy_block(tmp, work_block); // tmp = R
822 g_rounds (work_block); // work_block = Z
823 xor_block (work_block, tmp); // work_block = Z ^ R
824 }
825
826 // Argon2i uses a kind of stream cipher to determine which reference
827 // block it will take to synthesise the next block. This context holds
828 // that stream's state. (It's very similar to Chacha20. The block b
829 // is analogous to Chacha's own pool)
830 typedef struct {
831 block b;
832 u32 pass_number;
833 u32 slice_number;
834 u32 nb_blocks;
835 u32 nb_iterations;
836 u32 ctr;
837 u32 offset;
838 } gidx_ctx;
839
840 // The block in the context will determine array indices. To avoid
841 // timing attacks, it only depends on public information. No looking
842 // at a previous block to seed the next. This makes offline attacks
843 // easier, but timing attacks are the bigger threat in many settings.
844 static void gidx_refresh(gidx_ctx *ctx)
845 {
846 // seed the beginning of the block...
847 ctx->b.a[0] = ctx->pass_number;
848 ctx->b.a[1] = 0; // lane number (we have only one)
849 ctx->b.a[2] = ctx->slice_number;
850 ctx->b.a[3] = ctx->nb_blocks;
851 ctx->b.a[4] = ctx->nb_iterations;
852 ctx->b.a[5] = 1; // type: Argon2i
853 ctx->b.a[6] = ctx->ctr;
854 ZERO(ctx->b.a + 7, 121); // ...then zero the rest out
855
856 // Shuffle the block thus: ctx->b = G((G(ctx->b, zero)), zero)
857 // (G "square" function), to get cheap pseudo-random numbers.
858 block tmp;
859 unary_g(&ctx->b, &tmp);
860 unary_g(&ctx->b, &tmp);
861 wipe_block(&tmp);
862 }
863
864 static void gidx_init(gidx_ctx *ctx,
865 u32 pass_number, u32 slice_number,
866 u32 nb_blocks, u32 nb_iterations)
867 {
868 ctx->pass_number = pass_number;
869 ctx->slice_number = slice_number;
870 ctx->nb_blocks = nb_blocks;
871 ctx->nb_iterations = nb_iterations;
872 ctx->ctr = 0;
873
874 // Offset from the beginning of the segment. For the first slice
875 // of the first pass, we start at the *third* block, so the offset
876 // starts at 2, not 0.
877 if (pass_number != 0 || slice_number != 0) {
878 ctx->offset = 0;
879 } else {
880 ctx->offset = 2;
881 ctx->ctr++; // Compensates for missed lazy creation
882 gidx_refresh(ctx); // at the start of gidx_next()
883 }
884 }
885
886 static u32 gidx_next(gidx_ctx *ctx)
887 {
888 // lazily creates the offset block we need
889 if ((ctx->offset & 127) == 0) {
890 ctx->ctr++;
891 gidx_refresh(ctx);
892 }
893 u32 index = ctx->offset & 127; // save index for current call
894 u32 offset = ctx->offset; // save offset for current call
895 ctx->offset++; // update offset for next call
896
897 // Computes the area size.
898 // Pass 0 : all already finished segments plus already constructed
899 // blocks in this segment
900 // Pass 1+: 3 last segments plus already constructed
901 // blocks in this segment. THE SPEC SUGGESTS OTHERWISE.
902 // I CONFORM TO THE REFERENCE IMPLEMENTATION.
903 int first_pass = ctx->pass_number == 0;
904 u32 slice_size = ctx->nb_blocks >> 2;
905 u32 nb_segments = first_pass ? ctx->slice_number : 3;
906 u32 area_size = nb_segments * slice_size + offset - 1;
907
908 // Computes the starting position of the reference area.
909 // CONTRARY TO WHAT THE SPEC SUGGESTS, IT STARTS AT THE
910 // NEXT SEGMENT, NOT THE NEXT BLOCK.
911 u32 next_slice = ((ctx->slice_number + 1) & 3) * slice_size;
912 u32 start_pos = first_pass ? 0 : next_slice;
913
914 // Generate offset from J1 (no need for J2, there's only one lane)
915 u64 j1 = ctx->b.a[index] & 0xffffffff; // pseudo-random number
916 u64 x = (j1 * j1) >> 32;
917 u64 y = (area_size * x) >> 32;
918 u64 z = (area_size - 1) - y;
919 u64 ref = start_pos + z; // ref < 2 * nb_blocks
920 return (u32)(ref < ctx->nb_blocks ? ref : ref - ctx->nb_blocks);
921 }
922
923 // Main algorithm
924 void crypto_argon2i_general(u8 *hash, u32 hash_size,
925 void *work_area, u32 nb_blocks,
926 u32 nb_iterations,
927 const u8 *password, u32 password_size,
928 const u8 *salt, u32 salt_size,
929 const u8 *key, u32 key_size,
930 const u8 *ad, u32 ad_size)
931 {
932 // work area seen as blocks (must be suitably aligned)
933 block *blocks = (block*)work_area;
934 {
935 crypto_blake2b_ctx ctx;
936 crypto_blake2b_init(&ctx);
937
938 blake_update_32 (&ctx, 1 ); // p: number of threads
939 blake_update_32 (&ctx, hash_size );
940 blake_update_32 (&ctx, nb_blocks );
941 blake_update_32 (&ctx, nb_iterations);
942 blake_update_32 (&ctx, 0x13 ); // v: version number
943 blake_update_32 (&ctx, 1 ); // y: Argon2i
944 blake_update_32 (&ctx, password_size);
945 crypto_blake2b_update(&ctx, password, password_size);
946 blake_update_32 (&ctx, salt_size);
947 crypto_blake2b_update(&ctx, salt, salt_size);
948 blake_update_32 (&ctx, key_size);
949 crypto_blake2b_update(&ctx, key, key_size);
950 blake_update_32 (&ctx, ad_size);
951 crypto_blake2b_update(&ctx, ad, ad_size);
952
953 u8 initial_hash[72]; // 64 bytes plus 2 words for future hashes
954 crypto_blake2b_final(&ctx, initial_hash);
955
956 // fill first 2 blocks
957 block tmp_block;
958 u8 hash_area[1024];
959 store32_le(initial_hash + 64, 0); // first additional word
960 store32_le(initial_hash + 68, 0); // second additional word
961 extended_hash(hash_area, 1024, initial_hash, 72);
962 load_block(&tmp_block, hash_area);
963 copy_block(blocks, &tmp_block);
964
965 store32_le(initial_hash + 64, 1); // slight modification
966 extended_hash(hash_area, 1024, initial_hash, 72);
967 load_block(&tmp_block, hash_area);
968 copy_block(blocks + 1, &tmp_block);
969
970 WIPE_BUFFER(initial_hash);
971 WIPE_BUFFER(hash_area);
972 wipe_block(&tmp_block);
973 }
974
975 // Actual number of blocks
976 nb_blocks -= nb_blocks & 3; // round down to 4 p (p == 1 thread)
977 const u32 segment_size = nb_blocks >> 2;
978
979 // fill (then re-fill) the rest of the blocks
980 block tmp;
981 gidx_ctx ctx; // public information, no need to wipe
982 FOR_T (u32, pass_number, 0, nb_iterations) {
983 int first_pass = pass_number == 0;
984
985 FOR_T (u32, segment, 0, 4) {
986 gidx_init(&ctx, pass_number, segment, nb_blocks, nb_iterations);
987
988 // On the first segment of the first pass,
989 // blocks 0 and 1 are already filled.
990 // We use the offset to skip them.
991 u32 start_offset = first_pass && segment == 0 ? 2 : 0;
992 u32 segment_start = segment * segment_size + start_offset;
993 u32 segment_end = (segment + 1) * segment_size;
994 FOR_T (u32, current_block, segment_start, segment_end) {
995 u32 reference_block = gidx_next(&ctx);
996 u32 previous_block = current_block == 0
997 ? nb_blocks - 1
998 : current_block - 1;
999 block *c = blocks + current_block;
1000 block *p = blocks + previous_block;
1001 block *r = blocks + reference_block;
1002 if (first_pass) { g_copy(c, p, r, &tmp); }
1003 else { g_xor (c, p, r, &tmp); }
1004 }
1005 }
1006 }
1007 wipe_block(&tmp);
1008 u8 final_block[1024];
1009 store_block(final_block, blocks + (nb_blocks - 1));
1010
1011 // wipe work area
1012 volatile u64 *p = (u64*)work_area;
1013 ZERO(p, 128 * nb_blocks);
1014
1015 // hash the very last block with H' into the output hash
1016 extended_hash(hash, hash_size, final_block, 1024);
1017 WIPE_BUFFER(final_block);
1018 }
1019
1020 void crypto_argon2i(u8 *hash, u32 hash_size,
1021 void *work_area, u32 nb_blocks, u32 nb_iterations,
1022 const u8 *password, u32 password_size,
1023 const u8 *salt, u32 salt_size)
1024 {
1025 crypto_argon2i_general(hash, hash_size, work_area, nb_blocks, nb_iterations,
1026 password, password_size, salt , salt_size, 0,0,0,0);
1027 }
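
// Usage sketch (illustrative, sizes are examples only): the caller
// provides the work area, which must hold nb_blocks blocks of 1024
// bytes each (see the block type above) and be suitably aligned;
// memory from malloc() satisfies that alignment. password, salt and
// their sizes are placeholders.
//
// u8  hash[32];
// u32 nb_blocks     = 100000; // about 100 megabytes of work area
// u32 nb_iterations = 3;
// void *work_area   = malloc((size_t)nb_blocks * 1024);
// crypto_argon2i(hash, sizeof(hash), work_area, nb_blocks, nb_iterations,
//                password, password_size, salt, salt_size);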
1028
1029 ////////////////////////////////////
1030 /// Arithmetic modulo 2^255 - 19 ///
1031 ////////////////////////////////////
1032 // Originally taken from SUPERCOP's ref10 implementation.
1033 // A bit bigger than TweetNaCl, over 4 times faster.
1034
1035 // field element
1036 typedef i32 fe[10];
1037
1038 // field constants
1039 //
1040 // fe_one : 1
1041 // sqrtm1 : sqrt(-1)
1042 // d : -121665 / 121666
1043 // D2 : 2 * -121665 / 121666
1044 // lop_x, lop_y: low order point in Edwards coordinates
1045 // ufactor : -sqrt(-1) * 2
1046 // A2 : 486662^2 (A squared)
1047 static const fe fe_one = {1};
1048 static const fe sqrtm1 = {-32595792, -7943725, 9377950, 3500415, 12389472,
1049 -272473, -25146209, -2005654, 326686, 11406482,};
1050 static const fe d = {-10913610, 13857413, -15372611, 6949391, 114729,
1051 -8787816, -6275908, -3247719, -18696448, -12055116,};
1052 static const fe D2 = {-21827239, -5839606, -30745221, 13898782, 229458,
1053 15978800, -12551817, -6495438, 29715968, 9444199,};
1054 static const fe lop_x = {21352778, 5345713, 4660180, -8347857, 24143090,
1055 14568123, 30185756, -12247770, -33528939, 8345319,};
1056 static const fe lop_y = {-6952922, -1265500, 6862341, -7057498, -4037696,
1057 -5447722, 31680899, -15325402, -19365852, 1569102,};
1058 static const fe ufactor = {-1917299, 15887451, -18755900, -7000830, -24778944,
1059 544946, -16816446, 4011309, -653372, 10741468,};
1060 static const fe A2 = {12721188, 3529, 0, 0, 0, 0, 0, 0, 0, 0,};
1061
1062 static void fe_0(fe h) { ZERO(h , 10); }
1063 static void fe_1(fe h) { h[0] = 1; ZERO(h+1, 9); }
1064
1065 static void fe_copy(fe h,const fe f ){FOR(i,0,10) h[i] = f[i]; }
1066 static void fe_neg (fe h,const fe f ){FOR(i,0,10) h[i] = -f[i]; }
1067 static void fe_add (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] + g[i];}
1068 static void fe_sub (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] - g[i];}
1069
1070 static void fe_cswap(fe f, fe g, int b)
1071 {
1072 i32 mask = -b; // -1 = 0xffffffff
1073 FOR (i, 0, 10) {
1074 i32 x = (f[i] ^ g[i]) & mask;
1075 f[i] = f[i] ^ x;
1076 g[i] = g[i] ^ x;
1077 }
1078 }
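
// How the constant-time swap works: b is 0 or 1, so mask is either all
// zeroes or all ones. x keeps the bits where f and g differ only when
// mask is all ones; XORing x back into both f and g then either leaves
// them unchanged (b == 0) or exchanges them (b == 1), with no
// secret-dependent branch.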
1079
1080 static void fe_ccopy(fe f, const fe g, int b)
1081 {
1082 i32 mask = -b; // -1 = 0xffffffff
1083 FOR (i, 0, 10) {
1084 i32 x = (f[i] ^ g[i]) & mask;
1085 f[i] = f[i] ^ x;
1086 }
1087 }
1088
1089
1090 // Signed carry propagation
1091 // ------------------------
1092 //
1093 // Let t be a number. It can be uniquely decomposed thus:
1094 //
1095 // t = h*2^26 + l
1096 // such that -2^25 <= l < 2^25
1097 //
1098 // Let c = (t + 2^25) / 2^26 (rounded down)
1099 // c = (h*2^26 + l + 2^25) / 2^26 (rounded down)
1100 // c = h + (l + 2^25) / 2^26 (rounded down)
1101 // c = h (exactly)
1102 // Because 0 <= l + 2^25 < 2^26
1103 //
1104 // Let u = t - c*2^26
1105 // u = h*2^26 + l - h*2^26
1106 // u = l
1107 // Therefore, -2^25 <= u < 2^25
1108 //
1109 // Additionally, if |t| < x, then |h| < x/2^26 (rounded down)
1110 //
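// Worked example (illustrative): t = 100000000 decomposes as h = 1 and
// l = 32891136, since 1 * 2^26 + 32891136 = 100000000 and |l| < 2^25.
// Then c = (t + 2^25) >> 26 = 1 = h, and u = t - c*2^26 = 32891136 = l,
// as claimed above.
//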
1111 // Notations:
1112 // - In C, 1<<25 means 2^25.
1113 // - In C, x>>25 means floor(x / (2^25)).
1114 // - All of the above applies with 25 & 24 as well as 26 & 25.
1115 //
1116 //
1117 // Note on negative right shifts
1118 // -----------------------------
1119 //
1120 // In C, x >> n, where x is a negative integer, is implementation
1121 // defined. In practice, all platforms do arithmetic shift, which is
1122 // equivalent to division by 2^26, rounded down. Some compilers, like
1123 // GCC, even guarantee it.
1124 //
1125 // If we ever stumble upon a platform that does not propagate the sign
1126 // bit (we won't), visible failures will show at the slightest test, and
1127 // the signed shifts can be replaced by the following:
1128 //
1129 // typedef struct { i64 x:39; } s25;
1130 // typedef struct { i64 x:38; } s26;
1131 // i64 shift25(i64 x) { s25 s; s.x = ((u64)x)>>25; return s.x; }
1132 // i64 shift26(i64 x) { s26 s; s.x = ((u64)x)>>26; return s.x; }
1133 //
1134 // Current compilers cannot optimise this, causing a 30% drop in
1135 // performance. Fairly expensive for something that never happens.
1136 //
1137 //
1138 // Precondition
1139 // ------------
1140 //
1141 // |t0| < 2^63
1142 // |t1|..|t9| < 2^62
1143 //
1144 // Algorithm
1145 // ---------
1146 // c = t0 + 2^25 / 2^26 -- |c| <= 2^36
1147 // t0 -= c * 2^26 -- |t0| <= 2^25
1148 // t1 += c -- |t1| <= 2^63
1149 //
1150 // c = t4 + 2^25 / 2^26 -- |c| <= 2^36
1151 // t4 -= c * 2^26 -- |t4| <= 2^25
1152 // t5 += c -- |t5| <= 2^63
1153 //
1154 // c = t1 + 2^24 / 2^25 -- |c| <= 2^38
1155 // t1 -= c * 2^25 -- |t1| <= 2^24
1156 // t2 += c -- |t2| <= 2^63
1157 //
1158 // c = t5 + 2^24 / 2^25 -- |c| <= 2^38
1159 // t5 -= c * 2^25 -- |t5| <= 2^24
1160 // t6 += c -- |t6| <= 2^63
1161 //
1162 // c = t2 + 2^25 / 2^26 -- |c| <= 2^37
1163 // t2 -= c * 2^26 -- |t2| <= 2^25 < 1.1 * 2^25 (final t2)
1164 // t3 += c -- |t3| <= 2^63
1165 //
1166 // c = t6 + 2^25 / 2^26 -- |c| <= 2^37
1167 // t6 -= c * 2^26 -- |t6| <= 2^25 < 1.1 * 2^25 (final t6)
1168 // t7 += c -- |t7| <= 2^63
1169 //
1170 // c = t3 + 2^24 / 2^25 -- |c| <= 2^38
1171 // t3 -= c * 2^25 -- |t3| <= 2^24 < 1.1 * 2^24 (final t3)
1172 // t4 += c -- |t4| <= 2^25 + 2^38 < 2^39
1173 //
1174 // c = t7 + 2^24 / 2^25 -- |c| <= 2^38
1175 // t7 -= c * 2^25 -- |t7| <= 2^24 < 1.1 * 2^24 (final t7)
1176 // t8 += c -- |t8| <= 2^63
1177 //
1178 // c = t4 + 2^25 / 2^26 -- |c| <= 2^13
1179 // t4 -= c * 2^26 -- |t4| <= 2^25 < 1.1 * 2^25 (final t4)
1180 // t5 += c -- |t5| <= 2^24 + 2^13 < 1.1 * 2^24 (final t5)
1181 //
1182 // c = t8 + 2^25 / 2^26 -- |c| <= 2^37
1183 // t8 -= c * 2^26 -- |t8| <= 2^25 < 1.1 * 2^25 (final t8)
1184 // t9 += c -- |t9| <= 2^63
1185 //
1186 // c = t9 + 2^24 / 2^25 -- |c| <= 2^38
1187 // t9 -= c * 2^25 -- |t9| <= 2^24 < 1.1 * 2^24 (final t9)
1188 // t0 += c * 19 -- |t0| <= 2^25 + 2^38*19 < 2^44
1189 //
1190 // c = t0 + 2^25 / 2^26 -- |c| <= 2^18
1191 // t0 -= c * 2^26 -- |t0| <= 2^25 < 1.1 * 2^25 (final t0)
1192 // t1 += c -- |t1| <= 2^24 + 2^18 < 1.1 * 2^24 (final t1)
1193 //
1194 // Postcondition
1195 // -------------
1196 // |t0|, |t2|, |t4|, |t6|, |t8| < 1.1 * 2^25
1197 // |t1|, |t3|, |t5|, |t7|, |t9| < 1.1 * 2^24
1198 #define FE_CARRY \
1199 i64 c; \
1200 c = (t0 + ((i64)1<<25)) >> 26; t0 -= c * ((i64)1 << 26); t1 += c; \
1201 c = (t4 + ((i64)1<<25)) >> 26; t4 -= c * ((i64)1 << 26); t5 += c; \
1202 c = (t1 + ((i64)1<<24)) >> 25; t1 -= c * ((i64)1 << 25); t2 += c; \
1203 c = (t5 + ((i64)1<<24)) >> 25; t5 -= c * ((i64)1 << 25); t6 += c; \
1204 c = (t2 + ((i64)1<<25)) >> 26; t2 -= c * ((i64)1 << 26); t3 += c; \
1205 c = (t6 + ((i64)1<<25)) >> 26; t6 -= c * ((i64)1 << 26); t7 += c; \
1206 c = (t3 + ((i64)1<<24)) >> 25; t3 -= c * ((i64)1 << 25); t4 += c; \
1207 c = (t7 + ((i64)1<<24)) >> 25; t7 -= c * ((i64)1 << 25); t8 += c; \
1208 c = (t4 + ((i64)1<<25)) >> 26; t4 -= c * ((i64)1 << 26); t5 += c; \
1209 c = (t8 + ((i64)1<<25)) >> 26; t8 -= c * ((i64)1 << 26); t9 += c; \
1210 c = (t9 + ((i64)1<<24)) >> 25; t9 -= c * ((i64)1 << 25); t0 += c * 19; \
1211 c = (t0 + ((i64)1<<25)) >> 26; t0 -= c * ((i64)1 << 26); t1 += c; \
1212 h[0]=(i32)t0; h[1]=(i32)t1; h[2]=(i32)t2; h[3]=(i32)t3; h[4]=(i32)t4; \
1213 h[5]=(i32)t5; h[6]=(i32)t6; h[7]=(i32)t7; h[8]=(i32)t8; h[9]=(i32)t9
1214
1215 static void fe_frombytes(fe h, const u8 s[32])
1216 {
1217 i64 t0 = load32_le(s); // t0 < 2^32
1218 i64 t1 = load24_le(s + 4) << 6; // t1 < 2^30
1219 i64 t2 = load24_le(s + 7) << 5; // t2 < 2^29
1220 i64 t3 = load24_le(s + 10) << 3; // t3 < 2^27
1221 i64 t4 = load24_le(s + 13) << 2; // t4 < 2^26
1222 i64 t5 = load32_le(s + 16); // t5 < 2^32
1223 i64 t6 = load24_le(s + 20) << 7; // t6 < 2^31
1224 i64 t7 = load24_le(s + 23) << 5; // t7 < 2^29
1225 i64 t8 = load24_le(s + 26) << 4; // t8 < 2^28
1226 i64 t9 = (load24_le(s + 29) & 0x7fffff) << 2; // t9 < 2^25
1227 FE_CARRY; // Carry recondition OK
1228 }
1229
1230 // Precondition
1231 // |h[0]|, |h[2]|, |h[4]|, |h[6]|, |h[8]| < 1.1 * 2^25
1232 // |h[1]|, |h[3]|, |h[5]|, |h[7]|, |h[9]| < 1.1 * 2^24
1233 //
1234 // Therefore, |h| < 2^255-19
1235 // There are two possibilities:
1236 //
1237 // - If h is positive, all we need to do is reduce its individual
1238 // limbs down to their tight positive range.
1239 // - If h is negative, we also need to add 2^255-19 to it.
1240 // Or just remove 19 and chop off any excess bit.
1241 static void fe_tobytes(u8 s[32], const fe h)
1242 {
1243 i32 t[10];
1244 COPY(t, h, 10);
1245 i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25;
1246 // |t9| < 1.1 * 2^24
1247 // -1.1 * 2^24 < t9 < 1.1 * 2^24
1248 // -21 * 2^24 < 19 * t9 < 21 * 2^24
1249 // -2^29 < 19 * t9 + 2^24 < 2^29
1250 // -2^29 / 2^25 < (19 * t9 + 2^24) / 2^25 < 2^29 / 2^25
1251 // -16 < (19 * t9 + 2^24) / 2^25 < 16
1252 FOR (i, 0, 5) {
1253 q += t[2*i ]; q >>= 26; // q = 0 or -1
1254 q += t[2*i+1]; q >>= 25; // q = 0 or -1
1255 }
1256 // q = 0 iff h >= 0
1257 // q = -1 iff h < 0
1258 // Adding q * 19 to h reduces h to its proper range.
1259 q *= 19; // Shift carry back to the beginning
1260 FOR (i, 0, 5) {
1261 t[i*2 ] += q; q = t[i*2 ] >> 26; t[i*2 ] -= q * ((i32)1 << 26);
1262 t[i*2+1] += q; q = t[i*2+1] >> 25; t[i*2+1] -= q * ((i32)1 << 25);
1263 }
1264 // h is now fully reduced, and q represents the excess bit.
1265
1266 store32_le(s + 0, ((u32)t[0] >> 0) | ((u32)t[1] << 26));
1267 store32_le(s + 4, ((u32)t[1] >> 6) | ((u32)t[2] << 19));
1268 store32_le(s + 8, ((u32)t[2] >> 13) | ((u32)t[3] << 13));
1269 store32_le(s + 12, ((u32)t[3] >> 19) | ((u32)t[4] << 6));
1270 store32_le(s + 16, ((u32)t[5] >> 0) | ((u32)t[6] << 25));
1271 store32_le(s + 20, ((u32)t[6] >> 7) | ((u32)t[7] << 19));
1272 store32_le(s + 24, ((u32)t[7] >> 13) | ((u32)t[8] << 12));
1273 store32_le(s + 28, ((u32)t[8] >> 20) | ((u32)t[9] << 6));
1274
1275 WIPE_BUFFER(t);
1276 }
1277
1278 // Precondition
1279 // -------------
1280 // |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
1281 // |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
1282 //
1283 // |g0|, |g2|, |g4|, |g6|, |g8| < 1.65 * 2^26
1284 // |g1|, |g3|, |g5|, |g7|, |g9| < 1.65 * 2^25
1285 static void fe_mul_small(fe h, const fe f, i32 g)
1286 {
1287 i64 t0 = f[0] * (i64) g; i64 t1 = f[1] * (i64) g;
1288 i64 t2 = f[2] * (i64) g; i64 t3 = f[3] * (i64) g;
1289 i64 t4 = f[4] * (i64) g; i64 t5 = f[5] * (i64) g;
1290 i64 t6 = f[6] * (i64) g; i64 t7 = f[7] * (i64) g;
1291 i64 t8 = f[8] * (i64) g; i64 t9 = f[9] * (i64) g;
1292 // |t0|, |t2|, |t4|, |t6|, |t8| < 1.65 * 2^26 * 2^31 < 2^58
1293 // |t1|, |t3|, |t5|, |t7|, |t9| < 1.65 * 2^25 * 2^31 < 2^57
1294
1295 FE_CARRY; // Carry precondition OK
1296 }
1297
1298 // Precondition
1299 // -------------
1300 // |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
1301 // |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
1302 //
1303 // |g0|, |g2|, |g4|, |g6|, |g8| < 1.65 * 2^26
1304 // |g1|, |g3|, |g5|, |g7|, |g9| < 1.65 * 2^25
1305 static void fe_mul(fe h, const fe f, const fe g)
1306 {
1307 // Everything is unrolled and put in temporary variables.
1308 // We could roll the loop, but that would make curve25519 twice as slow.
1309 i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
1310 i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
1311 i32 g0 = g[0]; i32 g1 = g[1]; i32 g2 = g[2]; i32 g3 = g[3]; i32 g4 = g[4];
1312 i32 g5 = g[5]; i32 g6 = g[6]; i32 g7 = g[7]; i32 g8 = g[8]; i32 g9 = g[9];
1313 i32 F1 = f1*2; i32 F3 = f3*2; i32 F5 = f5*2; i32 F7 = f7*2; i32 F9 = f9*2;
1314 i32 G1 = g1*19; i32 G2 = g2*19; i32 G3 = g3*19;
1315 i32 G4 = g4*19; i32 G5 = g5*19; i32 G6 = g6*19;
1316 i32 G7 = g7*19; i32 G8 = g8*19; i32 G9 = g9*19;
1317 // |F1|, |F3|, |F5|, |F7|, |F9| < 1.65 * 2^26
1318 // |G0|, |G2|, |G4|, |G6|, |G8| < 2^31
1319 // |G1|, |G3|, |G5|, |G7|, |G9| < 2^30
1320
1321 i64 t0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6
1322 + F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1;
1323 i64 t1 = f0*(i64)g1 + f1*(i64)g0 + f2*(i64)G9 + f3*(i64)G8 + f4*(i64)G7
1324 + f5*(i64)G6 + f6*(i64)G5 + f7*(i64)G4 + f8*(i64)G3 + f9*(i64)G2;
1325 i64 t2 = f0*(i64)g2 + F1*(i64)g1 + f2*(i64)g0 + F3*(i64)G9 + f4*(i64)G8
1326 + F5*(i64)G7 + f6*(i64)G6 + F7*(i64)G5 + f8*(i64)G4 + F9*(i64)G3;
1327 i64 t3 = f0*(i64)g3 + f1*(i64)g2 + f2*(i64)g1 + f3*(i64)g0 + f4*(i64)G9
1328 + f5*(i64)G8 + f6*(i64)G7 + f7*(i64)G6 + f8*(i64)G5 + f9*(i64)G4;
1329 i64 t4 = f0*(i64)g4 + F1*(i64)g3 + f2*(i64)g2 + F3*(i64)g1 + f4*(i64)g0
1330 + F5*(i64)G9 + f6*(i64)G8 + F7*(i64)G7 + f8*(i64)G6 + F9*(i64)G5;
1331 i64 t5 = f0*(i64)g5 + f1*(i64)g4 + f2*(i64)g3 + f3*(i64)g2 + f4*(i64)g1
1332 + f5*(i64)g0 + f6*(i64)G9 + f7*(i64)G8 + f8*(i64)G7 + f9*(i64)G6;
1333 i64 t6 = f0*(i64)g6 + F1*(i64)g5 + f2*(i64)g4 + F3*(i64)g3 + f4*(i64)g2
1334 + F5*(i64)g1 + f6*(i64)g0 + F7*(i64)G9 + f8*(i64)G8 + F9*(i64)G7;
1335 i64 t7 = f0*(i64)g7 + f1*(i64)g6 + f2*(i64)g5 + f3*(i64)g4 + f4*(i64)g3
1336 + f5*(i64)g2 + f6*(i64)g1 + f7*(i64)g0 + f8*(i64)G9 + f9*(i64)G8;
1337 i64 t8 = f0*(i64)g8 + F1*(i64)g7 + f2*(i64)g6 + F3*(i64)g5 + f4*(i64)g4
1338 + F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9;
1339 i64 t9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5
1340 + f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0;
1341 // t0 < 0.67 * 2^61
1342 // t1 < 0.41 * 2^61
1343 // t2 < 0.52 * 2^61
1344 // t3 < 0.32 * 2^61
1345 // t4 < 0.38 * 2^61
1346 // t5 < 0.22 * 2^61
1347 // t6 < 0.23 * 2^61
1348 // t7 < 0.13 * 2^61
1349 // t8 < 0.09 * 2^61
1350 // t9 < 0.03 * 2^61
1351
1352 FE_CARRY; // Everything below 2^62, Carry precondition OK
1353 }
1354
1355 // Precondition
1356 // -------------
1357 // |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
1358 // |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
1359 //
1360 // Note: we could use fe_mul() for this, but this is significantly faster
1361 static void fe_sq(fe h, const fe f)
1362 {
1363 i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
1364 i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
1365 i32 f0_2 = f0*2; i32 f1_2 = f1*2; i32 f2_2 = f2*2; i32 f3_2 = f3*2;
1366 i32 f4_2 = f4*2; i32 f5_2 = f5*2; i32 f6_2 = f6*2; i32 f7_2 = f7*2;
1367 i32 f5_38 = f5*38; i32 f6_19 = f6*19; i32 f7_38 = f7*38;
1368 i32 f8_19 = f8*19; i32 f9_38 = f9*38;
1369 // |f0_2| , |f2_2| , |f4_2| , |f6_2| , |f8_2| < 1.65 * 2^27
1370 // |f1_2| , |f3_2| , |f5_2| , |f7_2| , |f9_2| < 1.65 * 2^26
1371 // |f5_38|, |f6_19|, |f7_38|, |f8_19|, |f9_38| < 2^31
1372
1373 i64 t0 = f0 *(i64)f0 + f1_2*(i64)f9_38 + f2_2*(i64)f8_19
1374 + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5 *(i64)f5_38;
1375 i64 t1 = f0_2*(i64)f1 + f2 *(i64)f9_38 + f3_2*(i64)f8_19
1376 + f4 *(i64)f7_38 + f5_2*(i64)f6_19;
1377 i64 t2 = f0_2*(i64)f2 + f1_2*(i64)f1 + f3_2*(i64)f9_38
1378 + f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6 *(i64)f6_19;
1379 i64 t3 = f0_2*(i64)f3 + f1_2*(i64)f2 + f4 *(i64)f9_38
1380 + f5_2*(i64)f8_19 + f6 *(i64)f7_38;
1381 i64 t4 = f0_2*(i64)f4 + f1_2*(i64)f3_2 + f2 *(i64)f2
1382 + f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7 *(i64)f7_38;
1383 i64 t5 = f0_2*(i64)f5 + f1_2*(i64)f4 + f2_2*(i64)f3
1384 + f6 *(i64)f9_38 + f7_2*(i64)f8_19;
1385 i64 t6 = f0_2*(i64)f6 + f1_2*(i64)f5_2 + f2_2*(i64)f4
1386 + f3_2*(i64)f3 + f7_2*(i64)f9_38 + f8 *(i64)f8_19;
1387 i64 t7 = f0_2*(i64)f7 + f1_2*(i64)f6 + f2_2*(i64)f5
1388 + f3_2*(i64)f4 + f8 *(i64)f9_38;
1389 i64 t8 = f0_2*(i64)f8 + f1_2*(i64)f7_2 + f2_2*(i64)f6
1390 + f3_2*(i64)f5_2 + f4 *(i64)f4 + f9 *(i64)f9_38;
1391 i64 t9 = f0_2*(i64)f9 + f1_2*(i64)f8 + f2_2*(i64)f7
1392 + f3_2*(i64)f6 + f4 *(i64)f5_2;
1393 // t0 < 0.67 * 2^61
1394 // t1 < 0.41 * 2^61
1395 // t2 < 0.52 * 2^61
1396 // t3 < 0.32 * 2^61
1397 // t4 < 0.38 * 2^61
1398 // t5 < 0.22 * 2^61
1399 // t6 < 0.23 * 2^61
1400 // t7 < 0.13 * 2^61
1401 // t8 < 0.09 * 2^61
1402 // t9 < 0.03 * 2^61
1403
1404 FE_CARRY;
1405 }
1406
1407 // h = 2 * (f^2)
1408 //
1409 // Precondition
1410 // -------------
1411 // |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
1412 // |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
1413 //
1414 // Note: we could implement fe_sq2() by copying fe_sq(), multiplying
1415 // each limb by 2, and *then* performing the carry. This saves one carry.
1416 // However, doing so with the stated preconditions does not work (t2
1417 // would overflow). There are 3 ways to solve this:
1418 //
1419 // 1. Show that t2 actually never overflows (it really does not).
1420 // 2. Accept an additional carry, at a small loss of performance.
1421 // 3. Make sure the input of fe_sq2() is freshly carried.
1422 //
1423 // SUPERCOP ref10 relies on (1).
1424 // Monocypher chose (2) and (3), mostly to save code.
1425 static void fe_sq2(fe h, const fe f)
1426 {
1427 fe_sq(h, f);
1428 fe_mul_small(h, h, 2);
1429 }
1430
1431 // This could be simplified, but it would be slower
1432 static void fe_pow22523(fe out, const fe z)
1433 {
1434 fe t0, t1, t2;
1435 fe_sq(t0, z);
1436 fe_sq(t1,t0); fe_sq(t1, t1); fe_mul(t1, z, t1);
1437 fe_mul(t0, t0, t1);
1438 fe_sq(t0, t0); fe_mul(t0, t1, t0);
1439 fe_sq(t1, t0); FOR (i, 1, 5) fe_sq(t1, t1); fe_mul(t0, t1, t0);
1440 fe_sq(t1, t0); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t1, t1, t0);
1441 fe_sq(t2, t1); FOR (i, 1, 20) fe_sq(t2, t2); fe_mul(t1, t2, t1);
1442 fe_sq(t1, t1); FOR (i, 1, 10) fe_sq(t1, t1); fe_mul(t0, t1, t0);
1443 fe_sq(t1, t0); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t1, t1, t0);
1444 fe_sq(t2, t1); FOR (i, 1, 100) fe_sq(t2, t2); fe_mul(t1, t2, t1);
1445 fe_sq(t1, t1); FOR (i, 1, 50) fe_sq(t1, t1); fe_mul(t0, t1, t0);
1446 fe_sq(t0, t0); FOR (i, 1, 2) fe_sq(t0, t0); fe_mul(out, t0, z);
1447 WIPE_BUFFER(t0);
1448 WIPE_BUFFER(t1);
1449 WIPE_BUFFER(t2);
1450 }
1451
1452 // Inverting means multiplying by 2^255 - 21
1453 // 2^255 - 21 = (2^252 - 3) * 8 + 3
1454 // So we reuse the multiplication chain of fe_pow22523
1455 static void fe_invert(fe out, const fe z)
1456 {
1457 fe tmp;
1458 fe_pow22523(tmp, z);
1459 // out = tmp^8 * z^3
1460 fe_sq(tmp, tmp); // 0
1461 fe_sq(tmp, tmp); fe_mul(tmp, tmp, z); // 1
1462 fe_sq(tmp, tmp); fe_mul(out, tmp, z); // 1
1463 WIPE_BUFFER(tmp);
1464 }
1465
1466 // Parity check. Returns 0 if even, 1 if odd
1467 static int fe_isodd(const fe f)
1468 {
1469 u8 s[32];
1470 fe_tobytes(s, f);
1471 u8 isodd = s[0] & 1;
1472 WIPE_BUFFER(s);
1473 return isodd;
1474 }
1475
1476 // Returns 1 if equal, 0 if not equal
1477 static int fe_isequal(const fe f, const fe g)
1478 {
1479 u8 fs[32];
1480 u8 gs[32];
1481 fe_tobytes(fs, f);
1482 fe_tobytes(gs, g);
1483 int isdifferent = crypto_verify32(fs, gs);
1484 WIPE_BUFFER(fs);
1485 WIPE_BUFFER(gs);
1486 return 1 + isdifferent;
1487 }
1488
1489 // Inverse square root.
1490 // Returns true if x is a non zero square, false otherwise.
1491 // After the call:
1492 // isr = sqrt(1/x) if x is non-zero square.
1493 // isr = sqrt(sqrt(-1)/x) if x is not a square.
1494 // isr = 0 if x is zero.
1495 // We do not guarantee the sign of the square root.
1496 //
1497 // Notes:
1498 // Let quartic = x^((p-1)/4)
1499 //
1500 // x^((p-1)/2) = chi(x)
1501 // quartic^2 = chi(x)
1502 // quartic = sqrt(chi(x))
1503 // quartic = 1 or -1 or sqrt(-1) or -sqrt(-1)
1504 //
1505 // Note that x is a square if quartic is 1 or -1
1506 // There are 4 cases to consider:
1507 //
1508 // if quartic = 1 (x is a square)
1509 // then x^((p-1)/4) = 1
1510 // x^((p-5)/4) * x = 1
1511 // x^((p-5)/4) = 1/x
1512 // x^((p-5)/8) = sqrt(1/x) or -sqrt(1/x)
1513 //
1514 // if quartic = -1 (x is a square)
1515 // then x^((p-1)/4) = -1
1516 // x^((p-5)/4) * x = -1
1517 // x^((p-5)/4) = -1/x
1518 // x^((p-5)/8) = sqrt(-1) / sqrt(x)
1519 // x^((p-5)/8) * sqrt(-1) = sqrt(-1)^2 / sqrt(x)
1520 // x^((p-5)/8) * sqrt(-1) = -1/sqrt(x)
1521 // x^((p-5)/8) * sqrt(-1) = -sqrt(1/x) or sqrt(1/x)
1522 //
1523 // if quartic = sqrt(-1) (x is not a square)
1524 // then x^((p-1)/4) = sqrt(-1)
1525 // x^((p-5)/4) * x = sqrt(-1)
1526 // x^((p-5)/4) = sqrt(-1)/x
1527 // x^((p-5)/8) = sqrt(sqrt(-1)/x) or -sqrt(sqrt(-1)/x)
1528 //
1529 // Note that the product of two non-squares is always a square:
1530 // For any non-squares a and b, chi(a) = -1 and chi(b) = -1.
1531 // Since chi(x) = x^((p-1)/2), chi(a)*chi(b) = chi(a*b) = 1.
1532 // Therefore a*b is a square.
1533 //
1534 // Since sqrt(-1) and x are both non-squares, their product is a
1535 // square, and we can compute their square root.
1536 //
1537 // if quartic = -sqrt(-1) (x is not a square)
1538 // then x^((p-1)/4) = -sqrt(-1)
1539 // x^((p-5)/4) * x = -sqrt(-1)
1540 // x^((p-5)/4) = -sqrt(-1)/x
1541 // x^((p-5)/8) = sqrt(-sqrt(-1)/x)
1542 // x^((p-5)/8) = sqrt( sqrt(-1)/x) * sqrt(-1)
1543 // x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * sqrt(-1)^2
1544 // x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * -1
1545 // x^((p-5)/8) * sqrt(-1) = -sqrt(sqrt(-1)/x) or sqrt(sqrt(-1)/x)
1546 static int invsqrt(fe isr, const fe x)
1547 {
1548 fe check, quartic;
1549 fe_copy(check, x);
1550 fe_pow22523(isr, check);
1551 fe_sq (quartic, isr);
1552 fe_mul(quartic, quartic, check);
1553 fe_1 (check); int p1 = fe_isequal(quartic, check);
1554 fe_neg(check, check ); int m1 = fe_isequal(quartic, check);
1555 fe_neg(check, sqrtm1); int ms = fe_isequal(quartic, check);
1556 fe_mul(check, isr, sqrtm1);
1557 fe_ccopy(isr, check, m1 | ms);
1558 WIPE_BUFFER(quartic);
1559 WIPE_BUFFER(check);
1560 return p1 | m1;
1561 }
1562
1563 // trim a scalar for scalar multiplication
1564 static void trim_scalar(u8 scalar[32])
1565 {
1566 scalar[ 0] &= 248;
1567 scalar[31] &= 127;
1568 scalar[31] |= 64;
1569 }
1570
1571 // get bit from scalar at position i
1572 static int scalar_bit(const u8 s[32], int i)
1573 {
1574 if (i < 0) { return 0; } // handle -1 for sliding windows
1575 return (s[i>>3] >> (i&7)) & 1;
1576 }
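
// Illustration (not part of the library): the effect of trim_scalar() on a
// hypothetical scalar whose bytes are all 0xff.
//
//     u8 s[32];          // suppose every byte is 0xff
//     trim_scalar(s);
//     // now s[0] == 0xf8 and s[31] == 0x7f, so:
//     // scalar_bit(s, 0..2) == 0   (multiple of 8: the cofactor is cleared)
//     // scalar_bit(s, 254) == 1    (fixed top bit for the Montgomery ladder)
//     // scalar_bit(s, 255) == 0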
1577
1578 ///////////////
1579 /// X-25519 /// Taken from SUPERCOP's ref10 implementation.
1580 ///////////////
1581 static void scalarmult(u8 q[32], const u8 scalar[32], const u8 p[32],
1582 int nb_bits)
1583 {
1584 // computes the scalar product
1585 fe x1;
1586 fe_frombytes(x1, p);
1587
1588 // computes the actual scalar product (the result is in x2 and z2)
1589 fe x2, z2, x3, z3, t0, t1;
1590 // Montgomery ladder
1591 // In projective coordinates, to avoid divisions: x = X / Z
1592 // We don't care about the y coordinate, it's only 1 bit of information
1593 fe_1(x2); fe_0(z2); // "zero" point
1594 fe_copy(x3, x1); fe_1(z3); // "one" point
1595 int swap = 0;
1596 for (int pos = nb_bits-1; pos >= 0; --pos) {
1597 // constant time conditional swap before ladder step
1598 int b = scalar_bit(scalar, pos);
1599 swap ^= b; // xor trick avoids swapping at the end of the loop
1600 fe_cswap(x2, x3, swap);
1601 fe_cswap(z2, z3, swap);
1602 swap = b; // anticipates one last swap after the loop
1603
1604 // Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3)
1605 // with differential addition
1606 fe_sub(t0, x3, z3);
1607 fe_sub(t1, x2, z2);
1608 fe_add(x2, x2, z2);
1609 fe_add(z2, x3, z3);
1610 fe_mul(z3, t0, x2);
1611 fe_mul(z2, z2, t1);
1612 fe_sq (t0, t1 );
1613 fe_sq (t1, x2 );
1614 fe_add(x3, z3, z2);
1615 fe_sub(z2, z3, z2);
1616 fe_mul(x2, t1, t0);
1617 fe_sub(t1, t1, t0);
1618 fe_sq (z2, z2 );
1619 fe_mul_small(z3, t1, 121666);
1620 fe_sq (x3, x3 );
1621 fe_add(t0, t0, z3);
1622 fe_mul(z3, x1, z2);
1623 fe_mul(z2, t1, t0);
1624 }
1625 // last swap is necessary to compensate for the xor trick
1626 // Note: after this swap, P3 == P2 + P1.
1627 fe_cswap(x2, x3, swap);
1628 fe_cswap(z2, z3, swap);
1629
1630 // normalises the coordinates: x == X / Z
1631 fe_invert(z2, z2);
1632 fe_mul(x2, x2, z2);
1633 fe_tobytes(q, x2);
1634
1635 WIPE_BUFFER(x1);
1636 WIPE_BUFFER(x2); WIPE_BUFFER(z2); WIPE_BUFFER(t0);
1637 WIPE_BUFFER(x3); WIPE_BUFFER(z3); WIPE_BUFFER(t1);
1638 }
1639
1640 void crypto_x25519(u8 raw_shared_secret[32],
1641 const u8 your_secret_key [32],
1642 const u8 their_public_key [32])
1643 {
1644 // restrict the possible scalar values
1645 u8 e[32];
1646 COPY(e, your_secret_key, 32);
1647 trim_scalar(e);
1648 scalarmult(raw_shared_secret, e, their_public_key, 255);
1649 WIPE_BUFFER(e);
1650 }
1651
1652 void crypto_x25519_public_key(u8 public_key[32],
1653 const u8 secret_key[32])
1654 {
1655 static const u8 base_point[32] = {9};
1656 crypto_x25519(public_key, secret_key, base_point);
1657 }
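
// Usage sketch (not part of the library): a raw X25519 key agreement.
// Variable names are illustrative; the secret keys are assumed to be filled
// with bytes from the OS random number generator.
//
//     u8 alice_sk[32], alice_pk[32];   // alice_sk: 32 random bytes
//     u8 bob_sk  [32], bob_pk  [32];   // bob_sk  : 32 random bytes
//     crypto_x25519_public_key(alice_pk, alice_sk);
//     crypto_x25519_public_key(bob_pk  , bob_sk  );
//     u8 shared_a[32], shared_b[32];
//     crypto_x25519(shared_a, alice_sk, bob_pk  );
//     crypto_x25519(shared_b, bob_sk  , alice_pk);
//     // shared_a == shared_b; hash the result (as crypto_key_exchange()
//     // below does with HChacha20) before using it as a session key.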
1658
1659 ///////////////////////////
1660 /// Arithmetic modulo L ///
1661 ///////////////////////////
1662 static const u32 L[8] = {0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
1663 0x00000000, 0x00000000, 0x00000000, 0x10000000,};
1664
1665 // p = a*b + p
1666 static void multiply(u32 p[16], const u32 a[8], const u32 b[8])
1667 {
1668 FOR (i, 0, 8) {
1669 u64 carry = 0;
1670 FOR (j, 0, 8) {
1671 carry += p[i+j] + (u64)a[i] * b[j];
1672 p[i+j] = (u32)carry;
1673 carry >>= 32;
1674 }
1675 p[i+8] = (u32)carry;
1676 }
1677 }
1678
1679 static int is_above_l(const u32 x[8])
1680 {
1681 // We work with L directly, in a 2's complement encoding
1682 // (-L == ~L + 1)
1683 u64 carry = 1;
1684 FOR (i, 0, 8) {
1685 carry += (u64)x[i] + ~L[i];
1686 carry >>= 32;
1687 }
1688 return carry;
1689 }
1690
1691 // Final reduction modulo L, by conditionally removing L.
1692 // if x < l , then r = x
1693 // if l <= x < 2*l, then r = x-l
1694 // otherwise the result will be wrong
1695 static void remove_l(u32 r[8], const u32 x[8])
1696 {
1697 u64 carry = is_above_l(x);
1698 u32 mask = ~(u32)carry + 1; // carry == 0 or 1
1699 FOR (i, 0, 8) {
1700 carry += (u64)x[i] + (~L[i] & mask);
1701 r[i] = (u32)carry;
1702 carry >>= 32;
1703 }
1704 }
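
// Behaviour sketch: with x read as a little-endian 256-bit number,
//   x = L - 1   ->   is_above_l(x) == 0, r = L - 1 (unchanged)
//   x = L       ->   is_above_l(x) == 1, r = 0
//   x = L + 5   ->   is_above_l(x) == 1, r = 5
// Values of 2*l or more are out of range and give a wrong result,
// as stated above.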
1705
1706 // Full reduction modulo L (Barrett reduction)
1707 static void mod_l(u8 reduced[32], const u32 x[16])
1708 {
1709 static const u32 r[9] = {0x0a2c131b,0xed9ce5a3,0x086329a7,0x2106215d,
1710 0xffffffeb,0xffffffff,0xffffffff,0xffffffff,0xf,};
1711 // xr = x * r
1712 u32 xr[25] = {0};
1713 FOR (i, 0, 9) {
1714 u64 carry = 0;
1715 FOR (j, 0, 16) {
1716 carry += xr[i+j] + (u64)r[i] * x[j];
1717 xr[i+j] = (u32)carry;
1718 carry >>= 32;
1719 }
1720 xr[i+16] = (u32)carry;
1721 }
1722 // xr = floor(xr / 2^512) * L
1723 // Since the result is guaranteed to be below 2*L,
1724 // it is enough to only compute the first 256 bits.
1725 // The division is performed by reading xr at offset 16, i.e. xr[i+16] (16 words * 32 bits = 512 bits).
1726 ZERO(xr, 8);
1727 FOR (i, 0, 8) {
1728 u64 carry = 0;
1729 FOR (j, 0, 8-i) {
1730 carry += xr[i+j] + (u64)xr[i+16] * L[j];
1731 xr[i+j] = (u32)carry;
1732 carry >>= 32;
1733 }
1734 }
1735 // xr = x - xr
1736 u64 carry = 1;
1737 FOR (i, 0, 8) {
1738 carry += (u64)x[i] + ~xr[i];
1739 xr[i] = (u32)carry;
1740 carry >>= 32;
1741 }
1742 // Final reduction modulo L (conditional subtraction)
1743 remove_l(xr, xr);
1744 store32_le_buf(reduced, xr, 8);
1745
1746 WIPE_BUFFER(xr);
1747 }
1748
1749 static void reduce(u8 r[64])
1750 {
1751 u32 x[16];
1752 load32_le_buf(x, r, 16);
1753 mod_l(r, x);
1754 WIPE_BUFFER(x);
1755 }
1756
1757 // r = (a * b) + c
1758 static void mul_add(u8 r[32], const u8 a[32], const u8 b[32], const u8 c[32])
1759 {
1760 u32 A[8]; load32_le_buf(A, a, 8);
1761 u32 B[8]; load32_le_buf(B, b, 8);
1762 u32 p[16];
1763 load32_le_buf(p, c, 8);
1764 ZERO(p + 8, 8);
1765 multiply(p, A, B);
1766 mod_l(r, p);
1767 WIPE_BUFFER(p);
1768 WIPE_BUFFER(A);
1769 WIPE_BUFFER(B);
1770 }
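
// Sketch of how this is used (see crypto_sign_final() below): all three
// inputs are little-endian 256-bit numbers, the result is reduced modulo L,
// so mul_add() directly computes the second half of an EdDSA signature:
//
//     mul_add(s, h_ram, a, r);   // s = (h_ram * a + r) mod L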
1771
1772 ///////////////
1773 /// Ed25519 ///
1774 ///////////////
1775
1776 // Point (group element, ge) in a twisted Edwards curve,
1777 // in extended projective coordinates.
1778 // ge : x = X/Z, y = Y/Z, T = XY/Z
1779 // ge_cached : Yp = X+Y, Ym = X-Y, T2 = T*D2
1780 // ge_precomp: Z = 1
1781 typedef struct { fe X; fe Y; fe Z; fe T; } ge;
1782 typedef struct { fe Yp; fe Ym; fe Z; fe T2; } ge_cached;
1783 typedef struct { fe Yp; fe Ym; fe T2; } ge_precomp;
1784
1785 static void ge_zero(ge *p)
1786 {
1787 fe_0(p->X);
1788 fe_1(p->Y);
1789 fe_1(p->Z);
1790 fe_0(p->T);
1791 }
1792
1793 static void ge_tobytes(u8 s[32], const ge *h)
1794 {
1795 fe recip, x, y;
1796 fe_invert(recip, h->Z);
1797 fe_mul(x, h->X, recip);
1798 fe_mul(y, h->Y, recip);
1799 fe_tobytes(s, y);
1800 s[31] ^= fe_isodd(x) << 7;
1801
1802 WIPE_BUFFER(recip);
1803 WIPE_BUFFER(x);
1804 WIPE_BUFFER(y);
1805 }
1806
1807 // h = s, where s is a point encoded in 32 bytes
1808 //
1809 // Variable time! Inputs must not be secret!
1810 // => Use only to *check* signatures.
1811 //
1812 // From the specifications:
1813 // The encoding of s contains y and the sign of x
1814 // x = sqrt((y^2 - 1) / (d*y^2 + 1))
1815 // In extended coordinates:
1816 // X = x, Y = y, Z = 1, T = x*y
1817 //
1818 // Note that num * den is a square iff num / den is a square
1819 // If num * den is not a square, the point was not on the curve.
1820 // From the above:
1821 // Let num = y^2 - 1
1822 // Let den = d*y^2 + 1
1823 // x = sqrt((y^2 - 1) / (d*y^2 + 1))
1824 // x = sqrt(num / den)
1825 // x = sqrt(num^2 / (num * den))
1826 // x = num * sqrt(1 / (num * den))
1827 //
1828 // Therefore, we can just compute:
1829 // num = y^2 - 1
1830 // den = d*y^2 + 1
1831 // isr = invsqrt(num * den) // abort if not square
1832 // x = num * isr
1833 // Finally, negate x if its sign is not as specified.
1834 static int ge_frombytes_vartime(ge *h, const u8 s[32])
1835 {
1836 fe_frombytes(h->Y, s);
1837 fe_1(h->Z);
1838 fe_sq (h->T, h->Y); // t = y^2
1839 fe_mul(h->X, h->T, d ); // x = d*y^2
1840 fe_sub(h->T, h->T, h->Z); // t = y^2 - 1
1841 fe_add(h->X, h->X, h->Z); // x = d*y^2 + 1
1842 fe_mul(h->X, h->T, h->X); // x = (y^2 - 1) * (d*y^2 + 1)
1843 int is_square = invsqrt(h->X, h->X);
1844 if (!is_square) {
1845 return -1; // Not on the curve, abort
1846 }
1847 fe_mul(h->X, h->T, h->X); // x = sqrt((y^2 - 1) / (d*y^2 + 1))
1848 if (fe_isodd(h->X) != (s[31] >> 7)) {
1849 fe_neg(h->X, h->X);
1850 }
1851 fe_mul(h->T, h->X, h->Y);
1852 return 0;
1853 }
1854
1855 static void ge_cache(ge_cached *c, const ge *p)
1856 {
1857 fe_add (c->Yp, p->Y, p->X);
1858 fe_sub (c->Ym, p->Y, p->X);
1859 fe_copy(c->Z , p->Z );
1860 fe_mul (c->T2, p->T, D2 );
1861 }
1862
1863 // Internal buffers are not wiped! Inputs must not be secret!
1864 // => Use only to *check* signatures.
1865 static void ge_add(ge *s, const ge *p, const ge_cached *q)
1866 {
1867 fe a, b;
1868 fe_add(a , p->Y, p->X );
1869 fe_sub(b , p->Y, p->X );
1870 fe_mul(a , a , q->Yp);
1871 fe_mul(b , b , q->Ym);
1872 fe_add(s->Y, a , b );
1873 fe_sub(s->X, a , b );
1874
1875 fe_add(s->Z, p->Z, p->Z );
1876 fe_mul(s->Z, s->Z, q->Z );
1877 fe_mul(s->T, p->T, q->T2);
1878 fe_add(a , s->Z, s->T );
1879 fe_sub(b , s->Z, s->T );
1880
1881 fe_mul(s->T, s->X, s->Y);
1882 fe_mul(s->X, s->X, b );
1883 fe_mul(s->Y, s->Y, a );
1884 fe_mul(s->Z, a , b );
1885 }
1886
1887 // Internal buffers are not wiped! Inputs must not be secret!
1888 // => Use only to *check* signatures.
1889 static void ge_sub(ge *s, const ge *p, const ge_cached *q)
1890 {
1891 ge_cached neg;
1892 fe_copy(neg.Ym, q->Yp);
1893 fe_copy(neg.Yp, q->Ym);
1894 fe_copy(neg.Z , q->Z );
1895 fe_neg (neg.T2, q->T2);
1896 ge_add(s, p, &neg);
1897 }
1898
1899 static void ge_madd(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
1900 {
1901 fe_add(a , p->Y, p->X );
1902 fe_sub(b , p->Y, p->X );
1903 fe_mul(a , a , q->Yp);
1904 fe_mul(b , b , q->Ym);
1905 fe_add(s->Y, a , b );
1906 fe_sub(s->X, a , b );
1907
1908 fe_add(s->Z, p->Z, p->Z );
1909 fe_mul(s->T, p->T, q->T2);
1910 fe_add(a , s->Z, s->T );
1911 fe_sub(b , s->Z, s->T );
1912
1913 fe_mul(s->T, s->X, s->Y);
1914 fe_mul(s->X, s->X, b );
1915 fe_mul(s->Y, s->Y, a );
1916 fe_mul(s->Z, a , b );
1917 }
1918
1919 static void ge_msub(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
1920 {
1921 fe_add(a , p->Y, p->X );
1922 fe_sub(b , p->Y, p->X );
1923 fe_mul(a , a , q->Ym);
1924 fe_mul(b , b , q->Yp);
1925 fe_add(s->Y, a , b );
1926 fe_sub(s->X, a , b );
1927
1928 fe_add(s->Z, p->Z, p->Z );
1929 fe_mul(s->T, p->T, q->T2);
1930 fe_sub(a , s->Z, s->T );
1931 fe_add(b , s->Z, s->T );
1932
1933 fe_mul(s->T, s->X, s->Y);
1934 fe_mul(s->X, s->X, b );
1935 fe_mul(s->Y, s->Y, a );
1936 fe_mul(s->Z, a , b );
1937 }
1938
1939 static void ge_double(ge *s, const ge *p, ge *q)
1940 {
1941 fe_sq (q->X, p->X);
1942 fe_sq (q->Y, p->Y);
1943 fe_sq2(q->Z, p->Z);
1944 fe_add(q->T, p->X, p->Y);
1945 fe_sq (s->T, q->T);
1946 fe_add(q->T, q->Y, q->X);
1947 fe_sub(q->Y, q->Y, q->X);
1948 fe_sub(q->X, s->T, q->T);
1949 fe_sub(q->Z, q->Z, q->Y);
1950
1951 fe_mul(s->X, q->X , q->Z);
1952 fe_mul(s->Y, q->T , q->Y);
1953 fe_mul(s->Z, q->Y , q->Z);
1954 fe_mul(s->T, q->X , q->T);
1955 }
1956
1957 // 5-bit signed window in cached format (Niels coordinates, Z=1)
1958 static const ge_precomp b_window[8] = {
1959 {{25967493,-14356035,29566456,3660896,-12694345,
1960 4014787,27544626,-11754271,-6079156,2047605,},
1961 {-12545711,934262,-2722910,3049990,-727428,
1962 9406986,12720692,5043384,19500929,-15469378,},
1963 {-8738181,4489570,9688441,-14785194,10184609,
1964 -12363380,29287919,11864899,-24514362,-4438546,},},
1965 {{15636291,-9688557,24204773,-7912398,616977,
1966 -16685262,27787600,-14772189,28944400,-1550024,},
1967 {16568933,4717097,-11556148,-1102322,15682896,
1968 -11807043,16354577,-11775962,7689662,11199574,},
1969 {30464156,-5976125,-11779434,-15670865,23220365,
1970 15915852,7512774,10017326,-17749093,-9920357,},},
1971 {{10861363,11473154,27284546,1981175,-30064349,
1972 12577861,32867885,14515107,-15438304,10819380,},
1973 {4708026,6336745,20377586,9066809,-11272109,
1974 6594696,-25653668,12483688,-12668491,5581306,},
1975 {19563160,16186464,-29386857,4097519,10237984,
1976 -4348115,28542350,13850243,-23678021,-15815942,},},
1977 {{5153746,9909285,1723747,-2777874,30523605,
1978 5516873,19480852,5230134,-23952439,-15175766,},
1979 {-30269007,-3463509,7665486,10083793,28475525,
1980 1649722,20654025,16520125,30598449,7715701,},
1981 {28881845,14381568,9657904,3680757,-20181635,
1982 7843316,-31400660,1370708,29794553,-1409300,},},
1983 {{-22518993,-6692182,14201702,-8745502,-23510406,
1984 8844726,18474211,-1361450,-13062696,13821877,},
1985 {-6455177,-7839871,3374702,-4740862,-27098617,
1986 -10571707,31655028,-7212327,18853322,-14220951,},
1987 {4566830,-12963868,-28974889,-12240689,-7602672,
1988 -2830569,-8514358,-10431137,2207753,-3209784,},},
1989 {{-25154831,-4185821,29681144,7868801,-6854661,
1990 -9423865,-12437364,-663000,-31111463,-16132436,},
1991 {25576264,-2703214,7349804,-11814844,16472782,
1992 9300885,3844789,15725684,171356,6466918,},
1993 {23103977,13316479,9739013,-16149481,817875,
1994 -15038942,8965339,-14088058,-30714912,16193877,},},
1995 {{-33521811,3180713,-2394130,14003687,-16903474,
1996 -16270840,17238398,4729455,-18074513,9256800,},
1997 {-25182317,-4174131,32336398,5036987,-21236817,
1998 11360617,22616405,9761698,-19827198,630305,},
1999 {-13720693,2639453,-24237460,-7406481,9494427,
2000 -5774029,-6554551,-15960994,-2449256,-14291300,},},
2001 {{-3151181,-5046075,9282714,6866145,-31907062,
2002 -863023,-18940575,15033784,25105118,-7894876,},
2003 {-24326370,15950226,-31801215,-14592823,-11662737,
2004 -5090925,1573892,-2625887,2198790,-15804619,},
2005 {-3099351,10324967,-2241613,7453183,-5446979,
2006 -2735503,-13812022,-16236442,-32461234,-12290683,},},
2007 };
2008
2009 // Incremental sliding windows (left to right)
2010 // Based on Roberto Maria Avanzi[2005]
2011 typedef struct {
2012 i16 next_index; // position of the next signed digit
2013 i8 next_digit; // next signed digit (odd number below 2^window_width)
2014 u8 next_check; // point at which we must check for a new window
2015 } slide_ctx;
2016
2017 static void slide_init(slide_ctx *ctx, const u8 scalar[32])
2018 {
2019 // scalar is guaranteed to be below L, either because we checked (s),
2020 // or because we reduced it modulo L (h_ram). L is under 2^253, so
2021 // bits 253 to 255 are guaranteed to be zero. No need to test them.
2022 //
2023 // Note however that L is very close to 2^252, so bit 252 is almost
2024 // always zero. If we were to start at bit 251, the tests wouldn't
2025 // catch the off-by-one error (constructing one that does would be
2026 // prohibitively expensive).
2027 //
2028 // We should still check bit 252, though.
2029 int i = 252;
2030 while (i > 0 && scalar_bit(scalar, i) == 0) {
2031 i--;
2032 }
2033 ctx->next_check = (u8)(i + 1);
2034 ctx->next_index = -1;
2035 ctx->next_digit = -1;
2036 }
2037
2038 static int slide_step(slide_ctx *ctx, int width, int i, const u8 scalar[32])
2039 {
2040 if (i == ctx->next_check) {
2041 if (scalar_bit(scalar, i) == scalar_bit(scalar, i - 1)) {
2042 ctx->next_check--;
2043 } else {
2044 // compute digit of next window
2045 int w = MIN(width, i + 1);
2046 int v = -(scalar_bit(scalar, i) << (w-1));
2047 FOR_T (int, j, 0, w-1) {
2048 v += scalar_bit(scalar, i-(w-1)+j) << j;
2049 }
2050 v += scalar_bit(scalar, i-w);
2051 int lsb = v & (~v + 1); // smallest bit of v
2052 int s = ( ((lsb & 0xAA) != 0) // log2(lsb)
2053 | (((lsb & 0xCC) != 0) << 1)
2054 | (((lsb & 0xF0) != 0) << 2));
2055 ctx->next_index = (i16)(i-(w-1)+s);
2056 ctx->next_digit = (i8) (v >> s );
2057 ctx->next_check -= (u8) w;
2058 }
2059 }
2060 return i == ctx->next_index ? ctx->next_digit: 0;
2061 }
2062
2063 #define P_W_WIDTH 3 // Affects the size of the stack
2064 #define B_W_WIDTH 5 // Affects the size of the binary
2065 #define P_W_SIZE (1<<(P_W_WIDTH-2))
2066
2067 // P = [b]B + [p]P, where B is the base point
2068 //
2069 // Variable time! Internal buffers are not wiped! Inputs must not be secret!
2070 // => Use only to *check* signatures.
2071 static void ge_double_scalarmult_vartime(ge *P, const u8 p[32], const u8 b[32])
2072 {
2073 // cache P window for addition
2074 ge_cached cP[P_W_SIZE];
2075 {
2076 ge P2, tmp;
2077 ge_double(&P2, P, &tmp);
2078 ge_cache(&cP[0], P);
2079 FOR (i, 1, P_W_SIZE) {
2080 ge_add(&tmp, &P2, &cP[i-1]);
2081 ge_cache(&cP[i], &tmp);
2082 }
2083 }
2084
2085 // Merged double and add ladder, fused with sliding
2086 slide_ctx p_slide; slide_init(&p_slide, p);
2087 slide_ctx b_slide; slide_init(&b_slide, b);
2088 int i = MAX(p_slide.next_check, b_slide.next_check);
2089 ge *sum = P;
2090 ge_zero(sum);
2091 while (i >= 0) {
2092 ge tmp;
2093 ge_double(sum, sum, &tmp);
2094 int p_digit = slide_step(&p_slide, P_W_WIDTH, i, p);
2095 int b_digit = slide_step(&b_slide, B_W_WIDTH, i, b);
2096 if (p_digit > 0) { ge_add(sum, sum, &cP[ p_digit / 2]); }
2097 if (p_digit < 0) { ge_sub(sum, sum, &cP[-p_digit / 2]); }
2098 fe t1, t2;
2099 if (b_digit > 0) { ge_madd(sum, sum, b_window + b_digit/2, t1, t2); }
2100 if (b_digit < 0) { ge_msub(sum, sum, b_window + -b_digit/2, t1, t2); }
2101 i--;
2102 }
2103 }
2104
2105 // R_check = [s]B - [h_ram]pk, where B is the base point
2106 //
2107 // Variable time! Internal buffers are not wiped! Inputs must not be secret!
2108 // => Use only to *check* signatures.
2109 static int ge_r_check(u8 R_check[32], u8 s[32], u8 h_ram[32], u8 pk[32])
2110 {
2111 ge A; // not secret, not wiped
2112 u32 s32[8]; // not secret, not wiped
2113 load32_le_buf(s32, s, 8);
2114 if (ge_frombytes_vartime(&A, pk) || // A = pk
2115 is_above_l(s32)) { // prevent s malleability
2116 return -1;
2117 }
2118 fe_neg(A.X, A.X);
2119 fe_neg(A.T, A.T); // A = -pk
2120 ge_double_scalarmult_vartime(&A, h_ram, s); // A = [s]B - [h_ram]pk
2121 ge_tobytes(R_check, &A); // R_check = A
2122 return 0;
2123 }
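
// Why this checks a signature: a valid signature satisfies
//     s = r + h_ram * a  (modulo L),  with  A = [a]B  and  R = [r]B.
// Multiplying both sides by B gives
//     [s]B = R + [h_ram]A,  i.e.  R = [s]B - [h_ram]A.
// ge_r_check() recomputes that right hand side (negating A = pk first, so
// the double scalar multiplication can add instead of subtract), and
// crypto_check_final() compares it with the R stored in the signature.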
2124
2125 // 5-bit signed comb in cached format (Niels coordinates, Z=1)
2126 static const ge_precomp b_comb_low[8] = {
2127 {{-6816601,-2324159,-22559413,124364,18015490,
2128 8373481,19993724,1979872,-18549925,9085059,},
2129 {10306321,403248,14839893,9633706,8463310,
2130 -8354981,-14305673,14668847,26301366,2818560,},
2131 {-22701500,-3210264,-13831292,-2927732,-16326337,
2132 -14016360,12940910,177905,12165515,-2397893,},},
2133 {{-12282262,-7022066,9920413,-3064358,-32147467,
2134 2927790,22392436,-14852487,2719975,16402117,},
2135 {-7236961,-4729776,2685954,-6525055,-24242706,
2136 -15940211,-6238521,14082855,10047669,12228189,},
2137 {-30495588,-12893761,-11161261,3539405,-11502464,
2138 16491580,-27286798,-15030530,-7272871,-15934455,},},
2139 {{17650926,582297,-860412,-187745,-12072900,
2140 -10683391,-20352381,15557840,-31072141,-5019061,},
2141 {-6283632,-2259834,-4674247,-4598977,-4089240,
2142 12435688,-31278303,1060251,6256175,10480726,},
2143 {-13871026,2026300,-21928428,-2741605,-2406664,
2144 -8034988,7355518,15733500,-23379862,7489131,},},
2145 {{6883359,695140,23196907,9644202,-33430614,
2146 11354760,-20134606,6388313,-8263585,-8491918,},
2147 {-7716174,-13605463,-13646110,14757414,-19430591,
2148 -14967316,10359532,-11059670,-21935259,12082603,},
2149 {-11253345,-15943946,10046784,5414629,24840771,
2150 8086951,-6694742,9868723,15842692,-16224787,},},
2151 {{9639399,11810955,-24007778,-9320054,3912937,
2152 -9856959,996125,-8727907,-8919186,-14097242,},
2153 {7248867,14468564,25228636,-8795035,14346339,
2154 8224790,6388427,-7181107,6468218,-8720783,},
2155 {15513115,15439095,7342322,-10157390,18005294,
2156 -7265713,2186239,4884640,10826567,7135781,},},
2157 {{-14204238,5297536,-5862318,-6004934,28095835,
2158 4236101,-14203318,1958636,-16816875,3837147,},
2159 {-5511166,-13176782,-29588215,12339465,15325758,
2160 -15945770,-8813185,11075932,-19608050,-3776283,},
2161 {11728032,9603156,-4637821,-5304487,-7827751,
2162 2724948,31236191,-16760175,-7268616,14799772,},},
2163 {{-28842672,4840636,-12047946,-9101456,-1445464,
2164 381905,-30977094,-16523389,1290540,12798615,},
2165 {27246947,-10320914,14792098,-14518944,5302070,
2166 -8746152,-3403974,-4149637,-27061213,10749585,},
2167 {25572375,-6270368,-15353037,16037944,1146292,
2168 32198,23487090,9585613,24714571,-1418265,},},
2169 {{19844825,282124,-17583147,11004019,-32004269,
2170 -2716035,6105106,-1711007,-21010044,14338445,},
2171 {8027505,8191102,-18504907,-12335737,25173494,
2172 -5923905,15446145,7483684,-30440441,10009108,},
2173 {-14134701,-4174411,10246585,-14677495,33553567,
2174 -14012935,23366126,15080531,-7969992,7663473,},},
2175 };
2176
2177 static const ge_precomp b_comb_high[8] = {
2178 {{33055887,-4431773,-521787,6654165,951411,
2179 -6266464,-5158124,6995613,-5397442,-6985227,},
2180 {4014062,6967095,-11977872,3960002,8001989,
2181 5130302,-2154812,-1899602,-31954493,-16173976,},
2182 {16271757,-9212948,23792794,731486,-25808309,
2183 -3546396,6964344,-4767590,10976593,10050757,},},
2184 {{2533007,-4288439,-24467768,-12387405,-13450051,
2185 14542280,12876301,13893535,15067764,8594792,},
2186 {20073501,-11623621,3165391,-13119866,13188608,
2187 -11540496,-10751437,-13482671,29588810,2197295,},
2188 {-1084082,11831693,6031797,14062724,14748428,
2189 -8159962,-20721760,11742548,31368706,13161200,},},
2190 {{2050412,-6457589,15321215,5273360,25484180,
2191 124590,-18187548,-7097255,-6691621,-14604792,},
2192 {9938196,2162889,-6158074,-1711248,4278932,
2193 -2598531,-22865792,-7168500,-24323168,11746309,},
2194 {-22691768,-14268164,5965485,9383325,20443693,
2195 5854192,28250679,-1381811,-10837134,13717818,},},
2196 {{-8495530,16382250,9548884,-4971523,-4491811,
2197 -3902147,6182256,-12832479,26628081,10395408,},
2198 {27329048,-15853735,7715764,8717446,-9215518,
2199 -14633480,28982250,-5668414,4227628,242148,},
2200 {-13279943,-7986904,-7100016,8764468,-27276630,
2201 3096719,29678419,-9141299,3906709,11265498,},},
2202 {{11918285,15686328,-17757323,-11217300,-27548967,
2203 4853165,-27168827,6807359,6871949,-1075745,},
2204 {-29002610,13984323,-27111812,-2713442,28107359,
2205 -13266203,6155126,15104658,3538727,-7513788,},
2206 {14103158,11233913,-33165269,9279850,31014152,
2207 4335090,-1827936,4590951,13960841,12787712,},},
2208 {{1469134,-16738009,33411928,13942824,8092558,
2209 -8778224,-11165065,1437842,22521552,-2792954,},
2210 {31352705,-4807352,-25327300,3962447,12541566,
2211 -9399651,-27425693,7964818,-23829869,5541287,},
2212 {-25732021,-6864887,23848984,3039395,-9147354,
2213 6022816,-27421653,10590137,25309915,-1584678,},},
2214 {{-22951376,5048948,31139401,-190316,-19542447,
2215 -626310,-17486305,-16511925,-18851313,-12985140,},
2216 {-9684890,14681754,30487568,7717771,-10829709,
2217 9630497,30290549,-10531496,-27798994,-13812825,},
2218 {5827835,16097107,-24501327,12094619,7413972,
2219 11447087,28057551,-1793987,-14056981,4359312,},},
2220 {{26323183,2342588,-21887793,-1623758,-6062284,
2221 2107090,-28724907,9036464,-19618351,-13055189,},
2222 {-29697200,14829398,-4596333,14220089,-30022969,
2223 2955645,12094100,-13693652,-5941445,7047569,},
2224 {-3201977,14413268,-12058324,-16417589,-9035655,
2225 -7224648,9258160,1399236,30397584,-5684634,},},
2226 };
2227
2228 static void lookup_add(ge *p, ge_precomp *tmp_c, fe tmp_a, fe tmp_b,
2229 const ge_precomp comb[8], const u8 scalar[32], int i)
2230 {
2231 u8 teeth = (u8)((scalar_bit(scalar, i) ) +
2232 (scalar_bit(scalar, i + 32) << 1) +
2233 (scalar_bit(scalar, i + 64) << 2) +
2234 (scalar_bit(scalar, i + 96) << 3));
2235 u8 high = teeth >> 3;
2236 u8 index = (teeth ^ (high - 1)) & 7;
2237 FOR (j, 0, 8) {
2238 i32 select = 1 & (((j ^ index) - 1) >> 8);
2239 fe_ccopy(tmp_c->Yp, comb[j].Yp, select);
2240 fe_ccopy(tmp_c->Ym, comb[j].Ym, select);
2241 fe_ccopy(tmp_c->T2, comb[j].T2, select);
2242 }
2243 fe_neg(tmp_a, tmp_c->T2);
2244 fe_cswap(tmp_c->T2, tmp_a , high ^ 1);
2245 fe_cswap(tmp_c->Yp, tmp_c->Ym, high ^ 1);
2246 ge_madd(p, p, tmp_c, tmp_a, tmp_b);
2247 }
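
// Note on the constant time lookup above: for 0 <= j, index <= 7,
//     select = 1 & (((j ^ index) - 1) >> 8)
// is 1 when j == index (0 - 1 wraps around, setting bit 8 and above), and
// 0 otherwise (1 <= j ^ index <= 7, so (j ^ index) - 1 fits in 8 bits).
// Every table entry is read, and only the selected one is kept.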
2248
2249 // p = [scalar]B, where B is the base point
2250 static void ge_scalarmult_base(ge *p, const u8 scalar[32])
2251 {
2252 // twin 4-bit signed combs, from Mike Hamburg's
2253 // Fast and compact elliptic-curve cryptography (2012)
2254 // 1 / 2 modulo L
2255 static const u8 half_mod_L[32] = {
2256 247,233,122,46,141,49,9,44,107,206,123,81,239,124,111,10,
2257 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8, };
2258 // (2^256 - 1) / 2 modulo L
2259 static const u8 half_ones[32] = {
2260 142,74,204,70,186,24,118,107,184,231,190,57,250,173,119,99,
2261 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,7, };
2262
2263 // All bits set form: 1 means 1, 0 means -1
2264 u8 s_scalar[32];
2265 mul_add(s_scalar, scalar, half_mod_L, half_ones);
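
// Why this works: s_scalar = scalar/2 + (2^256 - 1)/2 (modulo L), so
//     scalar = 2*s_scalar - (2^256 - 1)
//            = sum over i of (2*bit_i(s_scalar) - 1) * 2^i   (modulo L)
// Each bit of s_scalar thus contributes +2^i when set and -2^i when
// clear, which is exactly what the signed comb lookups below compute.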
2266
2267 // Double and add ladder
2268 fe tmp_a, tmp_b; // temporaries for addition
2269 ge_precomp tmp_c; // temporary for comb lookup
2270 ge tmp_d; // temporary for doubling
2271 fe_1(tmp_c.Yp);
2272 fe_1(tmp_c.Ym);
2273 fe_0(tmp_c.T2);
2274
2275 // Save a double on the first iteration
2276 ge_zero(p);
2277 lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, 31);
2278 lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, 31+128);
2279 // Regular double & add for the rest
2280 for (int i = 30; i >= 0; i--) {
2281 ge_double(p, p, &tmp_d);
2282 lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, i);
2283 lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, i+128);
2284 }
2285 // Note: we could save one addition at the end if we assumed the
2286 // scalar fit in 252 bits, which it does in practice when it is
2287 // selected at random. However, non-random, non-hashed scalars
2288 // *can* overflow 252 bits in practice. Better to account for that
2289 // than to leave that kind of subtle corner case.
2290
2291 WIPE_BUFFER(tmp_a); WIPE_CTX(&tmp_d);
2292 WIPE_BUFFER(tmp_b); WIPE_CTX(&tmp_c);
2293 WIPE_BUFFER(s_scalar);
2294 }
2295
2296 void crypto_sign_public_key_custom_hash(u8 public_key[32],
2297 const u8 secret_key[32],
2298 const crypto_sign_vtable *hash)
2299 {
2300 u8 a[64];
2301 hash->hash(a, secret_key, 32);
2302 trim_scalar(a);
2303 ge A;
2304 ge_scalarmult_base(&A, a);
2305 ge_tobytes(public_key, &A);
2306 WIPE_BUFFER(a);
2307 WIPE_CTX(&A);
2308 }
2309
2310 void crypto_sign_public_key(u8 public_key[32], const u8 secret_key[32])
2311 {
2312 crypto_sign_public_key_custom_hash(public_key, secret_key,
2313 &crypto_blake2b_vtable);
2314 }
2315
2316 void crypto_sign_init_first_pass_custom_hash(crypto_sign_ctx_abstract *ctx,
2317 const u8 secret_key[32],
2318 const u8 public_key[32],
2319 const crypto_sign_vtable *hash)
2320 {
2321 ctx->hash = hash; // set vtable
2322 u8 *a = ctx->buf;
2323 u8 *prefix = ctx->buf + 32;
2324 ctx->hash->hash(a, secret_key, 32);
2325 trim_scalar(a);
2326
2327 if (public_key == 0) {
2328 crypto_sign_public_key_custom_hash(ctx->pk, secret_key, ctx->hash);
2329 } else {
2330 COPY(ctx->pk, public_key, 32);
2331 }
2332
2333 // Deterministic part of EdDSA: Construct a nonce by hashing the message
2334 // instead of generating a random number.
2335 // An actual random number would work just fine, and would save us
2336 // the trouble of hashing the message twice. If we did that
2337 // however, the user could fuck it up and reuse the nonce.
2338 ctx->hash->init (ctx);
2339 ctx->hash->update(ctx, prefix , 32);
2340 }
2341
2342 void crypto_sign_init_first_pass(crypto_sign_ctx_abstract *ctx,
2343 const u8 secret_key[32],
2344 const u8 public_key[32])
2345 {
2346 crypto_sign_init_first_pass_custom_hash(ctx, secret_key, public_key,
2347 &crypto_blake2b_vtable);
2348 }
2349
2350 void crypto_sign_update(crypto_sign_ctx_abstract *ctx,
2351 const u8 *msg, size_t msg_size)
2352 {
2353 ctx->hash->update(ctx, msg, msg_size);
2354 }
2355
2356 void crypto_sign_init_second_pass(crypto_sign_ctx_abstract *ctx)
2357 {
2358 u8 *r = ctx->buf + 32;
2359 u8 *half_sig = ctx->buf + 64;
2360 ctx->hash->final(ctx, r);
2361 reduce(r);
2362
2363 // first half of the signature = "random" nonce times the base point
2364 ge R;
2365 ge_scalarmult_base(&R, r);
2366 ge_tobytes(half_sig, &R);
2367 WIPE_CTX(&R);
2368
2369 // Hash R, the public key, and the message together.
2370 // It cannot be done in parallel with the first hash.
2371 ctx->hash->init (ctx);
2372 ctx->hash->update(ctx, half_sig, 32);
2373 ctx->hash->update(ctx, ctx->pk , 32);
2374 }
2375
2376 void crypto_sign_final(crypto_sign_ctx_abstract *ctx, u8 signature[64])
2377 {
2378 u8 *a = ctx->buf;
2379 u8 *r = ctx->buf + 32;
2380 u8 *half_sig = ctx->buf + 64;
2381 u8 h_ram[64];
2382 ctx->hash->final(ctx, h_ram);
2383 reduce(h_ram);
2384 COPY(signature, half_sig, 32);
2385 mul_add(signature + 32, h_ram, a, r); // s = h_ram * a + r
2386 WIPE_BUFFER(h_ram);
2387 crypto_wipe(ctx, ctx->hash->ctx_size);
2388 }
2389
2390 void crypto_sign(u8 signature[64],
2391 const u8 secret_key[32],
2392 const u8 public_key[32],
2393 const u8 *message, size_t message_size)
2394 {
2395 crypto_sign_ctx ctx;
2396 crypto_sign_ctx_abstract *actx = (crypto_sign_ctx_abstract*)&ctx;
2397 crypto_sign_init_first_pass (actx, secret_key, public_key);
2398 crypto_sign_update (actx, message, message_size);
2399 crypto_sign_init_second_pass(actx);
2400 crypto_sign_update (actx, message, message_size);
2401 crypto_sign_final (actx, signature);
2402 }
2403
2404 void crypto_check_init_custom_hash(crypto_check_ctx_abstract *ctx,
2405 const u8 signature[64],
2406 const u8 public_key[32],
2407 const crypto_sign_vtable *hash)
2408 {
2409 ctx->hash = hash; // set vtable
2410 COPY(ctx->buf, signature , 64);
2411 COPY(ctx->pk , public_key, 32);
2412 ctx->hash->init (ctx);
2413 ctx->hash->update(ctx, signature , 32);
2414 ctx->hash->update(ctx, public_key, 32);
2415 }
2416
2417 void crypto_check_init(crypto_check_ctx_abstract *ctx, const u8 signature[64],
2418 const u8 public_key[32])
2419 {
2420 crypto_check_init_custom_hash(ctx, signature, public_key,
2421 &crypto_blake2b_vtable);
2422 }
2423
2424 void crypto_check_update(crypto_check_ctx_abstract *ctx,
2425 const u8 *msg, size_t msg_size)
2426 {
2427 ctx->hash->update(ctx, msg, msg_size);
2428 }
2429
2430 int crypto_check_final(crypto_check_ctx_abstract *ctx)
2431 {
2432 u8 h_ram[64];
2433 ctx->hash->final(ctx, h_ram);
2434 reduce(h_ram);
2435 u8 *R = ctx->buf; // R
2436 u8 *s = ctx->buf + 32; // s
2437 u8 *R_check = ctx->pk; // overwrite ctx->pk to save stack space
2438 if (ge_r_check(R_check, s, h_ram, ctx->pk)) {
2439 return -1;
2440 }
2441 return crypto_verify32(R, R_check); // R == R_check ? OK : fail
2442 }
2443
2444 int crypto_check(const u8 signature[64], const u8 public_key[32],
2445 const u8 *message, size_t message_size)
2446 {
2447 crypto_check_ctx ctx;
2448 crypto_check_ctx_abstract *actx = (crypto_check_ctx_abstract*)&ctx;
2449 crypto_check_init (actx, signature, public_key);
2450 crypto_check_update(actx, message, message_size);
2451 return crypto_check_final(actx);
2452 }
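
// Usage sketch (not part of the library): signing and checking a message.
// Variable names are illustrative; sk is 32 random bytes, and msg/msg_size
// point to the message to sign.
//
//     u8 sk[32];   // secret key: 32 random bytes
//     u8 pk[32];
//     crypto_sign_public_key(pk, sk);
//     u8 sig[64];
//     crypto_sign(sig, sk, pk, msg, msg_size);
//     if (crypto_check(sig, pk, msg, msg_size) != 0) {
//         // corrupted or forged message
//     }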
2453
2454 ///////////////////////
2455 /// EdDSA to X25519 ///
2456 ///////////////////////
2457 void crypto_from_eddsa_private(u8 x25519[32], const u8 eddsa[32])
2458 {
2459 u8 a[64];
2460 crypto_blake2b(a, eddsa, 32);
2461 COPY(x25519, a, 32);
2462 WIPE_BUFFER(a);
2463 }
2464
2465 void crypto_from_eddsa_public(u8 x25519[32], const u8 eddsa[32])
2466 {
2467 fe t1, t2;
2468 fe_frombytes(t2, eddsa);
2469 fe_add(t1, fe_one, t2);
2470 fe_sub(t2, fe_one, t2);
2471 fe_invert(t2, t2);
2472 fe_mul(t1, t1, t2);
2473 fe_tobytes(x25519, t1);
2474 WIPE_BUFFER(t1);
2475 WIPE_BUFFER(t2);
2476 }
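
// Usage sketch (not part of the library): reusing an EdDSA key pair for
// X25519. Variable names are illustrative.
//
//     u8 ed_sk[32], ed_pk[32];                 // existing signing key pair
//     u8 x_sk [32], x_pk [32];
//     crypto_from_eddsa_private(x_sk, ed_sk);  // first half of the seed hash
//     crypto_from_eddsa_public (x_pk, ed_pk);  // u = (1+y) / (1-y)
//     // x_sk and x_pk can now be used with crypto_x25519() and
//     // crypto_key_exchange().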
2477
2478 /////////////////////////////////////////////
2479 /// Dirty ephemeral public key generation ///
2480 /////////////////////////////////////////////
2481
2482 // These functions generate a public key, *without* clearing the
2483 // cofactor. Sending that key over the network leaks 3 bits of the
2484 // private key. Use only to generate ephemeral keys that will be hidden
2485 // with crypto_curve_to_hidden().
2486 //
2487 // The public key is otherwise compatible with crypto_x25519() and
2488 // crypto_key_exchange() (those properly clear the cofactor).
2489 //
2490 // Note that the distribution of the resulting public keys is almost
2491 // uniform. Flipping the sign of the v coordinate (not provided by this
2492 // function) covers the entire key space almost perfectly, where
2493 // "almost" means a 2^-128 bias (undetectable). This uniformity is
2494 // needed to ensure the proper randomness of the resulting
2495 // representatives (once we apply crypto_curve_to_hidden()).
2496 //
2497 // Recall that Curve25519 has order C = 2^255 + e, with e < 2^128 (not
2498 // to be confused with the prime order of the main subgroup, L, which is
2499 // 8 times less than that).
2500 //
2501 // Generating all points would require us to multiply a point of order C
2502 // (the base point plus any point of order 8) by all scalars from 0 to
2503 // C-1. Clamping limits us to scalars between 2^254 and 2^255 - 1. But
2504 // by negating the resulting point at random, we also cover scalars from
2505 // -2^255 + 1 to -2^254 (which modulo C is congruent to e+1 to 2^254 + e).
2506 //
2507 // In practice:
2508 // - Scalars from 0 to e + 1 are never generated
2509 // - Scalars from 2^255 to 2^255 + e are never generated
2510 // - Scalars from 2^254 + 1 to 2^254 + e are generated twice
2511 //
2512 // Since e < 2^128, detecting this bias requires observing over 2^100
2513 // representatives from a given source (this will never happen), *and*
2514 // recovering enough of the private key to determine that they do, or do
2515 // not, belong to the biased set (this practically requires solving
2516 // discrete logarithm, which is conjecturally intractable).
2517 //
2518 // In practice, this means the bias is impossible to detect.
2519
2520 // s + (x*L) % 8*L
2521 // Guaranteed to fit in 256 bits iff s fits in 255 bits.
2522 // L < 2^253
2523 // x%8 < 2^3
2524 // L * (x%8) < 2^255 (using the tighter bound L < 1.01 * 2^252)
2525 // s < 2^255
2526 // s + L * (x%8) < 2^256
2527 static void add_xl(u8 s[32], u8 x)
2528 {
2529 u64 mod8 = x & 7;
2530 u64 carry = 0;
2531 FOR (i , 0, 8) {
2532 carry = carry + load32_le(s + 4*i) + L[i] * mod8;
2533 store32_le(s + 4*i, (u32)carry);
2534 carry >>= 32;
2535 }
2536 }
2537
2538 // "Small" dirty ephemeral key.
2539 // Use if you need to shrink the size of the binary, and can afford to
2540 // slow down by a factor of two (compared to the fast version)
2541 //
2542 // This version works by decoupling the cofactor from the main factor.
2543 //
2544 // - The trimmed scalar determines the main factor
2545 // - The clamped bits of the scalar determine the cofactor.
2546 //
2547 // Cofactor and main factor are combined into a single scalar, which is
2548 // then multiplied by a point of order 8*L (unlike the base point, which
2549 // has prime order). That "dirty" base point is the addition of the
2550 // regular base point (9), and a point of order 8.
2551 void crypto_x25519_dirty_small(u8 public_key[32], const u8 secret_key[32])
2552 {
2553 // Base point of order 8*L
2554 // Raw scalar multiplication with it does not clear the cofactor,
2555 // and the resulting public key will reveal 3 bits of the scalar.
2556 static const u8 dirty_base_point[32] = {
2557 0x34, 0xfc, 0x6c, 0xb7, 0xc8, 0xde, 0x58, 0x97, 0x77, 0x70, 0xd9, 0x52,
2558 0x16, 0xcc, 0xdc, 0x6c, 0x85, 0x90, 0xbe, 0xcd, 0x91, 0x9c, 0x07, 0x59,
2559 0x94, 0x14, 0x56, 0x3b, 0x4b, 0xa4, 0x47, 0x0f, };
2560 // separate the main factor & the cofactor of the scalar
2561 u8 scalar[32];
2562 COPY(scalar, secret_key, 32);
2563 trim_scalar(scalar);
2564
2565 // Separate the main factor and the cofactor
2566 //
2567 // The scalar is trimmed, so its cofactor is cleared. The three
2568 // least significant bits however still have a main factor. We must
2569 // remove it for X25519 compatibility.
2570 //
2571 // We exploit the fact that 5*L = 1 (modulo 8)
2572 // cofactor = lsb * 5 * L (modulo 8*L)
2573 // combined = scalar + cofactor (modulo 8*L)
2574 // combined = scalar + (lsb * 5 * L) (modulo 8*L)
2575 add_xl(scalar, secret_key[0] * 5);
2576 scalarmult(public_key, scalar, dirty_base_point, 256);
2577 WIPE_BUFFER(scalar);
2578 }
2579
2580 // "Fast" dirty ephemeral key
2581 // We use this one by default.
2582 //
2583 // This version works by performing a regular scalar multiplication,
2584 // then adding a low order point. The scalar multiplication is done in
2585 // Edwards space for more speed (*2 compared to the "small" version).
2586 // The cost is a bigger binary for programs that don't also sign messages.
2587 void crypto_x25519_dirty_fast(u8 public_key[32], const u8 secret_key[32])
2588 {
2589 u8 scalar[32];
2590 ge pk;
2591 COPY(scalar, secret_key, 32);
2592 trim_scalar(scalar);
2593 ge_scalarmult_base(&pk, scalar);
2594
2595 // Select low order point
2596 // We're computing the [cofactor]lop scalar multiplication, where:
2597 // cofactor = tweak & 7.
2598 // lop = (lop_x, lop_y)
2599 // lop_x = sqrt((sqrt(d + 1) + 1) / d)
2600 // lop_y = -lop_x * sqrtm1
2601 // Notes:
2602 // - A (single) Montgomery ladder would be twice as slow.
2603 // - An actual scalar multiplication would hurt performance.
2604 // - A full table lookup would take more code.
2605 u8 cofactor = secret_key[0] & 7;
2606 int a = (cofactor >> 2) & 1;
2607 int b = (cofactor >> 1) & 1;
2608 int c = (cofactor >> 0) & 1;
2609 fe t1, t2, t3;
2610 fe_0(t1);
2611 fe_ccopy(t1, sqrtm1, b);
2612 fe_ccopy(t1, lop_x , c);
2613 fe_neg (t3, t1);
2614 fe_ccopy(t1, t3, a);
2615 fe_1(t2);
2616 fe_0(t3);
2617 fe_ccopy(t2, t3 , b);
2618 fe_ccopy(t2, lop_y, c);
2619 fe_neg (t3, t2);
2620 fe_ccopy(t2, t3, a^b);
2621 ge_precomp low_order_point;
2622 fe_add(low_order_point.Yp, t2, t1);
2623 fe_sub(low_order_point.Ym, t2, t1);
2624 fe_mul(low_order_point.T2, t2, t1);
2625 fe_mul(low_order_point.T2, low_order_point.T2, D2);
2626
2627 // Add low order point to the public key
2628 ge_madd(&pk, &pk, &low_order_point, t1, t2);
2629
2630 // Convert to Montgomery u coordinate (we ignore the sign)
2631 fe_add(t1, pk.Z, pk.Y);
2632 fe_sub(t2, pk.Z, pk.Y);
2633 fe_invert(t2, t2);
2634 fe_mul(t1, t1, t2);
2635
2636 fe_tobytes(public_key, t1);
2637
2638 WIPE_BUFFER(t1); WIPE_BUFFER(scalar);
2639 WIPE_BUFFER(t2); WIPE_CTX(&pk);
2640 WIPE_BUFFER(t3); WIPE_CTX(&low_order_point);
2641 }
2642
2643 ///////////////////
2644 /// Elligator 2 ///
2645 ///////////////////
2646 static const fe A = {486662};
2647
2648 // Elligator direct map
2649 //
2650 // Computes the point corresponding to a representative, encoded in 32
2651 // bytes (little Endian). Since positive representatives fit in 254
2652 // bits, the two most significant bits are ignored.
2653 //
2654 // From the paper:
2655 // w = -A / (fe(1) + non_square * r^2)
2656 // e = chi(w^3 + A*w^2 + w)
2657 // u = e*w - (fe(1)-e)*(A//2)
2658 // v = -e * sqrt(u^3 + A*u^2 + u)
2659 //
2660 // We ignore v because we don't need it for X25519 (the Montgomery
2661 // ladder only uses u).
2662 //
2663 // Note that e is either 0, 1 or -1
2664 // if e = 0 u = 0 and v = 0
2665 // if e = 1 u = w
2666 // if e = -1 u = -w - A = w * non_square * r^2
2667 //
2668 // Let r1 = non_square * r^2
2669 // Let r2 = 1 + r1
2670 // Note that r2 cannot be zero, -1/non_square is not a square.
2671 // We can (tediously) verify that:
2672 // w^3 + A*w^2 + w = (A^2*r1 - r2^2) * A / r2^3
2673 // Therefore:
2674 // chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3))
2675 // chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * 1
2676 // chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * chi(r2^6)
2677 // chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3) * r2^6)
2678 // chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * A * r2^3)
2679 // Corollary:
2680 // e = 1 if ((A^2*r1 - r2^2) * A * r2^3) is a non-zero square
2681 // e = -1 if ((A^2*r1 - r2^2) * A * r2^3) is not a square
2682 // Note that w^3 + A*w^2 + w (and therefore e) can never be zero:
2683 // w^3 + A*w^2 + w = w * (w^2 + A*w + 1)
2684 // w^3 + A*w^2 + w = w * (w^2 + A*w + A^2/4 - A^2/4 + 1)
2685 // w^3 + A*w^2 + w = w * ((w + A/2)^2 - A^2/4 + 1)
2686 // which is zero only if:
2687 // w = 0 (impossible)
2688 // (w + A/2)^2 = A^2/4 - 1 (impossible, because A^2/4-1 is not a square)
2689 //
2690 // Let isr = invsqrt((A^2*r1 - r2^2) * A * r2^3)
2691 // isr = sqrt(1 / ((A^2*r1 - r2^2) * A * r2^3)) if e = 1
2692 // isr = sqrt(sqrt(-1) / ((A^2*r1 - r2^2) * A * r2^3)) if e = -1
2693 //
2694 // if e = 1
2695 // let u1 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2
2696 // u1 = w
2697 // u1 = u
2698 //
2699 // if e = -1
2700 // let ufactor = -non_square * sqrt(-1) * r^2
2701 // let vfactor = sqrt(ufactor)
2702 // let u2 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2 * ufactor
2703 // u2 = w * -1 * -non_square * r^2
2704 // u2 = w * non_square * r^2
2705 // u2 = u
2706 void crypto_hidden_to_curve(uint8_t curve[32], const uint8_t hidden[32])
2707 {
2708 // Representatives are encoded in 254 bits.
2709 // The two most significant ones are random padding that must be ignored.
2710 u8 clamped[32];
2711 COPY(clamped, hidden, 32);
2712 clamped[31] &= 0x3f;
2713
2714 fe r, u, t1, t2, t3;
2715 fe_frombytes(r, clamped);
2716 fe_sq2(t1, r);
2717 fe_add(u, t1, fe_one);
2718 fe_sq (t2, u);
2719 fe_mul(t3, A2, t1);
2720 fe_sub(t3, t3, t2);
2721 fe_mul(t3, t3, A);
2722 fe_mul(t1, t2, u);
2723 fe_mul(t1, t3, t1);
2724 int is_square = invsqrt(t1, t1);
2725 fe_sq(u, r);
2726 fe_mul(u, u, ufactor);
2727 fe_ccopy(u, fe_one, is_square);
2728 fe_sq (t1, t1);
2729 fe_mul(u, u, A);
2730 fe_mul(u, u, t3);
2731 fe_mul(u, u, t2);
2732 fe_mul(u, u, t1);
2733 fe_neg(u, u);
2734 fe_tobytes(curve, u);
2735
2736 WIPE_BUFFER(t1); WIPE_BUFFER(r);
2737 WIPE_BUFFER(t2); WIPE_BUFFER(u);
2738 WIPE_BUFFER(t3); WIPE_BUFFER(clamped);
2739 }
2740
2741 // Elligator inverse map
2742 //
2743 // Computes the representative of a point, if possible. If not, it does
2744 // nothing and returns -1. Note that the success of the operation
2745 // depends only on the point (more precisely its u coordinate). The
2746 // tweak parameter is used only upon success.
2747 //
2748 // The tweak should be a random byte. Beyond that, its contents are an
2749 // implementation detail. Currently, the tweak comprises:
2750 // - Bit 0 : sign of the v coordinate (0 if positive, 1 if negative)
2751 // - Bits 1-5: not used
2752 // - Bits 6-7: random padding
2753 //
2754 // From the paper:
2755 // Let sq = -non_square * u * (u+A)
2756 // if sq is not a square, or u = -A, there is no mapping
2757 // Assuming there is a mapping:
2758 // if v is positive: r = sqrt(-(u+A) / u)
2759 // if v is negative: r = sqrt(-u / (u+A))
2760 //
2761 // We compute isr = invsqrt(-non_square * u * (u+A))
2762 // if it wasn't a non-zero square, abort.
2763 // else, isr = sqrt(-1 / (non_square * u * (u+A)))
2764 //
2765 // This causes us to abort if u is zero, even though we shouldn't. This
2766 // never happens in practice, because (i) a random point on the curve has
2767 // a negligible chance of being zero, and (ii) scalar multiplication with
2768 // a trimmed scalar *never* yields zero.
2769 //
2770 // Since:
2771 // isr * (u+A) = sqrt(-1 / (non_square * u * (u+A))) * (u+A)
2772 // isr * (u+A) = sqrt(-(u+A) / (non_square * u))
2773 // and:
2774 // isr * u = sqrt(-1 / (non_square * u * (u+A))) * u
2775 // isr * u = sqrt(-u / (non_square * (u+A)))
2776 // Therefore:
2777 // if v is positive: r = isr * (u+A)
2778 // if v is negative: r = isr * u
2779 int crypto_curve_to_hidden(u8 hidden[32], const u8 public_key[32], u8 tweak)
2780 {
2781 fe t1, t2, t3;
2782 fe_frombytes(t1, public_key);
2783
2784 fe_add(t2, t1, A);
2785 fe_mul(t3, t1, t2);
2786 fe_mul_small(t3, t3, -2);
2787 int is_square = invsqrt(t3, t3);
2788 if (!is_square) {
2789 // The only variable time bit. This ultimately reveals how many
2790 // tries it took us to find a representable key.
2791 // This does not affect security as long as we try keys at random.
2792 WIPE_BUFFER(t1);
2793 WIPE_BUFFER(t2);
2794 WIPE_BUFFER(t3);
2795 return -1;
2796 }
2797 fe_ccopy (t1, t2, tweak & 1);
2798 fe_mul (t3, t1, t3);
2799 fe_mul_small(t1, t3, 2);
2800 fe_neg (t2, t3);
2801 fe_ccopy (t3, t2, fe_isodd(t1));
2802 fe_tobytes(hidden, t3);
2803
2804 // Pad with two random bits
2805 hidden[31] |= tweak & 0xc0;
2806
2807 WIPE_BUFFER(t1);
2808 WIPE_BUFFER(t2);
2809 WIPE_BUFFER(t3);
2810 return 0;
2811 }
2812
2813 void crypto_hidden_key_pair(u8 hidden[32], u8 secret_key[32], u8 seed[32])
2814 {
2815 u8 pk [32]; // public key
2816 u8 buf[64]; // seed + representative
2817 COPY(buf + 32, seed, 32);
2818 do {
2819 crypto_chacha20(buf, 0, 64, buf+32, zero);
2820 crypto_x25519_dirty_fast(pk, buf); // or the "small" version
2821 } while(crypto_curve_to_hidden(buf+32, pk, buf[32]));
2822 // Note that the return value of crypto_curve_to_hidden() is
2823 // independent from its tweak parameter.
2824 // Therefore, buf[32] is not actually reused. Either we loop one
2825 // more time and buf[32] is used for the new seed, or we succeeded,
2826 // and buf[32] becomes the tweak parameter.
2827
2828 crypto_wipe(seed, 32);
2829 COPY(hidden , buf + 32, 32);
2830 COPY(secret_key, buf , 32);
2831 WIPE_BUFFER(buf);
2832 WIPE_BUFFER(pk);
2833 }
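
// Usage sketch (not part of the library): a hidden ephemeral key exchange.
// Variable names are illustrative; seed is 32 random bytes (wiped by the
// call), and recipient_sk is the recipient's X25519 secret key.
//
//     u8 seed[32];                       // 32 random bytes
//     u8 hidden[32], esk[32];
//     crypto_hidden_key_pair(hidden, esk, seed);
//     // send `hidden` over the network; it looks like random noise.
//     // The recipient recovers the curve point, then proceeds as usual:
//     u8 epk[32], shared[32];
//     crypto_hidden_to_curve(epk, hidden);
//     crypto_x25519(shared, recipient_sk, epk);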
2834
2835 ////////////////////
2836 /// Key exchange ///
2837 ////////////////////
2838 void crypto_key_exchange(u8 shared_key[32],
2839 const u8 your_secret_key [32],
2840 const u8 their_public_key[32])
2841 {
2842 crypto_x25519(shared_key, your_secret_key, their_public_key);
2843 crypto_hchacha20(shared_key, shared_key, zero);
2844 }
2845
2846 ///////////////////////
2847 /// Scalar division ///
2848 ///////////////////////
2849
2850 // Montgomery reduction.
2851 // Divides x by (2^256), and reduces the result modulo L
2852 //
2853 // Precondition:
2854 // x < L * 2^256
2855 // Constants:
2856 // r = 2^256 (makes division by r trivial)
2857 // k = (r * (1/r) - 1) // L (1/r is computed modulo L )
2858 // Algorithm:
2859 // s = (x * k) % r
2860 // t = x + s*L (t is always a multiple of r)
2861 // u = (t/r) % L (u is always below 2*L, conditional subtraction is enough)
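//
// Why t is a multiple of r: by construction, k*L = r*(1/r mod L) - 1, so
// 1 + k*L is a multiple of r. Modulo r, we then have
//     t = x + s*L = x + (x*k)*L = x * (1 + k*L) = 0,
// so the division t/r in the last step is exact.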
2862 static void redc(u32 u[8], u32 x[16])
2863 {
2864 static const u32 k[8] = { 0x12547e1b, 0xd2b51da3, 0xfdba84ff, 0xb1a206f2,
2865 0xffa36bea, 0x14e75438, 0x6fe91836, 0x9db6c6f2,};
2866 static const u32 l[8] = { 0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
2867 0x00000000, 0x00000000, 0x00000000, 0x10000000,};
2868 // s = x * k (modulo 2^256)
2869 // This is cheaper than the full multiplication.
2870 u32 s[8] = {0};
2871 FOR (i, 0, 8) {
2872 u64 carry = 0;
2873 FOR (j, 0, 8-i) {
2874 carry += s[i+j] + (u64)x[i] * k[j];
2875 s[i+j] = (u32)carry;
2876 carry >>= 32;
2877 }
2878 }
2879 u32 t[16] = {0};
2880 multiply(t, s, l);
2881
2882 // t = t + x
2883 u64 carry = 0;
2884 FOR (i, 0, 16) {
2885 carry += (u64)t[i] + x[i];
2886 t[i] = (u32)carry;
2887 carry >>= 32;
2888 }
2889
2890 // u = (t / 2^256) % L
2891 // Note that t / 2^256 is always below 2*L,
2892 // So a constant time conditional subtraction is enough
2893 // We work with L directly, in a 2's complement encoding
2894 // (-L == ~L + 1)
2895 remove_l(u, t+8);
2896
2897 WIPE_BUFFER(s);
2898 WIPE_BUFFER(t);
2899 }
2900
2901 void crypto_x25519_inverse(u8 blind_salt [32], const u8 private_key[32],
2902 const u8 curve_point[32])
2903 {
2904 static const u8 Lm2[32] = { // L - 2
2905 0xeb, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, 0xd6, 0x9c, 0xf7, 0xa2,
2906 0xde, 0xf9, 0xde, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
2907 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, };
2908 // 1 in Montgomery form
2909 u32 m_inv [8] = {0x8d98951d, 0xd6ec3174, 0x737dcf70, 0xc6ef5bf4,
2910 0xfffffffe, 0xffffffff, 0xffffffff, 0x0fffffff,};
2911
2912 u8 scalar[32];
2913 COPY(scalar, private_key, 32);
2914 trim_scalar(scalar);
2915
2916 // Convert the scalar in Montgomery form
2917 // m_scl = scalar * 2^256 (modulo L)
2918 u32 m_scl[8];
2919 {
2920 u32 tmp[16];
2921 ZERO(tmp, 8);
2922 load32_le_buf(tmp+8, scalar, 8);
2923 mod_l(scalar, tmp);
2924 load32_le_buf(m_scl, scalar, 8);
2925 WIPE_BUFFER(tmp); // Wipe ASAP to save stack space
2926 }
2927
2928 u32 product[16];
2929 for (int i = 252; i >= 0; i--) {
2930 ZERO(product, 16);
2931 multiply(product, m_inv, m_inv);
2932 redc(m_inv, product);
2933 if (scalar_bit(Lm2, i)) {
2934 ZERO(product, 16);
2935 multiply(product, m_inv, m_scl);
2936 redc(m_inv, product);
2937 }
2938 }
2939 // Convert the inverse *out* of Montgomery form
2940 // scalar = m_inv / 2^256 (modulo L)
2941 COPY(product, m_inv, 8);
2942 ZERO(product + 8, 8);
2943 redc(m_inv, product);
2944 store32_le_buf(scalar, m_inv, 8); // the *inverse* of the scalar
2945
2946 // Clear the cofactor of scalar:
2947 // cleared = scalar * (3*L + 1) (modulo 8*L)
2948 // cleared = scalar + scalar * 3 * L (modulo 8*L)
2949 // Note that (scalar * 3) is reduced modulo 8, so we only need the
2950 // first byte.
2951 add_xl(scalar, scalar[0] * 3);
2952
2953 // Recall that 8*L < 2^256. However it is also very close to
2954 // 2^255. If we spanned the ladder over 255 bits, random tests
2955 // wouldn't catch the off-by-one error.
2956 scalarmult(blind_salt, scalar, curve_point, 256);
2957
2958 WIPE_BUFFER(scalar); WIPE_BUFFER(m_scl);
2959 WIPE_BUFFER(product); WIPE_BUFFER(m_inv);
2960 }
2961
2962 ////////////////////////////////
2963 /// Authenticated encryption ///
2964 ////////////////////////////////
2965 static void lock_auth(u8 mac[16], const u8 auth_key[32],
2966 const u8 *ad , size_t ad_size,
2967 const u8 *cipher_text, size_t text_size)
2968 {
2969 u8 sizes[16]; // Not secret, not wiped
2970 store64_le(sizes + 0, ad_size);
2971 store64_le(sizes + 8, text_size);
2972 crypto_poly1305_ctx poly_ctx; // auto wiped...
2973 crypto_poly1305_init (&poly_ctx, auth_key);
2974 crypto_poly1305_update(&poly_ctx, ad , ad_size);
2975 crypto_poly1305_update(&poly_ctx, zero , align(ad_size, 16));
2976 crypto_poly1305_update(&poly_ctx, cipher_text, text_size);
2977 crypto_poly1305_update(&poly_ctx, zero , align(text_size, 16));
2978 crypto_poly1305_update(&poly_ctx, sizes , 16);
2979 crypto_poly1305_final (&poly_ctx, mac); // ...here
2980 }
2981
2982 void crypto_lock_aead(u8 mac[16], u8 *cipher_text,
2983 const u8 key[32], const u8 nonce[24],
2984 const u8 *ad , size_t ad_size,
2985 const u8 *plain_text, size_t text_size)
2986 {
2987 u8 sub_key[32];
2988 u8 auth_key[64]; // "Wasting" the whole Chacha block is faster
2989 crypto_hchacha20(sub_key, key, nonce);
2990 crypto_chacha20(auth_key, 0, 64, sub_key, nonce + 16);
2991 crypto_chacha20_ctr(cipher_text, plain_text, text_size,
2992 sub_key, nonce + 16, 1);
2993 lock_auth(mac, auth_key, ad, ad_size, cipher_text, text_size);
2994 WIPE_BUFFER(sub_key);
2995 WIPE_BUFFER(auth_key);
2996 }
2997
2998 int crypto_unlock_aead(u8 *plain_text, const u8 key[32], const u8 nonce[24],
2999 const u8 mac[16],
3000 const u8 *ad , size_t ad_size,
3001 const u8 *cipher_text, size_t text_size)
3002 {
3003 u8 sub_key[32];
3004 u8 auth_key[64]; // "Wasting" the whole Chacha block is faster
3005 crypto_hchacha20(sub_key, key, nonce);
3006 crypto_chacha20(auth_key, 0, 64, sub_key, nonce + 16);
3007 u8 real_mac[16];
3008 lock_auth(real_mac, auth_key, ad, ad_size, cipher_text, text_size);
3009 WIPE_BUFFER(auth_key);
3010 if (crypto_verify16(mac, real_mac)) {
3011 WIPE_BUFFER(sub_key);
3012 WIPE_BUFFER(real_mac);
3013 return -1;
3014 }
3015 crypto_chacha20_ctr(plain_text, cipher_text, text_size,
3016 sub_key, nonce + 16, 1);
3017 WIPE_BUFFER(sub_key);
3018 WIPE_BUFFER(real_mac);
3019 return 0;
3020 }
3021
3022 void crypto_lock(u8 mac[16], u8 *cipher_text,
3023 const u8 key[32], const u8 nonce[24],
3024 const u8 *plain_text, size_t text_size)
3025 {
3026 crypto_lock_aead(mac, cipher_text, key, nonce, 0, 0, plain_text, text_size);
3027 }
3028
3029 int crypto_unlock(u8 *plain_text,
3030 const u8 key[32], const u8 nonce[24], const u8 mac[16],
3031 const u8 *cipher_text, size_t text_size)
3032 {
3033 return crypto_unlock_aead(plain_text, key, nonce, mac, 0, 0,
3034 cipher_text, text_size);
3035 }
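
// Usage sketch (not part of the library): authenticated encryption with the
// high level interface. Variable names are illustrative; the key is secret
// and random, and the nonce must never be reused with the same key.
//
//     u8 key  [32];                      // secret, random
//     u8 nonce[24];                      // public, unique per message
//     u8 mac  [16];
//     crypto_lock(mac, cipher, key, nonce, plain, size);
//     // ...transmit nonce, mac, and cipher...
//     if (crypto_unlock(plain, key, nonce, mac, cipher, size) != 0) {
//         // corrupted or forged: plain was not written
//     }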
3036