1 // Monocypher version 3.1.2
2 //
3 // This file is dual-licensed.  Choose whichever licence you want from
4 // the two licences listed below.
5 //
6 // The first licence is a regular 2-clause BSD licence.  The second licence
7 // is the CC-0 from Creative Commons. It is intended to release Monocypher
8 // to the public domain.  The BSD licence serves as a fallback option.
9 //
10 // SPDX-License-Identifier: BSD-2-Clause OR CC0-1.0
11 //
12 // ------------------------------------------------------------------------
13 //
14 // Copyright (c) 2017-2020, Loup Vaillant
15 // All rights reserved.
16 //
17 //
18 // Redistribution and use in source and binary forms, with or without
19 // modification, are permitted provided that the following conditions are
20 // met:
21 //
22 // 1. Redistributions of source code must retain the above copyright
23 //    notice, this list of conditions and the following disclaimer.
24 //
25 // 2. Redistributions in binary form must reproduce the above copyright
26 //    notice, this list of conditions and the following disclaimer in the
27 //    documentation and/or other materials provided with the
28 //    distribution.
29 //
30 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
31 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
32 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
33 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
34 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
35 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
36 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
37 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
38 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
39 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
40 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 //
42 // ------------------------------------------------------------------------
43 //
44 // Written in 2017-2020 by Loup Vaillant
45 //
46 // To the extent possible under law, the author(s) have dedicated all copyright
47 // and related neighboring rights to this software to the public domain
48 // worldwide.  This software is distributed without any warranty.
49 //
50 // You should have received a copy of the CC0 Public Domain Dedication along
51 // with this software.  If not, see
52 // <https://creativecommons.org/publicdomain/zero/1.0/>
53 
54 #include "monocypher.h"
55 
56 /////////////////
57 /// Utilities ///
58 /////////////////
59 #define FOR_T(type, i, start, end) for (type i = (start); i < (end); i++)
60 #define FOR(i, start, end)         FOR_T(size_t, i, start, end)
61 #define COPY(dst, src, size)       FOR(i, 0, size) (dst)[i] = (src)[i]
62 #define ZERO(buf, size)            FOR(i, 0, size) (buf)[i] = 0
63 #define WIPE_CTX(ctx)              crypto_wipe(ctx   , sizeof(*(ctx)))
64 #define WIPE_BUFFER(buffer)        crypto_wipe(buffer, sizeof(buffer))
65 #define MIN(a, b)                  ((a) <= (b) ? (a) : (b))
66 #define MAX(a, b)                  ((a) >= (b) ? (a) : (b))
67 
68 typedef int8_t   i8;
69 typedef uint8_t  u8;
70 typedef int16_t  i16;
71 typedef uint32_t u32;
72 typedef int32_t  i32;
73 typedef int64_t  i64;
74 typedef uint64_t u64;
75 
76 static const u8 zero[128] = {0};
77 
78 // returns the smallest positive integer y such that
79 // (x + y) % pow_2  == 0
80 // Basically, it's how many bytes we need to add to "align" x.
81 // Only works when pow_2 is a power of 2.
82 // Note: we use ~x+1 instead of -x to avoid compiler warnings
83 static size_t align(size_t x, size_t pow_2)
84 {
85     return (~x + 1) & (pow_2 - 1);
86 }
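// For instance, align(10, 16) == 6 (10 + 6 is a multiple of 16), and
// align(32, 16) == 0 (already aligned).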
87 
88 static u32 load24_le(const u8 s[3])
89 {
90     return (u32)s[0]
91         | ((u32)s[1] <<  8)
92         | ((u32)s[2] << 16);
93 }
94 
95 static u32 load32_le(const u8 s[4])
96 {
97     return (u32)s[0]
98         | ((u32)s[1] <<  8)
99         | ((u32)s[2] << 16)
100         | ((u32)s[3] << 24);
101 }
102 
103 static u64 load64_le(const u8 s[8])
104 {
105     return load32_le(s) | ((u64)load32_le(s+4) << 32);
106 }
107 
108 static void store32_le(u8 out[4], u32 in)
109 {
110     out[0] =  in        & 0xff;
111     out[1] = (in >>  8) & 0xff;
112     out[2] = (in >> 16) & 0xff;
113     out[3] = (in >> 24) & 0xff;
114 }
115 
116 static void store64_le(u8 out[8], u64 in)
117 {
118     store32_le(out    , (u32)in );
119     store32_le(out + 4, in >> 32);
120 }
121 
122 static void load32_le_buf (u32 *dst, const u8 *src, size_t size) {
123     FOR(i, 0, size) { dst[i] = load32_le(src + i*4); }
124 }
125 static void load64_le_buf (u64 *dst, const u8 *src, size_t size) {
126     FOR(i, 0, size) { dst[i] = load64_le(src + i*8); }
127 }
128 static void store32_le_buf(u8 *dst, const u32 *src, size_t size) {
129     FOR(i, 0, size) { store32_le(dst + i*4, src[i]); }
130 }
131 static void store64_le_buf(u8 *dst, const u64 *src, size_t size) {
132     FOR(i, 0, size) { store64_le(dst + i*8, src[i]); }
133 }
134 
135 static u64 rotr64(u64 x, u64 n) { return (x >> n) ^ (x << (64 - n)); }
136 static u32 rotl32(u32 x, u32 n) { return (x << n) ^ (x >> (32 - n)); }
137 
138 static int neq0(u64 diff)
139 {   // constant time comparison to zero
140     // return diff != 0 ? -1 : 0
141     u64 half = (diff >> 32) | ((u32)diff);
142     return (1 & ((half - 1) >> 32)) - 1;
143 }
144 
145 static u64 x16(const u8 a[16], const u8 b[16])
146 {
147     return (load64_le(a + 0) ^ load64_le(b + 0))
148         |  (load64_le(a + 8) ^ load64_le(b + 8));
149 }
150 static u64 x32(const u8 a[32],const u8 b[32]){return x16(a,b)| x16(a+16, b+16);}
151 static u64 x64(const u8 a[64],const u8 b[64]){return x32(a,b)| x32(a+32, b+32);}
152 int crypto_verify16(const u8 a[16], const u8 b[16]){ return neq0(x16(a, b)); }
153 int crypto_verify32(const u8 a[32], const u8 b[32]){ return neq0(x32(a, b)); }
154 int crypto_verify64(const u8 a[64], const u8 b[64]){ return neq0(x64(a, b)); }
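// Usage sketch (illustrative): the verify functions return 0 when the two
// buffers are equal and -1 otherwise, without leaking timing information.
// Buffer names below are hypothetical:
//
//     if (crypto_verify16(received_mac, computed_mac) != 0) {
//         // authentication failed, reject the message
//     }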
155 
156 void crypto_wipe(void *secret, size_t size)
157 {
158     volatile u8 *v_secret = (u8*)secret;
159     ZERO(v_secret, size);
160 }
161 
162 /////////////////
163 /// Chacha 20 ///
164 /////////////////
165 #define QUARTERROUND(a, b, c, d)     \
166     a += b;  d = rotl32(d ^ a, 16);  \
167     c += d;  b = rotl32(b ^ c, 12);  \
168     a += b;  d = rotl32(d ^ a,  8);  \
169     c += d;  b = rotl32(b ^ c,  7)
170 
171 static void chacha20_rounds(u32 out[16], const u32 in[16])
172 {
173     // The temporary variables make Chacha20 10% faster.
174     u32 t0  = in[ 0];  u32 t1  = in[ 1];  u32 t2  = in[ 2];  u32 t3  = in[ 3];
175     u32 t4  = in[ 4];  u32 t5  = in[ 5];  u32 t6  = in[ 6];  u32 t7  = in[ 7];
176     u32 t8  = in[ 8];  u32 t9  = in[ 9];  u32 t10 = in[10];  u32 t11 = in[11];
177     u32 t12 = in[12];  u32 t13 = in[13];  u32 t14 = in[14];  u32 t15 = in[15];
178 
179     FOR (i, 0, 10) { // 20 rounds, 2 rounds per loop.
180         QUARTERROUND(t0, t4, t8 , t12); // column 0
181         QUARTERROUND(t1, t5, t9 , t13); // column 1
182         QUARTERROUND(t2, t6, t10, t14); // column 2
183         QUARTERROUND(t3, t7, t11, t15); // column 3
184         QUARTERROUND(t0, t5, t10, t15); // diagonal 0
185         QUARTERROUND(t1, t6, t11, t12); // diagonal 1
186         QUARTERROUND(t2, t7, t8 , t13); // diagonal 2
187         QUARTERROUND(t3, t4, t9 , t14); // diagonal 3
188     }
189     out[ 0] = t0;   out[ 1] = t1;   out[ 2] = t2;   out[ 3] = t3;
190     out[ 4] = t4;   out[ 5] = t5;   out[ 6] = t6;   out[ 7] = t7;
191     out[ 8] = t8;   out[ 9] = t9;   out[10] = t10;  out[11] = t11;
192     out[12] = t12;  out[13] = t13;  out[14] = t14;  out[15] = t15;
193 }
194 
195 static void chacha20_init_key(u32 block[16], const u8 key[32])
196 {
197     load32_le_buf(block  , (const u8*)"expand 32-byte k", 4); // constant
198     load32_le_buf(block+4, key                          , 8); // key
199 }
200 
201 void crypto_hchacha20(u8 out[32], const u8 key[32], const u8 in [16])
202 {
203     u32 block[16];
204     chacha20_init_key(block, key);
205     // input
206     load32_le_buf(block + 12, in, 4);
207     chacha20_rounds(block, block);
208     // prevent reversal of the rounds by revealing only half of the buffer.
209     store32_le_buf(out   , block   , 4); // constant
210     store32_le_buf(out+16, block+12, 4); // counter and nonce
211     WIPE_BUFFER(block);
212 }
213 
214 u64 crypto_chacha20_ctr(u8 *cipher_text, const u8 *plain_text,
215                         size_t text_size, const u8 key[32], const u8 nonce[8],
216                         u64 ctr)
217 {
218     u32 input[16];
219     chacha20_init_key(input, key);
220     input[12] = (u32) ctr;
221     input[13] = (u32)(ctr >> 32);
222     load32_le_buf(input+14, nonce, 2);
223 
224     // Whole blocks
225     u32    pool[16];
226     size_t nb_blocks = text_size >> 6;
227     FOR (i, 0, nb_blocks) {
228         chacha20_rounds(pool, input);
229         if (plain_text != 0) {
230             FOR (j, 0, 16) {
231                 u32 p = pool[j] + input[j];
232                 store32_le(cipher_text, p ^ load32_le(plain_text));
233                 cipher_text += 4;
234                 plain_text  += 4;
235             }
236         } else {
237             FOR (j, 0, 16) {
238                 u32 p = pool[j] + input[j];
239                 store32_le(cipher_text, p);
240                 cipher_text += 4;
241             }
242         }
243         input[12]++;
244         if (input[12] == 0) {
245             input[13]++;
246         }
247     }
248     text_size &= 63;
249 
250     // Last (incomplete) block
251     if (text_size > 0) {
252         if (plain_text == 0) {
253             plain_text = zero;
254         }
255         chacha20_rounds(pool, input);
256         u8 tmp[64];
257         FOR (i, 0, 16) {
258             store32_le(tmp + i*4, pool[i] + input[i]);
259         }
260         FOR (i, 0, text_size) {
261             cipher_text[i] = tmp[i] ^ plain_text[i];
262         }
263         WIPE_BUFFER(tmp);
264     }
265     ctr = input[12] + ((u64)input[13] << 32) + (text_size > 0);
266 
267     WIPE_BUFFER(pool);
268     WIPE_BUFFER(input);
269     return ctr;
270 }
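// Usage sketch (illustrative): the returned counter lets a long message be
// processed in several calls; all chunks except the last should be multiples
// of 64 bytes so the key stream lines up.  key, nonce, plain, and cipher are
// caller-provided buffers, sizes are just for the example:
//
//     u64 ctr = 0;
//     ctr = crypto_chacha20_ctr(cipher,        plain,        4096, key, nonce, ctr);
//     ctr = crypto_chacha20_ctr(cipher + 4096, plain + 4096,  512, key, nonce, ctr);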
271 
272 u32 crypto_ietf_chacha20_ctr(u8 *cipher_text, const u8 *plain_text,
273                              size_t text_size,
274                              const u8 key[32], const u8 nonce[12], u32 ctr)
275 {
276     u64 big_ctr = ctr + ((u64)load32_le(nonce) << 32);
277     return (u32)crypto_chacha20_ctr(cipher_text, plain_text, text_size,
278                                     key, nonce + 4, big_ctr);
279 }
280 
281 u64 crypto_xchacha20_ctr(u8 *cipher_text, const u8 *plain_text,
282                          size_t text_size,
283                          const u8 key[32], const u8 nonce[24], u64 ctr)
284 {
285     u8 sub_key[32];
286     crypto_hchacha20(sub_key, key, nonce);
287     ctr = crypto_chacha20_ctr(cipher_text, plain_text, text_size,
288                               sub_key, nonce+16, ctr);
289     WIPE_BUFFER(sub_key);
290     return ctr;
291 }
292 
293 void crypto_chacha20(u8 *cipher_text, const u8 *plain_text, size_t text_size,
294                      const u8 key[32], const u8 nonce[8])
295 {
296     crypto_chacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
297 
298 }
299 void crypto_ietf_chacha20(u8 *cipher_text, const u8 *plain_text,
300                           size_t text_size,
301                           const u8 key[32], const u8 nonce[12])
302 {
303     crypto_ietf_chacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
304 }
305 
306 void crypto_xchacha20(u8 *cipher_text, const u8 *plain_text, size_t text_size,
307                       const u8 key[32], const u8 nonce[24])
308 {
309     crypto_xchacha20_ctr(cipher_text, plain_text, text_size, key, nonce, 0);
310 }
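// Usage sketch (illustrative): one-shot XChaCha20 with a 24-byte nonce, which
// is large enough to be chosen at random.  Decryption is the same operation
// applied to the cipher text:
//
//     crypto_xchacha20(cipher, plain,  size, key, nonce); // encrypt
//     crypto_xchacha20(plain,  cipher, size, key, nonce); // decrypt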
311 
312 /////////////////
313 /// Poly 1305 ///
314 /////////////////
315 
316 // h = (h + c) * r
317 // preconditions:
318 //   ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
319 //   ctx->c <= 1_ffffffff_ffffffff_ffffffff_ffffffff
320 //   ctx->r <=   0ffffffc_0ffffffc_0ffffffc_0fffffff
321 // Postcondition:
322 //   ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
323 static void poly_block(crypto_poly1305_ctx *ctx)
324 {
325     // s = h + c, without carry propagation
326     const u64 s0 = ctx->h[0] + (u64)ctx->c[0]; // s0 <= 1_fffffffe
327     const u64 s1 = ctx->h[1] + (u64)ctx->c[1]; // s1 <= 1_fffffffe
328     const u64 s2 = ctx->h[2] + (u64)ctx->c[2]; // s2 <= 1_fffffffe
329     const u64 s3 = ctx->h[3] + (u64)ctx->c[3]; // s3 <= 1_fffffffe
330     const u32 s4 = ctx->h[4] +      ctx->c[4]; // s4 <=          5
331 
332     // Local all the things!
333     const u32 r0 = ctx->r[0];       // r0  <= 0fffffff
334     const u32 r1 = ctx->r[1];       // r1  <= 0ffffffc
335     const u32 r2 = ctx->r[2];       // r2  <= 0ffffffc
336     const u32 r3 = ctx->r[3];       // r3  <= 0ffffffc
337     const u32 rr0 = (r0 >> 2) * 5;  // rr0 <= 13fffffb // lose 2 bits...
338     const u32 rr1 = (r1 >> 2) + r1; // rr1 <= 13fffffb // rr1 == (r1 >> 2) * 5
339     const u32 rr2 = (r2 >> 2) + r2; // rr2 <= 13fffffb // rr2 == (r2 >> 2) * 5
340     const u32 rr3 = (r3 >> 2) + r3; // rr3 <= 13fffffb // rr3 == (r3 >> 2) * 5
341 
342     // (h + c) * r, without carry propagation
343     const u64 x0 = s0*r0+ s1*rr3+ s2*rr2+ s3*rr1+ s4*rr0; // <= 97ffffe007fffff8
344     const u64 x1 = s0*r1+ s1*r0 + s2*rr3+ s3*rr2+ s4*rr1; // <= 8fffffe20ffffff6
345     const u64 x2 = s0*r2+ s1*r1 + s2*r0 + s3*rr3+ s4*rr2; // <= 87ffffe417fffff4
346     const u64 x3 = s0*r3+ s1*r2 + s2*r1 + s3*r0 + s4*rr3; // <= 7fffffe61ffffff2
347     const u32 x4 = s4 * (r0 & 3); // ...recover 2 bits    // <=                f
348 
349     // partial reduction modulo 2^130 - 5
350     const u32 u5 = x4 + (x3 >> 32); // u5 <= 7ffffff5
351     const u64 u0 = (u5 >>  2) * 5 + (x0 & 0xffffffff);
352     const u64 u1 = (u0 >> 32)     + (x1 & 0xffffffff) + (x0 >> 32);
353     const u64 u2 = (u1 >> 32)     + (x2 & 0xffffffff) + (x1 >> 32);
354     const u64 u3 = (u2 >> 32)     + (x3 & 0xffffffff) + (x2 >> 32);
355     const u64 u4 = (u3 >> 32)     + (u5 & 3);
356 
357     // Update the hash
358     ctx->h[0] = (u32)u0; // u0 <= 1_9ffffff0
359     ctx->h[1] = (u32)u1; // u1 <= 1_97ffffe0
360     ctx->h[2] = (u32)u2; // u2 <= 1_8fffffe2
361     ctx->h[3] = (u32)u3; // u3 <= 1_87ffffe4
362     ctx->h[4] = (u32)u4; // u4 <=          4
363 }
364 
365 // (re-)initialises the input counter and input buffer
366 static void poly_clear_c(crypto_poly1305_ctx *ctx)
367 {
368     ZERO(ctx->c, 4);
369     ctx->c_idx = 0;
370 }
371 
372 static void poly_take_input(crypto_poly1305_ctx *ctx, u8 input)
373 {
374     size_t word = ctx->c_idx >> 2;
375     size_t byte = ctx->c_idx & 3;
376     ctx->c[word] |= (u32)input << (byte * 8);
377     ctx->c_idx++;
378 }
379 
380 static void poly_update(crypto_poly1305_ctx *ctx,
381                         const u8 *message, size_t message_size)
382 {
383     FOR (i, 0, message_size) {
384         poly_take_input(ctx, message[i]);
385         if (ctx->c_idx == 16) {
386             poly_block(ctx);
387             poly_clear_c(ctx);
388         }
389     }
390 }
391 
392 void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32])
393 {
394     // Initial hash is zero
395     ZERO(ctx->h, 5);
396     // add 2^130 to every input block
397     ctx->c[4] = 1;
398     poly_clear_c(ctx);
399     // load r and pad (r has some of its bits cleared)
400     load32_le_buf(ctx->r  , key   , 4);
401     load32_le_buf(ctx->pad, key+16, 4);
402     FOR (i, 0, 1) { ctx->r[i] &= 0x0fffffff; }
403     FOR (i, 1, 4) { ctx->r[i] &= 0x0ffffffc; }
404 }
405 
406 void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
407                             const u8 *message, size_t message_size)
408 {
409     if (message_size == 0) {
410         return;
411     }
412     // Align ourselves with block boundaries
413     size_t aligned = MIN(align(ctx->c_idx, 16), message_size);
414     poly_update(ctx, message, aligned);
415     message      += aligned;
416     message_size -= aligned;
417 
418     // Process the message block by block
419     size_t nb_blocks = message_size >> 4;
420     FOR (i, 0, nb_blocks) {
421         load32_le_buf(ctx->c, message, 4);
422         poly_block(ctx);
423         message += 16;
424     }
425     if (nb_blocks > 0) {
426         poly_clear_c(ctx);
427     }
428     message_size &= 15;
429 
430     // remaining bytes
431     poly_update(ctx, message, message_size);
432 }
433 
434 void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16])
435 {
436     // Process the last block (if any)
437     if (ctx->c_idx != 0) {
438         // move the final 1 according to remaining input length
439         // (We may add less than 2^130 to the last input block)
440         ctx->c[4] = 0;
441         poly_take_input(ctx, 1);
442         // one last hash update
443         poly_block(ctx);
444     }
445 
446     // check if we should subtract 2^130-5 by performing the
447     // corresponding carry propagation.
448     u64 c = 5;
449     FOR (i, 0, 4) {
450         c  += ctx->h[i];
451         c >>= 32;
452     }
453     c += ctx->h[4];
454     c  = (c >> 2) * 5; // shift the carry back to the beginning
455     // c now indicates how many times we should subtract 2^130-5 (0 or 1)
456     FOR (i, 0, 4) {
457         c += (u64)ctx->h[i] + ctx->pad[i];
458         store32_le(mac + i*4, (u32)c);
459         c = c >> 32;
460     }
461     WIPE_CTX(ctx);
462 }
463 
464 void crypto_poly1305(u8     mac[16],  const u8 *message,
465                      size_t message_size, const u8  key[32])
466 {
467     crypto_poly1305_ctx ctx;
468     crypto_poly1305_init  (&ctx, key);
469     crypto_poly1305_update(&ctx, message, message_size);
470     crypto_poly1305_final (&ctx, mac);
471 }
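// Usage sketch (illustrative): Poly1305 is a one-time authenticator; the
// 32-byte key must be used for a single message only (it is typically derived
// from a fresh ChaCha20 block, as in authenticated encryption):
//
//     crypto_poly1305(mac, message, message_size, one_time_key);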
472 
473 ////////////////
474 /// Blake2 b ///
475 ////////////////
476 static const u64 iv[8] = {
477     0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
478     0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
479     0x510e527fade682d1, 0x9b05688c2b3e6c1f,
480     0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
481 };
482 
483 // increment the input offset
484 static void blake2b_incr(crypto_blake2b_ctx *ctx)
485 {
486     u64   *x = ctx->input_offset;
487     size_t y = ctx->input_idx;
488     x[0] += y;
489     if (x[0] < y) {
490         x[1]++;
491     }
492 }
493 
494 static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block)
495 {
496     static const u8 sigma[12][16] = {
497         {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
498         { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
499         { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
500         {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
501         {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
502         {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
503         { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
504         { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
505         {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
506         { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
507         {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
508         { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
509     };
510 
511     // init work vector
512     u64 v0 = ctx->hash[0];  u64 v8  = iv[0];
513     u64 v1 = ctx->hash[1];  u64 v9  = iv[1];
514     u64 v2 = ctx->hash[2];  u64 v10 = iv[2];
515     u64 v3 = ctx->hash[3];  u64 v11 = iv[3];
516     u64 v4 = ctx->hash[4];  u64 v12 = iv[4] ^ ctx->input_offset[0];
517     u64 v5 = ctx->hash[5];  u64 v13 = iv[5] ^ ctx->input_offset[1];
518     u64 v6 = ctx->hash[6];  u64 v14 = iv[6] ^ (u64)~(is_last_block - 1);
519     u64 v7 = ctx->hash[7];  u64 v15 = iv[7];
520 
521     // mangle work vector
522     u64 *input = ctx->input;
523 #define BLAKE2_G(a, b, c, d, x, y)      \
524     a += b + x;  d = rotr64(d ^ a, 32); \
525     c += d;      b = rotr64(b ^ c, 24); \
526     a += b + y;  d = rotr64(d ^ a, 16); \
527     c += d;      b = rotr64(b ^ c, 63)
528 #define BLAKE2_ROUND(i)                                                 \
529     BLAKE2_G(v0, v4, v8 , v12, input[sigma[i][ 0]], input[sigma[i][ 1]]); \
530     BLAKE2_G(v1, v5, v9 , v13, input[sigma[i][ 2]], input[sigma[i][ 3]]); \
531     BLAKE2_G(v2, v6, v10, v14, input[sigma[i][ 4]], input[sigma[i][ 5]]); \
532     BLAKE2_G(v3, v7, v11, v15, input[sigma[i][ 6]], input[sigma[i][ 7]]); \
533     BLAKE2_G(v0, v5, v10, v15, input[sigma[i][ 8]], input[sigma[i][ 9]]); \
534     BLAKE2_G(v1, v6, v11, v12, input[sigma[i][10]], input[sigma[i][11]]); \
535     BLAKE2_G(v2, v7, v8 , v13, input[sigma[i][12]], input[sigma[i][13]]); \
536     BLAKE2_G(v3, v4, v9 , v14, input[sigma[i][14]], input[sigma[i][15]])
537 
538 #ifdef BLAKE2_NO_UNROLLING
539     FOR (i, 0, 12) {
540         BLAKE2_ROUND(i);
541     }
542 #else
543     BLAKE2_ROUND(0);  BLAKE2_ROUND(1);  BLAKE2_ROUND(2);  BLAKE2_ROUND(3);
544     BLAKE2_ROUND(4);  BLAKE2_ROUND(5);  BLAKE2_ROUND(6);  BLAKE2_ROUND(7);
545     BLAKE2_ROUND(8);  BLAKE2_ROUND(9);  BLAKE2_ROUND(10); BLAKE2_ROUND(11);
546 #endif
547 
548     // update hash
549     ctx->hash[0] ^= v0 ^ v8;   ctx->hash[1] ^= v1 ^ v9;
550     ctx->hash[2] ^= v2 ^ v10;  ctx->hash[3] ^= v3 ^ v11;
551     ctx->hash[4] ^= v4 ^ v12;  ctx->hash[5] ^= v5 ^ v13;
552     ctx->hash[6] ^= v6 ^ v14;  ctx->hash[7] ^= v7 ^ v15;
553 }
554 
555 static void blake2b_set_input(crypto_blake2b_ctx *ctx, u8 input, size_t index)
556 {
557     if (index == 0) {
558         ZERO(ctx->input, 16);
559     }
560     size_t word = index >> 3;
561     size_t byte = index & 7;
562     ctx->input[word] |= (u64)input << (byte << 3);
563 
564 }
565 
566 static void blake2b_end_block(crypto_blake2b_ctx *ctx)
567 {
568     if (ctx->input_idx == 128) {  // If buffer is full,
569         blake2b_incr(ctx);        // update the input offset
570         blake2b_compress(ctx, 0); // and compress the (not last) block
571         ctx->input_idx = 0;
572     }
573 }
574 
575 static void blake2b_update(crypto_blake2b_ctx *ctx,
576                            const u8 *message, size_t message_size)
577 {
578     FOR (i, 0, message_size) {
579         blake2b_end_block(ctx);
580         blake2b_set_input(ctx, message[i], ctx->input_idx);
581         ctx->input_idx++;
582     }
583 }
584 
585 void crypto_blake2b_general_init(crypto_blake2b_ctx *ctx, size_t hash_size,
586                                  const u8           *key, size_t key_size)
587 {
588     // initial hash
589     COPY(ctx->hash, iv, 8);
590     ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ hash_size;
591 
592     ctx->input_offset[0] = 0;         // beginning of the input, no offset
593     ctx->input_offset[1] = 0;         // beginning of the input, no offset
594     ctx->hash_size       = hash_size; // remember the hash size we want
595     ctx->input_idx       = 0;
596 
597     // if there is a key, the first block is that key (padded with zeroes)
598     if (key_size > 0) {
599         u8 key_block[128] = {0};
600         COPY(key_block, key, key_size);
601         // same as calling crypto_blake2b_update(ctx, key_block , 128)
602         load64_le_buf(ctx->input, key_block, 16);
603         ctx->input_idx = 128;
604     }
605 }
606 
607 void crypto_blake2b_init(crypto_blake2b_ctx *ctx)
608 {
609     crypto_blake2b_general_init(ctx, 64, 0, 0);
610 }
611 
612 void crypto_blake2b_update(crypto_blake2b_ctx *ctx,
613                            const u8 *message, size_t message_size)
614 {
615     if (message_size == 0) {
616         return;
617     }
618     // Align ourselves with block boundaries
619     size_t aligned = MIN(align(ctx->input_idx, 128), message_size);
620     blake2b_update(ctx, message, aligned);
621     message      += aligned;
622     message_size -= aligned;
623 
624     // Process the message block by block
625     FOR (i, 0, message_size >> 7) { // number of blocks
626         blake2b_end_block(ctx);
627         load64_le_buf(ctx->input, message, 16);
628         message += 128;
629         ctx->input_idx = 128;
630     }
631     message_size &= 127;
632 
633     // remaining bytes
634     blake2b_update(ctx, message, message_size);
635 }
636 
637 void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *hash)
638 {
639     // Pad the end of the block with zeroes
640     FOR (i, ctx->input_idx, 128) {
641         blake2b_set_input(ctx, 0, i);
642     }
643     blake2b_incr(ctx);        // update the input offset
644     blake2b_compress(ctx, 1); // compress the last block
645     size_t nb_words = ctx->hash_size >> 3;
646     store64_le_buf(hash, ctx->hash, nb_words);
647     FOR (i, nb_words << 3, ctx->hash_size) {
648         hash[i] = (ctx->hash[i >> 3] >> (8 * (i & 7))) & 0xff;
649     }
650     WIPE_CTX(ctx);
651 }
652 
653 void crypto_blake2b_general(u8       *hash   , size_t hash_size,
654                             const u8 *key    , size_t key_size,
655                             const u8 *message, size_t message_size)
656 {
657     crypto_blake2b_ctx ctx;
658     crypto_blake2b_general_init(&ctx, hash_size, key, key_size);
659     crypto_blake2b_update(&ctx, message, message_size);
660     crypto_blake2b_final(&ctx, hash);
661 }
662 
663 void crypto_blake2b(u8 hash[64], const u8 *message, size_t message_size)
664 {
665     crypto_blake2b_general(hash, 64, 0, 0, message, message_size);
666 }
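// Usage sketch (illustrative): hashing a message in two pieces with the
// incremental interface gives the same 64-byte digest as the one-shot call:
//
//     crypto_blake2b_ctx ctx;
//     crypto_blake2b_init  (&ctx);
//     crypto_blake2b_update(&ctx, part1, part1_size);
//     crypto_blake2b_update(&ctx, part2, part2_size);
//     crypto_blake2b_final (&ctx, hash);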
667 
668 static void blake2b_vtable_init(void *ctx) {
669     crypto_blake2b_init(&((crypto_sign_ctx*)ctx)->hash);
670 }
671 static void blake2b_vtable_update(void *ctx, const u8 *m, size_t s) {
672     crypto_blake2b_update(&((crypto_sign_ctx*)ctx)->hash, m, s);
673 }
674 static void blake2b_vtable_final(void *ctx, u8 *h) {
675     crypto_blake2b_final(&((crypto_sign_ctx*)ctx)->hash, h);
676 }
677 const crypto_sign_vtable crypto_blake2b_vtable = {
678     crypto_blake2b,
679     blake2b_vtable_init,
680     blake2b_vtable_update,
681     blake2b_vtable_final,
682     sizeof(crypto_sign_ctx),
683 };
684 
685 ////////////////
686 /// Argon2 i ///
687 ////////////////
688 // references to R, Z, Q etc. come from the spec
689 
690 // Argon2 operates on 1024 byte blocks.
691 typedef struct { u64 a[128]; } block;
692 
693 static void wipe_block(block *b)
694 {
695     volatile u64* a = b->a;
696     ZERO(a, 128);
697 }
698 
699 // updates a Blake2 hash with a 32 bit word, little endian.
700 static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input)
701 {
702     u8 buf[4];
703     store32_le(buf, input);
704     crypto_blake2b_update(ctx, buf, 4);
705     WIPE_BUFFER(buf);
706 }
707 
708 static void load_block(block *b, const u8 bytes[1024])
709 {
710     load64_le_buf(b->a, bytes, 128);
711 }
712 
713 static void store_block(u8 bytes[1024], const block *b)
714 {
715     store64_le_buf(bytes, b->a, 128);
716 }
717 
718 static void copy_block(block *o,const block*in){FOR(i,0,128)o->a[i] = in->a[i];}
719 static void  xor_block(block *o,const block*in){FOR(i,0,128)o->a[i]^= in->a[i];}
720 
721 // Hash with a virtually unlimited digest size.
722 // Doesn't extract more entropy than the base hash function.
723 // Mainly used for filling a whole kilobyte block with pseudo-random bytes.
724 // (One could use a stream cipher with a seed hash as the key, but
725 //  this would introduce another dependency —and point of failure.)
726 static void extended_hash(u8       *digest, u32 digest_size,
727                           const u8 *input , u32 input_size)
728 {
729     crypto_blake2b_ctx ctx;
730     crypto_blake2b_general_init(&ctx, MIN(digest_size, 64), 0, 0);
731     blake_update_32            (&ctx, digest_size);
732     crypto_blake2b_update      (&ctx, input, input_size);
733     crypto_blake2b_final       (&ctx, digest);
734 
735     if (digest_size > 64) {
736         // the conversion to u64 avoids integer overflow on
737         // ludicrously big hash sizes.
738         u32 r   = (u32)(((u64)digest_size + 31) >> 5) - 2;
739         u32 i   =  1;
740         u32 in  =  0;
741         u32 out = 32;
742         while (i < r) {
743             // Input and output overlap. This is intentional
744             crypto_blake2b(digest + out, digest + in, 64);
745             i   +=  1;
746             in  += 32;
747             out += 32;
748         }
749         crypto_blake2b_general(digest + out, digest_size - (32 * r),
750                                0, 0, // no key
751                                digest + in , 64);
752     }
753 }
754 
755 #define LSB(x) ((x) & 0xffffffff)
756 #define G(a, b, c, d)                                            \
757     a += b + 2 * LSB(a) * LSB(b);  d ^= a;  d = rotr64(d, 32);   \
758     c += d + 2 * LSB(c) * LSB(d);  b ^= c;  b = rotr64(b, 24);   \
759     a += b + 2 * LSB(a) * LSB(b);  d ^= a;  d = rotr64(d, 16);   \
760     c += d + 2 * LSB(c) * LSB(d);  b ^= c;  b = rotr64(b, 63)
761 #define ROUND(v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7,    \
762               v8,  v9, v10, v11, v12, v13, v14, v15)    \
763     G(v0, v4,  v8, v12);  G(v1, v5,  v9, v13);          \
764     G(v2, v6, v10, v14);  G(v3, v7, v11, v15);          \
765     G(v0, v5, v10, v15);  G(v1, v6, v11, v12);          \
766     G(v2, v7,  v8, v13);  G(v3, v4,  v9, v14)
767 
768 // Core of the compression function G.  Computes Z from R in place.
769 static void g_rounds(block *work_block)
770 {
771     // column rounds (work_block = Q)
772     for (int i = 0; i < 128; i += 16) {
773         ROUND(work_block->a[i     ], work_block->a[i +  1],
774               work_block->a[i +  2], work_block->a[i +  3],
775               work_block->a[i +  4], work_block->a[i +  5],
776               work_block->a[i +  6], work_block->a[i +  7],
777               work_block->a[i +  8], work_block->a[i +  9],
778               work_block->a[i + 10], work_block->a[i + 11],
779               work_block->a[i + 12], work_block->a[i + 13],
780               work_block->a[i + 14], work_block->a[i + 15]);
781     }
782     // row rounds (work_block = Z)
783     for (int i = 0; i < 16; i += 2) {
784         ROUND(work_block->a[i      ], work_block->a[i +   1],
785               work_block->a[i +  16], work_block->a[i +  17],
786               work_block->a[i +  32], work_block->a[i +  33],
787               work_block->a[i +  48], work_block->a[i +  49],
788               work_block->a[i +  64], work_block->a[i +  65],
789               work_block->a[i +  80], work_block->a[i +  81],
790               work_block->a[i +  96], work_block->a[i +  97],
791               work_block->a[i + 112], work_block->a[i + 113]);
792     }
793 }
794 
795 // The compression function G (copy version for the first pass)
796 static void g_copy(block *result, const block *x, const block *y, block* tmp)
797 {
798     copy_block(tmp   , x  ); // tmp    = X
799     xor_block (tmp   , y  ); // tmp    = X ^ Y = R
800     copy_block(result, tmp); // result = R         (only difference with g_xor)
801     g_rounds  (tmp);         // tmp    = Z
802     xor_block (result, tmp); // result = R ^ Z
803 }
804 
805 // The compression function G (xor version for subsequent passes)
806 static void g_xor(block *result, const block *x, const block *y, block *tmp)
807 {
808     copy_block(tmp   , x  ); // tmp    = X
809     xor_block (tmp   , y  ); // tmp    = X ^ Y = R
810     xor_block (result, tmp); // result = R ^ old   (only difference with g_copy)
811     g_rounds  (tmp);         // tmp    = Z
812     xor_block (result, tmp); // result = R ^ old ^ Z
813 }
814 
815 // Unary version of the compression function.
816 // The missing argument is implied zero.
817 // Does the transformation in place.
818 static void unary_g(block *work_block, block *tmp)
819 {
820     // work_block == R
821     copy_block(tmp, work_block); // tmp        = R
822     g_rounds  (work_block);      // work_block = Z
823     xor_block (work_block, tmp); // work_block = Z ^ R
824 }
825 
826 // Argon2i uses a kind of stream cipher to determine which reference
827 // block it will take to synthesise the next block.  This context holds
828 // that stream's state.  (It's very similar to Chacha20.  The block b
829 // is analogous to Chacha's own pool)
830 typedef struct {
831     block b;
832     u32 pass_number;
833     u32 slice_number;
834     u32 nb_blocks;
835     u32 nb_iterations;
836     u32 ctr;
837     u32 offset;
838 } gidx_ctx;
839 
840 // The block in the context will determine array indices. To avoid
841 // timing attacks, it only depends on public information.  No looking
842 // at a previous block to seed the next.  This makes offline attacks
843 // easier, but timing attacks are the bigger threat in many settings.
844 static void gidx_refresh(gidx_ctx *ctx)
845 {
846     // seed the beginning of the block...
847     ctx->b.a[0] = ctx->pass_number;
848     ctx->b.a[1] = 0;  // lane number (we have only one)
849     ctx->b.a[2] = ctx->slice_number;
850     ctx->b.a[3] = ctx->nb_blocks;
851     ctx->b.a[4] = ctx->nb_iterations;
852     ctx->b.a[5] = 1;  // type: Argon2i
853     ctx->b.a[6] = ctx->ctr;
854     ZERO(ctx->b.a + 7, 121); // ...then zero the rest out
855 
856     // Shuffle the block thus: ctx->b = G((G(ctx->b, zero)), zero)
857     // (G "square" function), to get cheap pseudo-random numbers.
858     block tmp;
859     unary_g(&ctx->b, &tmp);
860     unary_g(&ctx->b, &tmp);
861     wipe_block(&tmp);
862 }
863 
864 static void gidx_init(gidx_ctx *ctx,
865                       u32 pass_number, u32 slice_number,
866                       u32 nb_blocks,   u32 nb_iterations)
867 {
868     ctx->pass_number   = pass_number;
869     ctx->slice_number  = slice_number;
870     ctx->nb_blocks     = nb_blocks;
871     ctx->nb_iterations = nb_iterations;
872     ctx->ctr           = 0;
873 
874     // Offset from the beginning of the segment.  For the first slice
875     // of the first pass, we start at the *third* block, so the offset
876     // starts at 2, not 0.
877     if (pass_number != 0 || slice_number != 0) {
878         ctx->offset = 0;
879     } else {
880         ctx->offset = 2;
881         ctx->ctr++;         // Compensates for missed lazy creation
882         gidx_refresh(ctx);  // at the start of gidx_next()
883     }
884 }
885 
886 static u32 gidx_next(gidx_ctx *ctx)
887 {
888     // lazily creates the offset block we need
889     if ((ctx->offset & 127) == 0) {
890         ctx->ctr++;
891         gidx_refresh(ctx);
892     }
893     u32 index  = ctx->offset & 127; // save index  for current call
894     u32 offset = ctx->offset;       // save offset for current call
895     ctx->offset++;                  // update offset for next call
896 
897     // Computes the area size.
898     // Pass 0 : all already finished segments plus already constructed
899     //          blocks in this segment
900     // Pass 1+: 3 last segments plus already constructed
901     //          blocks in this segment.  THE SPEC SUGGESTS OTHERWISE.
902     //          I CONFORM TO THE REFERENCE IMPLEMENTATION.
903     int first_pass  = ctx->pass_number == 0;
904     u32 slice_size  = ctx->nb_blocks >> 2;
905     u32 nb_segments = first_pass ? ctx->slice_number : 3;
906     u32 area_size   = nb_segments * slice_size + offset - 1;
907 
908     // Computes the starting position of the reference area.
909     // CONTRARY TO WHAT THE SPEC SUGGESTS, IT STARTS AT THE
910     // NEXT SEGMENT, NOT THE NEXT BLOCK.
911     u32 next_slice = ((ctx->slice_number + 1) & 3) * slice_size;
912     u32 start_pos  = first_pass ? 0 : next_slice;
913 
914     // Generate offset from J1 (no need for J2, there's only one lane)
915     u64 j1  = ctx->b.a[index] & 0xffffffff; // pseudo-random number
916     u64 x   = (j1 * j1)       >> 32;
917     u64 y   = (area_size * x) >> 32;
918     u64 z   = (area_size - 1) - y;
919     u64 ref = start_pos + z;                // ref < 2 * nb_blocks
920     return (u32)(ref < ctx->nb_blocks ? ref : ref - ctx->nb_blocks);
921 }
922 
923 // Main algorithm
924 void crypto_argon2i_general(u8       *hash,      u32 hash_size,
925                             void     *work_area, u32 nb_blocks,
926                             u32 nb_iterations,
927                             const u8 *password,  u32 password_size,
928                             const u8 *salt,      u32 salt_size,
929                             const u8 *key,       u32 key_size,
930                             const u8 *ad,        u32 ad_size)
931 {
932     // work area seen as blocks (must be suitably aligned)
933     block *blocks = (block*)work_area;
934     {
935         crypto_blake2b_ctx ctx;
936         crypto_blake2b_init(&ctx);
937 
938         blake_update_32      (&ctx, 1            ); // p: number of threads
939         blake_update_32      (&ctx, hash_size    );
940         blake_update_32      (&ctx, nb_blocks    );
941         blake_update_32      (&ctx, nb_iterations);
942         blake_update_32      (&ctx, 0x13         ); // v: version number
943         blake_update_32      (&ctx, 1            ); // y: Argon2i
944         blake_update_32      (&ctx,           password_size);
945         crypto_blake2b_update(&ctx, password, password_size);
946         blake_update_32      (&ctx,           salt_size);
947         crypto_blake2b_update(&ctx, salt,     salt_size);
948         blake_update_32      (&ctx,           key_size);
949         crypto_blake2b_update(&ctx, key,      key_size);
950         blake_update_32      (&ctx,           ad_size);
951         crypto_blake2b_update(&ctx, ad,       ad_size);
952 
953         u8 initial_hash[72]; // 64 bytes plus 2 words for future hashes
954         crypto_blake2b_final(&ctx, initial_hash);
955 
956         // fill first 2 blocks
957         block tmp_block;
958         u8    hash_area[1024];
959         store32_le(initial_hash + 64, 0); // first  additional word
960         store32_le(initial_hash + 68, 0); // second additional word
961         extended_hash(hash_area, 1024, initial_hash, 72);
962         load_block(&tmp_block, hash_area);
963         copy_block(blocks, &tmp_block);
964 
965         store32_le(initial_hash + 64, 1); // slight modification
966         extended_hash(hash_area, 1024, initial_hash, 72);
967         load_block(&tmp_block, hash_area);
968         copy_block(blocks + 1, &tmp_block);
969 
970         WIPE_BUFFER(initial_hash);
971         WIPE_BUFFER(hash_area);
972         wipe_block(&tmp_block);
973     }
974 
975     // Actual number of blocks
976     nb_blocks -= nb_blocks & 3; // round down to 4 p (p == 1 thread)
977     const u32 segment_size = nb_blocks >> 2;
978 
979     // fill (then re-fill) the rest of the blocks
980     block tmp;
981     gidx_ctx ctx; // public information, no need to wipe
982     FOR_T (u32, pass_number, 0, nb_iterations) {
983         int first_pass = pass_number == 0;
984 
985         FOR_T (u32, segment, 0, 4) {
986             gidx_init(&ctx, pass_number, segment, nb_blocks, nb_iterations);
987 
988             // On the first segment of the first pass,
989             // blocks 0 and 1 are already filled.
990             // We use the offset to skip them.
991             u32 start_offset  = first_pass && segment == 0 ? 2 : 0;
992             u32 segment_start = segment * segment_size + start_offset;
993             u32 segment_end   = (segment + 1) * segment_size;
994             FOR_T (u32, current_block, segment_start, segment_end) {
995                 u32 reference_block = gidx_next(&ctx);
996                 u32 previous_block  = current_block == 0
997                                     ? nb_blocks - 1
998                                     : current_block - 1;
999                 block *c = blocks + current_block;
1000                 block *p = blocks + previous_block;
1001                 block *r = blocks + reference_block;
1002                 if (first_pass) { g_copy(c, p, r, &tmp); }
1003                 else            { g_xor (c, p, r, &tmp); }
1004             }
1005         }
1006     }
1007     wipe_block(&tmp);
1008     u8 final_block[1024];
1009     store_block(final_block, blocks + (nb_blocks - 1));
1010 
1011     // wipe work area
1012     volatile u64 *p = (u64*)work_area;
1013     ZERO(p, 128 * nb_blocks);
1014 
1015     // hash the very last block with H' into the output hash
1016     extended_hash(hash, hash_size, final_block, 1024);
1017     WIPE_BUFFER(final_block);
1018 }
1019 
1020 void crypto_argon2i(u8   *hash,      u32 hash_size,
1021                     void *work_area, u32 nb_blocks, u32 nb_iterations,
1022                     const u8 *password,  u32 password_size,
1023                     const u8 *salt,      u32 salt_size)
1024 {
1025     crypto_argon2i_general(hash, hash_size, work_area, nb_blocks, nb_iterations,
1026                            password, password_size, salt , salt_size, 0,0,0,0);
1027 }
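// Usage sketch (illustrative): the caller provides the work area, which must
// hold nb_blocks * 1024 bytes and be aligned for u64 (malloc is sufficient):
//
//     void *work_area = malloc((size_t)nb_blocks * 1024);
//     if (work_area != 0) {
//         crypto_argon2i(hash, 32, work_area, nb_blocks, nb_iterations,
//                        password, password_size, salt, salt_size);
//         free(work_area);
//     }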
1028 
1029 ////////////////////////////////////
1030 /// Arithmetic modulo 2^255 - 19 ///
1031 ////////////////////////////////////
1032 //  Originally taken from SUPERCOP's ref10 implementation.
1033 //  A bit bigger than TweetNaCl, over 4 times faster.
1034 
1035 // field element
1036 typedef i32 fe[10];
1037 
1038 // field constants
1039 //
1040 // fe_one      : 1
1041 // sqrtm1      : sqrt(-1)
1042 // d           :     -121665 / 121666
1043 // D2          : 2 * -121665 / 121666
1044 // lop_x, lop_y: low order point in Edwards coordinates
1045 // ufactor     : -sqrt(-1) * 2
1046 // A2          : 486662^2  (A squared)
1047 static const fe fe_one  = {1};
1048 static const fe sqrtm1  = {-32595792, -7943725, 9377950, 3500415, 12389472,
1049                            -272473, -25146209, -2005654, 326686, 11406482,};
1050 static const fe d       = {-10913610, 13857413, -15372611, 6949391, 114729,
1051                            -8787816, -6275908, -3247719, -18696448, -12055116,};
1052 static const fe D2      = {-21827239, -5839606, -30745221, 13898782, 229458,
1053                            15978800, -12551817, -6495438, 29715968, 9444199,};
1054 static const fe lop_x   = {21352778, 5345713, 4660180, -8347857, 24143090,
1055                            14568123, 30185756, -12247770, -33528939, 8345319,};
1056 static const fe lop_y   = {-6952922, -1265500, 6862341, -7057498, -4037696,
1057                            -5447722, 31680899, -15325402, -19365852, 1569102,};
1058 static const fe ufactor = {-1917299, 15887451, -18755900, -7000830, -24778944,
1059                            544946, -16816446, 4011309, -653372, 10741468,};
1060 static const fe A2      = {12721188, 3529, 0, 0, 0, 0, 0, 0, 0, 0,};
1061 
1062 static void fe_0(fe h) {           ZERO(h  , 10); }
1063 static void fe_1(fe h) { h[0] = 1; ZERO(h+1,  9); }
1064 
1065 static void fe_copy(fe h,const fe f           ){FOR(i,0,10) h[i] =  f[i];      }
1066 static void fe_neg (fe h,const fe f           ){FOR(i,0,10) h[i] = -f[i];      }
1067 static void fe_add (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] + g[i];}
1068 static void fe_sub (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] - g[i];}
1069 
1070 static void fe_cswap(fe f, fe g, int b)
1071 {
1072     i32 mask = -b; // -1 = 0xffffffff
1073     FOR (i, 0, 10) {
1074         i32 x = (f[i] ^ g[i]) & mask;
1075         f[i] = f[i] ^ x;
1076         g[i] = g[i] ^ x;
1077     }
1078 }
1079 
1080 static void fe_ccopy(fe f, const fe g, int b)
1081 {
1082     i32 mask = -b; // -1 = 0xffffffff
1083     FOR (i, 0, 10) {
1084         i32 x = (f[i] ^ g[i]) & mask;
1085         f[i] = f[i] ^ x;
1086     }
1087 }
1088 
1089 
1090 // Signed carry propagation
1091 // ------------------------
1092 //
1093 // Let t be a number.  It can be uniquely decomposed thus:
1094 //
1095 //    t = h*2^26 + l
1096 //    such that -2^25 <= l < 2^25
1097 //
1098 // Let c = (t + 2^25) / 2^26            (rounded down)
1099 //     c = (h*2^26 + l + 2^25) / 2^26   (rounded down)
1100 //     c =  h   +   (l + 2^25) / 2^26   (rounded down)
1101 //     c =  h                           (exactly)
1102 // Because 0 <= l + 2^25 < 2^26
1103 //
1104 // Let u = t          - c*2^26
1105 //     u = h*2^26 + l - h*2^26
1106 //     u = l
1107 // Therefore, -2^25 <= u < 2^25
1108 //
1109 // Additionally, if |t| < x, then |h| < x/2^26 (rounded down)
1110 //
1111 // Notations:
1112 // - In C, 1<<25 means 2^25.
1113 // - In C, x>>25 means floor(x / (2^25)).
1114 // - All of the above applies with 25 & 24 as well as 26 & 25.
1115 //
1116 //
1117 // Note on negative right shifts
1118 // -----------------------------
1119 //
1120 // In C, x >> n, where x is a negative integer, is implementation
1121 // defined.  In practice, all platforms do arithmetic shift, which is
1122 // equivalent to division by 2^26, rounded down.  Some compilers, like
1123 // GCC, even guarantee it.
1124 //
1125 // If we ever stumble upon a platform that does not propagate the sign
1126 // bit (we won't), visible failures will show at the slightest test, and
1127 // the signed shifts can be replaced by the following:
1128 //
1129 //     typedef struct { i64 x:39; } s25;
1130 //     typedef struct { i64 x:38; } s26;
1131 //     i64 shift25(i64 x) { s25 s; s.x = ((u64)x)>>25; return s.x; }
1132 //     i64 shift26(i64 x) { s26 s; s.x = ((u64)x)>>26; return s.x; }
1133 //
1134 // Current compilers cannot optimise this, causing a 30% drop in
1135 // performance.  Fairly expensive for something that never happens.
1136 //
1137 //
1138 // Precondition
1139 // ------------
1140 //
1141 // |t0|       < 2^63
1142 // |t1|..|t9| < 2^62
1143 //
1144 // Algorithm
1145 // ---------
1146 // c   = t0 + 2^25 / 2^26   -- |c|  <= 2^36
1147 // t0 -= c * 2^26           -- |t0| <= 2^25
1148 // t1 += c                  -- |t1| <= 2^63
1149 //
1150 // c   = t4 + 2^25 / 2^26   -- |c|  <= 2^36
1151 // t4 -= c * 2^26           -- |t4| <= 2^25
1152 // t5 += c                  -- |t5| <= 2^63
1153 //
1154 // c   = t1 + 2^24 / 2^25   -- |c|  <= 2^38
1155 // t1 -= c * 2^25           -- |t1| <= 2^24
1156 // t2 += c                  -- |t2| <= 2^63
1157 //
1158 // c   = t5 + 2^24 / 2^25   -- |c|  <= 2^38
1159 // t5 -= c * 2^25           -- |t5| <= 2^24
1160 // t6 += c                  -- |t6| <= 2^63
1161 //
1162 // c   = t2 + 2^25 / 2^26   -- |c|  <= 2^37
1163 // t2 -= c * 2^26           -- |t2| <= 2^25        < 1.1 * 2^25  (final t2)
1164 // t3 += c                  -- |t3| <= 2^63
1165 //
1166 // c   = t6 + 2^25 / 2^26   -- |c|  <= 2^37
1167 // t6 -= c * 2^26           -- |t6| <= 2^25        < 1.1 * 2^25  (final t6)
1168 // t7 += c                  -- |t7| <= 2^63
1169 //
1170 // c   = t3 + 2^24 / 2^25   -- |c|  <= 2^38
1171 // t3 -= c * 2^25           -- |t3| <= 2^24        < 1.1 * 2^24  (final t3)
1172 // t4 += c                  -- |t4| <= 2^25 + 2^38 < 2^39
1173 //
1174 // c   = t7 + 2^24 / 2^25   -- |c|  <= 2^38
1175 // t7 -= c * 2^25           -- |t7| <= 2^24        < 1.1 * 2^24  (final t7)
1176 // t8 += c                  -- |t8| <= 2^63
1177 //
1178 // c   = t4 + 2^25 / 2^26   -- |c|  <= 2^13
1179 // t4 -= c * 2^26           -- |t4| <= 2^25        < 1.1 * 2^25  (final t4)
1180 // t5 += c                  -- |t5| <= 2^24 + 2^13 < 1.1 * 2^24  (final t5)
1181 //
1182 // c   = t8 + 2^25 / 2^26   -- |c|  <= 2^37
1183 // t8 -= c * 2^26           -- |t8| <= 2^25        < 1.1 * 2^25  (final t8)
1184 // t9 += c                  -- |t9| <= 2^63
1185 //
1186 // c   = t9 + 2^24 / 2^25   -- |c|  <= 2^38
1187 // t9 -= c * 2^25           -- |t9| <= 2^24        < 1.1 * 2^24  (final t9)
1188 // t0 += c * 19             -- |t0| <= 2^25 + 2^38*19 < 2^44
1189 //
1190 // c   = t0 + 2^25 / 2^26   -- |c|  <= 2^18
1191 // t0 -= c * 2^26           -- |t0| <= 2^25        < 1.1 * 2^25  (final t0)
1192 // t1 += c                  -- |t1| <= 2^24 + 2^18 < 1.1 * 2^24  (final t1)
1193 //
1194 // Postcondition
1195 // -------------
1196 //   |t0|, |t2|, |t4|, |t6|, |t8|  <  1.1 * 2^25
1197 //   |t1|, |t3|, |t5|, |t7|, |t9|  <  1.1 * 2^24
1198 #define FE_CARRY                                                        \
1199     i64 c;                                                              \
1200     c = (t0 + ((i64)1<<25)) >> 26;  t0 -= c * ((i64)1 << 26);  t1 += c; \
1201     c = (t4 + ((i64)1<<25)) >> 26;  t4 -= c * ((i64)1 << 26);  t5 += c; \
1202     c = (t1 + ((i64)1<<24)) >> 25;  t1 -= c * ((i64)1 << 25);  t2 += c; \
1203     c = (t5 + ((i64)1<<24)) >> 25;  t5 -= c * ((i64)1 << 25);  t6 += c; \
1204     c = (t2 + ((i64)1<<25)) >> 26;  t2 -= c * ((i64)1 << 26);  t3 += c; \
1205     c = (t6 + ((i64)1<<25)) >> 26;  t6 -= c * ((i64)1 << 26);  t7 += c; \
1206     c = (t3 + ((i64)1<<24)) >> 25;  t3 -= c * ((i64)1 << 25);  t4 += c; \
1207     c = (t7 + ((i64)1<<24)) >> 25;  t7 -= c * ((i64)1 << 25);  t8 += c; \
1208     c = (t4 + ((i64)1<<25)) >> 26;  t4 -= c * ((i64)1 << 26);  t5 += c; \
1209     c = (t8 + ((i64)1<<25)) >> 26;  t8 -= c * ((i64)1 << 26);  t9 += c; \
1210     c = (t9 + ((i64)1<<24)) >> 25;  t9 -= c * ((i64)1 << 25);  t0 += c * 19; \
1211     c = (t0 + ((i64)1<<25)) >> 26;  t0 -= c * ((i64)1 << 26);  t1 += c; \
1212     h[0]=(i32)t0;  h[1]=(i32)t1;  h[2]=(i32)t2;  h[3]=(i32)t3;  h[4]=(i32)t4; \
1213     h[5]=(i32)t5;  h[6]=(i32)t6;  h[7]=(i32)t7;  h[8]=(i32)t8;  h[9]=(i32)t9
1214 
1215 static void fe_frombytes(fe h, const u8 s[32])
1216 {
1217     i64 t0 =  load32_le(s);                        // t0 < 2^32
1218     i64 t1 =  load24_le(s +  4) << 6;              // t1 < 2^30
1219     i64 t2 =  load24_le(s +  7) << 5;              // t2 < 2^29
1220     i64 t3 =  load24_le(s + 10) << 3;              // t3 < 2^27
1221     i64 t4 =  load24_le(s + 13) << 2;              // t4 < 2^26
1222     i64 t5 =  load32_le(s + 16);                   // t5 < 2^32
1223     i64 t6 =  load24_le(s + 20) << 7;              // t6 < 2^31
1224     i64 t7 =  load24_le(s + 23) << 5;              // t7 < 2^29
1225     i64 t8 =  load24_le(s + 26) << 4;              // t8 < 2^28
1226     i64 t9 = (load24_le(s + 29) & 0x7fffff) << 2;  // t9 < 2^25
1227     FE_CARRY;                                      // Carry precondition OK
1228 }
1229 
1230 // Precondition
1231 //   |h[0]|, |h[2]|, |h[4]|, |h[6]|, |h[8]|  <  1.1 * 2^25
1232 //   |h[1]|, |h[3]|, |h[5]|, |h[7]|, |h[9]|  <  1.1 * 2^24
1233 //
1234 // Therefore, |h| < 2^255-19
1235 // There are two possibilities:
1236 //
1237 // - If h is positive, all we need to do is reduce its individual
1238 //   limbs down to their tight positive range.
1239 // - If h is negative, we also need to add 2^255-19 to it.
1240 //   Or just remove 19 and chop off any excess bit.
1241 static void fe_tobytes(u8 s[32], const fe h)
1242 {
1243     i32 t[10];
1244     COPY(t, h, 10);
1245     i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25;
1246     //                 |t9|                    < 1.1 * 2^24
1247     //  -1.1 * 2^24  <  t9                     < 1.1 * 2^24
1248     //  -21  * 2^24  <  19 * t9                < 21  * 2^24
1249     //  -2^29        <  19 * t9 + 2^24         < 2^29
1250     //  -2^29 / 2^25 < (19 * t9 + 2^24) / 2^25 < 2^29 / 2^25
1251     //  -16          < (19 * t9 + 2^24) / 2^25 < 16
1252     FOR (i, 0, 5) {
1253         q += t[2*i  ]; q >>= 26; // q = 0 or -1
1254         q += t[2*i+1]; q >>= 25; // q = 0 or -1
1255     }
1256     // q =  0 iff h >= 0
1257     // q = -1 iff h <  0
1258     // Adding q * 19 to h reduces h to its proper range.
1259     q *= 19;  // Shift carry back to the beginning
1260     FOR (i, 0, 5) {
1261         t[i*2  ] += q;  q = t[i*2  ] >> 26;  t[i*2  ] -= q * ((i32)1 << 26);
1262         t[i*2+1] += q;  q = t[i*2+1] >> 25;  t[i*2+1] -= q * ((i32)1 << 25);
1263     }
1264     // h is now fully reduced, and q represents the excess bit.
1265 
1266     store32_le(s +  0, ((u32)t[0] >>  0) | ((u32)t[1] << 26));
1267     store32_le(s +  4, ((u32)t[1] >>  6) | ((u32)t[2] << 19));
1268     store32_le(s +  8, ((u32)t[2] >> 13) | ((u32)t[3] << 13));
1269     store32_le(s + 12, ((u32)t[3] >> 19) | ((u32)t[4] <<  6));
1270     store32_le(s + 16, ((u32)t[5] >>  0) | ((u32)t[6] << 25));
1271     store32_le(s + 20, ((u32)t[6] >>  7) | ((u32)t[7] << 19));
1272     store32_le(s + 24, ((u32)t[7] >> 13) | ((u32)t[8] << 12));
1273     store32_le(s + 28, ((u32)t[8] >> 20) | ((u32)t[9] <<  6));
1274 
1275     WIPE_BUFFER(t);
1276 }
1277 
1278 // Precondition
1279 // -------------
1280 //   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
1281 //   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
//
//   |g|  <  2^31  (g is an ordinary i32 factor, not a field element)
static void fe_mul_small(fe h, const fe f, i32 g)
1286 {
1287     i64 t0 = f[0] * (i64) g;  i64 t1 = f[1] * (i64) g;
1288     i64 t2 = f[2] * (i64) g;  i64 t3 = f[3] * (i64) g;
1289     i64 t4 = f[4] * (i64) g;  i64 t5 = f[5] * (i64) g;
1290     i64 t6 = f[6] * (i64) g;  i64 t7 = f[7] * (i64) g;
1291     i64 t8 = f[8] * (i64) g;  i64 t9 = f[9] * (i64) g;
1292     // |t0|, |t2|, |t4|, |t6|, |t8|  <  1.65 * 2^26 * 2^31  < 2^58
1293     // |t1|, |t3|, |t5|, |t7|, |t9|  <  1.65 * 2^25 * 2^31  < 2^57
1294 
1295     FE_CARRY; // Carry precondition OK
1296 }
1297 
1298 // Precondition
1299 // -------------
1300 //   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
1301 //   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
1302 //
1303 //   |g0|, |g2|, |g4|, |g6|, |g8|  <  1.65 * 2^26
1304 //   |g1|, |g3|, |g5|, |g7|, |g9|  <  1.65 * 2^25
static void fe_mul(fe h, const fe f, const fe g)
1306 {
1307     // Everything is unrolled and put in temporary variables.
1308     // We could roll the loop, but that would make curve25519 twice as slow.
1309     i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
1310     i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
1311     i32 g0 = g[0]; i32 g1 = g[1]; i32 g2 = g[2]; i32 g3 = g[3]; i32 g4 = g[4];
1312     i32 g5 = g[5]; i32 g6 = g[6]; i32 g7 = g[7]; i32 g8 = g[8]; i32 g9 = g[9];
1313     i32 F1 = f1*2; i32 F3 = f3*2; i32 F5 = f5*2; i32 F7 = f7*2; i32 F9 = f9*2;
1314     i32 G1 = g1*19;  i32 G2 = g2*19;  i32 G3 = g3*19;
1315     i32 G4 = g4*19;  i32 G5 = g5*19;  i32 G6 = g6*19;
1316     i32 G7 = g7*19;  i32 G8 = g8*19;  i32 G9 = g9*19;
1317     // |F1|, |F3|, |F5|, |F7|, |F9|  <  1.65 * 2^26
1318     // |G0|, |G2|, |G4|, |G6|, |G8|  <  2^31
1319     // |G1|, |G3|, |G5|, |G7|, |G9|  <  2^30
1320 
1321     i64 t0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6
1322         +    F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1;
1323     i64 t1 = f0*(i64)g1 + f1*(i64)g0 + f2*(i64)G9 + f3*(i64)G8 + f4*(i64)G7
1324         +    f5*(i64)G6 + f6*(i64)G5 + f7*(i64)G4 + f8*(i64)G3 + f9*(i64)G2;
1325     i64 t2 = f0*(i64)g2 + F1*(i64)g1 + f2*(i64)g0 + F3*(i64)G9 + f4*(i64)G8
1326         +    F5*(i64)G7 + f6*(i64)G6 + F7*(i64)G5 + f8*(i64)G4 + F9*(i64)G3;
1327     i64 t3 = f0*(i64)g3 + f1*(i64)g2 + f2*(i64)g1 + f3*(i64)g0 + f4*(i64)G9
1328         +    f5*(i64)G8 + f6*(i64)G7 + f7*(i64)G6 + f8*(i64)G5 + f9*(i64)G4;
1329     i64 t4 = f0*(i64)g4 + F1*(i64)g3 + f2*(i64)g2 + F3*(i64)g1 + f4*(i64)g0
1330         +    F5*(i64)G9 + f6*(i64)G8 + F7*(i64)G7 + f8*(i64)G6 + F9*(i64)G5;
1331     i64 t5 = f0*(i64)g5 + f1*(i64)g4 + f2*(i64)g3 + f3*(i64)g2 + f4*(i64)g1
1332         +    f5*(i64)g0 + f6*(i64)G9 + f7*(i64)G8 + f8*(i64)G7 + f9*(i64)G6;
1333     i64 t6 = f0*(i64)g6 + F1*(i64)g5 + f2*(i64)g4 + F3*(i64)g3 + f4*(i64)g2
1334         +    F5*(i64)g1 + f6*(i64)g0 + F7*(i64)G9 + f8*(i64)G8 + F9*(i64)G7;
1335     i64 t7 = f0*(i64)g7 + f1*(i64)g6 + f2*(i64)g5 + f3*(i64)g4 + f4*(i64)g3
1336         +    f5*(i64)g2 + f6*(i64)g1 + f7*(i64)g0 + f8*(i64)G9 + f9*(i64)G8;
1337     i64 t8 = f0*(i64)g8 + F1*(i64)g7 + f2*(i64)g6 + F3*(i64)g5 + f4*(i64)g4
1338         +    F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9;
1339     i64 t9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5
1340         +    f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0;
1341     // t0 < 0.67 * 2^61
1342     // t1 < 0.41 * 2^61
1343     // t2 < 0.52 * 2^61
1344     // t3 < 0.32 * 2^61
1345     // t4 < 0.38 * 2^61
1346     // t5 < 0.22 * 2^61
1347     // t6 < 0.23 * 2^61
1348     // t7 < 0.13 * 2^61
1349     // t8 < 0.09 * 2^61
1350     // t9 < 0.03 * 2^61
1351 
1352     FE_CARRY; // Everything below 2^62, Carry precondition OK
1353 }
1354 
1355 // Precondition
1356 // -------------
1357 //   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
1358 //   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
1359 //
1360 // Note: we could use fe_mul() for this, but this is significantly faster
static void fe_sq(fe h, const fe f)
1362 {
1363     i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
1364     i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
1365     i32 f0_2  = f0*2;   i32 f1_2  = f1*2;   i32 f2_2  = f2*2;   i32 f3_2 = f3*2;
1366     i32 f4_2  = f4*2;   i32 f5_2  = f5*2;   i32 f6_2  = f6*2;   i32 f7_2 = f7*2;
1367     i32 f5_38 = f5*38;  i32 f6_19 = f6*19;  i32 f7_38 = f7*38;
1368     i32 f8_19 = f8*19;  i32 f9_38 = f9*38;
1369     // |f0_2| , |f2_2| , |f4_2| , |f6_2| , |f8_2|  <  1.65 * 2^27
1370     // |f1_2| , |f3_2| , |f5_2| , |f7_2| , |f9_2|  <  1.65 * 2^26
1371     // |f5_38|, |f6_19|, |f7_38|, |f8_19|, |f9_38| <  2^31
1372 
1373     i64 t0 = f0  *(i64)f0    + f1_2*(i64)f9_38 + f2_2*(i64)f8_19
1374         +    f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5  *(i64)f5_38;
1375     i64 t1 = f0_2*(i64)f1    + f2  *(i64)f9_38 + f3_2*(i64)f8_19
1376         +    f4  *(i64)f7_38 + f5_2*(i64)f6_19;
1377     i64 t2 = f0_2*(i64)f2    + f1_2*(i64)f1    + f3_2*(i64)f9_38
1378         +    f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6  *(i64)f6_19;
1379     i64 t3 = f0_2*(i64)f3    + f1_2*(i64)f2    + f4  *(i64)f9_38
1380         +    f5_2*(i64)f8_19 + f6  *(i64)f7_38;
1381     i64 t4 = f0_2*(i64)f4    + f1_2*(i64)f3_2  + f2  *(i64)f2
1382         +    f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7  *(i64)f7_38;
1383     i64 t5 = f0_2*(i64)f5    + f1_2*(i64)f4    + f2_2*(i64)f3
1384         +    f6  *(i64)f9_38 + f7_2*(i64)f8_19;
1385     i64 t6 = f0_2*(i64)f6    + f1_2*(i64)f5_2  + f2_2*(i64)f4
1386         +    f3_2*(i64)f3    + f7_2*(i64)f9_38 + f8  *(i64)f8_19;
1387     i64 t7 = f0_2*(i64)f7    + f1_2*(i64)f6    + f2_2*(i64)f5
1388         +    f3_2*(i64)f4    + f8  *(i64)f9_38;
1389     i64 t8 = f0_2*(i64)f8    + f1_2*(i64)f7_2  + f2_2*(i64)f6
1390         +    f3_2*(i64)f5_2  + f4  *(i64)f4    + f9  *(i64)f9_38;
1391     i64 t9 = f0_2*(i64)f9    + f1_2*(i64)f8    + f2_2*(i64)f7
1392         +    f3_2*(i64)f6    + f4  *(i64)f5_2;
1393     // t0 < 0.67 * 2^61
1394     // t1 < 0.41 * 2^61
1395     // t2 < 0.52 * 2^61
1396     // t3 < 0.32 * 2^61
1397     // t4 < 0.38 * 2^61
1398     // t5 < 0.22 * 2^61
1399     // t6 < 0.23 * 2^61
1400     // t7 < 0.13 * 2^61
1401     // t8 < 0.09 * 2^61
1402     // t9 < 0.03 * 2^61
1403 
1404     FE_CARRY;
1405 }
1406 
1407 // h = 2 * (f^2)
1408 //
1409 // Precondition
1410 // -------------
1411 //   |f0|, |f2|, |f4|, |f6|, |f8|  <  1.65 * 2^26
1412 //   |f1|, |f3|, |f5|, |f7|, |f9|  <  1.65 * 2^25
1413 //
1414 // Note: we could implement fe_sq2() by copying fe_sq(), multiplying
// each limb by 2, *then* performing the carry.  This saves one carry.
1416 // However, doing so with the stated preconditions does not work (t2
1417 // would overflow).  There are 3 ways to solve this:
1418 //
1419 // 1. Show that t2 actually never overflows (it really does not).
// 2. Accept an additional carry, at a small loss of performance.
1421 // 3. Make sure the input of fe_sq2() is freshly carried.
1422 //
1423 // SUPERCOP ref10 relies on (1).
1424 // Monocypher chose (2) and (3), mostly to save code.
static void fe_sq2(fe h, const fe f)
1426 {
1427     fe_sq(h, f);
1428     fe_mul_small(h, h, 2);
1429 }
1430 
1431 // This could be simplified, but it would be slower
static void fe_pow22523(fe out, const fe z)
1433 {
1434     fe t0, t1, t2;
1435     fe_sq(t0, z);
1436     fe_sq(t1,t0);                   fe_sq(t1, t1);  fe_mul(t1, z, t1);
1437     fe_mul(t0, t0, t1);
1438     fe_sq(t0, t0);                                  fe_mul(t0, t1, t0);
1439     fe_sq(t1, t0);  FOR (i, 1,   5) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
1440     fe_sq(t1, t0);  FOR (i, 1,  10) fe_sq(t1, t1);  fe_mul(t1, t1, t0);
1441     fe_sq(t2, t1);  FOR (i, 1,  20) fe_sq(t2, t2);  fe_mul(t1, t2, t1);
1442     fe_sq(t1, t1);  FOR (i, 1,  10) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
1443     fe_sq(t1, t0);  FOR (i, 1,  50) fe_sq(t1, t1);  fe_mul(t1, t1, t0);
1444     fe_sq(t2, t1);  FOR (i, 1, 100) fe_sq(t2, t2);  fe_mul(t1, t2, t1);
1445     fe_sq(t1, t1);  FOR (i, 1,  50) fe_sq(t1, t1);  fe_mul(t0, t1, t0);
1446     fe_sq(t0, t0);  FOR (i, 1,   2) fe_sq(t0, t0);  fe_mul(out, t0, z);
1447     WIPE_BUFFER(t0);
1448     WIPE_BUFFER(t1);
1449     WIPE_BUFFER(t2);
1450 }
1451 
// Inversion by Fermat's little theorem: 1/z = z^(p-2), and
// p - 2 = 2^255 - 21 = (2^252 - 3) * 8 + 3
// So we reuse the multiplication chain of fe_pow22523
static void fe_invert(fe out, const fe z)
1456 {
1457     fe tmp;
1458     fe_pow22523(tmp, z);
    // out = tmp^8 * z^3  (i.e. z^(2^255 - 21) = 1/z)
1460     fe_sq(tmp, tmp);                        // 0
1461     fe_sq(tmp, tmp);  fe_mul(tmp, tmp, z);  // 1
1462     fe_sq(tmp, tmp);  fe_mul(out, tmp, z);  // 1
1463     WIPE_BUFFER(tmp);
1464 }
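
// Typical use of fe_invert(): normalising projective coordinates back to
// affine ones.  Both scalarmult() and ge_tobytes() below compute x = X / Z
// as fe_invert() of Z followed by fe_mul() with X.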
1465 
1466 //  Parity check.  Returns 0 if even, 1 if odd
static int fe_isodd(const fe f)
1468 {
1469     u8 s[32];
1470     fe_tobytes(s, f);
1471     u8 isodd = s[0] & 1;
1472     WIPE_BUFFER(s);
1473     return isodd;
1474 }
1475 
1476 // Returns 1 if equal, 0 if not equal
static int fe_isequal(const fe f, const fe g)
1478 {
1479     u8 fs[32];
1480     u8 gs[32];
1481     fe_tobytes(fs, f);
1482     fe_tobytes(gs, g);
1483     int isdifferent = crypto_verify32(fs, gs);
1484     WIPE_BUFFER(fs);
1485     WIPE_BUFFER(gs);
1486     return 1 + isdifferent;
1487 }
1488 
1489 // Inverse square root.
// Returns true if x is a non-zero square, false otherwise.
1491 // After the call:
1492 //   isr = sqrt(1/x)        if x is non-zero square.
1493 //   isr = sqrt(sqrt(-1)/x) if x is not a square.
1494 //   isr = 0                if x is zero.
1495 // We do not guarantee the sign of the square root.
1496 //
1497 // Notes:
1498 // Let quartic = x^((p-1)/4)
1499 //
1500 // x^((p-1)/2) = chi(x)
1501 // quartic^2   = chi(x)
1502 // quartic     = sqrt(chi(x))
1503 // quartic     = 1 or -1 or sqrt(-1) or -sqrt(-1)
1504 //
1505 // Note that x is a square if quartic is 1 or -1
1506 // There are 4 cases to consider:
1507 //
1508 // if   quartic         = 1  (x is a square)
1509 // then x^((p-1)/4)     = 1
1510 //      x^((p-5)/4) * x = 1
1511 //      x^((p-5)/4)     = 1/x
1512 //      x^((p-5)/8)     = sqrt(1/x) or -sqrt(1/x)
1513 //
1514 // if   quartic                = -1  (x is a square)
1515 // then x^((p-1)/4)            = -1
1516 //      x^((p-5)/4) * x        = -1
1517 //      x^((p-5)/4)            = -1/x
1518 //      x^((p-5)/8)            = sqrt(-1)   / sqrt(x)
1519 //      x^((p-5)/8) * sqrt(-1) = sqrt(-1)^2 / sqrt(x)
1520 //      x^((p-5)/8) * sqrt(-1) = -1/sqrt(x)
1521 //      x^((p-5)/8) * sqrt(-1) = -sqrt(1/x) or sqrt(1/x)
1522 //
1523 // if   quartic         = sqrt(-1)  (x is not a square)
1524 // then x^((p-1)/4)     = sqrt(-1)
1525 //      x^((p-5)/4) * x = sqrt(-1)
1526 //      x^((p-5)/4)     = sqrt(-1)/x
1527 //      x^((p-5)/8)     = sqrt(sqrt(-1)/x) or -sqrt(sqrt(-1)/x)
1528 //
1529 // Note that the product of two non-squares is always a square:
1530 //   For any non-squares a and b, chi(a) = -1 and chi(b) = -1.
1531 //   Since chi(x) = x^((p-1)/2), chi(a)*chi(b) = chi(a*b) = 1.
1532 //   Therefore a*b is a square.
1533 //
1534 //   Since sqrt(-1) and x are both non-squares, their product is a
1535 //   square, and we can compute their square root.
1536 //
1537 // if   quartic                = -sqrt(-1)  (x is not a square)
1538 // then x^((p-1)/4)            = -sqrt(-1)
1539 //      x^((p-5)/4) * x        = -sqrt(-1)
1540 //      x^((p-5)/4)            = -sqrt(-1)/x
1541 //      x^((p-5)/8)            = sqrt(-sqrt(-1)/x)
1542 //      x^((p-5)/8)            = sqrt( sqrt(-1)/x) * sqrt(-1)
1543 //      x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * sqrt(-1)^2
1544 //      x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * -1
1545 //      x^((p-5)/8) * sqrt(-1) = -sqrt(sqrt(-1)/x) or sqrt(sqrt(-1)/x)
static int invsqrt(fe isr, const fe x)
1547 {
1548     fe check, quartic;
1549     fe_copy(check, x);
1550     fe_pow22523(isr, check);
1551     fe_sq (quartic, isr);
1552     fe_mul(quartic, quartic, check);
1553     fe_1  (check);          int p1 = fe_isequal(quartic, check);
1554     fe_neg(check, check );  int m1 = fe_isequal(quartic, check);
1555     fe_neg(check, sqrtm1);  int ms = fe_isequal(quartic, check);
1556     fe_mul(check, isr, sqrtm1);
1557     fe_ccopy(isr, check, m1 | ms);
1558     WIPE_BUFFER(quartic);
1559     WIPE_BUFFER(check);
1560     return p1 | m1;
1561 }
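
// Note: when x is a non-zero square, a square root of x itself can be
// recovered as x * isr (isr = 1/sqrt(x), so x * isr = sqrt(x)).
// ge_frombytes_vartime() below uses the same idea with
// num * invsqrt(num * den), which yields sqrt(num / den) up to sign.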
1562 
// Trim (clamp) a scalar for scalar multiplication: clear the low 3 bits
// (the cofactor) and the top bit, then set bit 254.
static void trim_scalar(u8 scalar[32])
1565 {
1566     scalar[ 0] &= 248;
1567     scalar[31] &= 127;
1568     scalar[31] |= 64;
1569 }
1570 
1571 // get bit from scalar at position i
static int scalar_bit(const u8 s[32], int i)
1573 {
1574     if (i < 0) { return 0; } // handle -1 for sliding windows
1575     return (s[i>>3] >> (i&7)) & 1;
1576 }
1577 
1578 ///////////////
1579 /// X-25519 /// Taken from SUPERCOP's ref10 implementation.
1580 ///////////////
static void scalarmult(u8 q[32], const u8 scalar[32], const u8 p[32],
                       int nb_bits)
1583 {
1584     // computes the scalar product
1585     fe x1;
1586     fe_frombytes(x1, p);
1587 
1588     // computes the actual scalar product (the result is in x2 and z2)
1589     fe x2, z2, x3, z3, t0, t1;
1590     // Montgomery ladder
1591     // In projective coordinates, to avoid divisions: x = X / Z
1592     // We don't care about the y coordinate, it's only 1 bit of information
1593     fe_1(x2);        fe_0(z2); // "zero" point
1594     fe_copy(x3, x1); fe_1(z3); // "one"  point
1595     int swap = 0;
1596     for (int pos = nb_bits-1; pos >= 0; --pos) {
1597         // constant time conditional swap before ladder step
1598         int b = scalar_bit(scalar, pos);
1599         swap ^= b; // xor trick avoids swapping at the end of the loop
1600         fe_cswap(x2, x3, swap);
1601         fe_cswap(z2, z3, swap);
1602         swap = b;  // anticipates one last swap after the loop
1603 
1604         // Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3)
1605         // with differential addition
1606         fe_sub(t0, x3, z3);
1607         fe_sub(t1, x2, z2);
1608         fe_add(x2, x2, z2);
1609         fe_add(z2, x3, z3);
1610         fe_mul(z3, t0, x2);
1611         fe_mul(z2, z2, t1);
1612         fe_sq (t0, t1    );
1613         fe_sq (t1, x2    );
1614         fe_add(x3, z3, z2);
1615         fe_sub(z2, z3, z2);
1616         fe_mul(x2, t1, t0);
1617         fe_sub(t1, t1, t0);
1618         fe_sq (z2, z2    );
1619         fe_mul_small(z3, t1, 121666);
1620         fe_sq (x3, x3    );
1621         fe_add(t0, t0, z3);
1622         fe_mul(z3, x1, z2);
1623         fe_mul(z2, t1, t0);
1624     }
1625     // last swap is necessary to compensate for the xor trick
1626     // Note: after this swap, P3 == P2 + P1.
1627     fe_cswap(x2, x3, swap);
1628     fe_cswap(z2, z3, swap);
1629 
1630     // normalises the coordinates: x == X / Z
1631     fe_invert(z2, z2);
1632     fe_mul(x2, x2, z2);
1633     fe_tobytes(q, x2);
1634 
1635     WIPE_BUFFER(x1);
1636     WIPE_BUFFER(x2);  WIPE_BUFFER(z2);  WIPE_BUFFER(t0);
1637     WIPE_BUFFER(x3);  WIPE_BUFFER(z3);  WIPE_BUFFER(t1);
1638 }
1639 
void crypto_x25519(u8       raw_shared_secret[32],
                   const u8 your_secret_key  [32],
                   const u8 their_public_key [32])
1643 {
1644     // restrict the possible scalar values
1645     u8 e[32];
1646     COPY(e, your_secret_key, 32);
1647     trim_scalar(e);
1648     scalarmult(raw_shared_secret, e, their_public_key, 255);
1649     WIPE_BUFFER(e);
1650 }
1651 
void crypto_x25519_public_key(u8       public_key[32],
                              const u8 secret_key[32])
1654 {
1655     static const u8 base_point[32] = {9};
1656     crypto_x25519(public_key, secret_key, base_point);
1657 }
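
// Usage sketch (illustrative only, not part of the library): X25519 key
// exchange between two parties.  The names below are hypothetical, and the
// secret keys are assumed to be 32 random bytes from a CSPRNG (not shown).
void example_x25519_exchange(u8       shared_alice[32],
                             u8       shared_bob  [32],
                             const u8 alice_secret[32],
                             const u8 bob_secret  [32])
{
    u8 alice_public[32];
    u8 bob_public  [32];
    crypto_x25519_public_key(alice_public, alice_secret);
    crypto_x25519_public_key(bob_public  , bob_secret  );
    // Each party combines its own secret key with the other's public key;
    // both end up with the same raw shared secret (which should normally
    // be hashed before use).
    crypto_x25519(shared_alice, alice_secret, bob_public  );
    crypto_x25519(shared_bob  , bob_secret  , alice_public);
}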
1658 
1659 ///////////////////////////
1660 /// Arithmetic modulo L ///
1661 ///////////////////////////
1662 static const u32 L[8] = {0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
1663                          0x00000000, 0x00000000, 0x00000000, 0x10000000,};
1664 
1665 //  p = a*b + p
static void multiply(u32 p[16], const u32 a[8], const u32 b[8])
1667 {
1668     FOR (i, 0, 8) {
1669         u64 carry = 0;
1670         FOR (j, 0, 8) {
1671             carry  += p[i+j] + (u64)a[i] * b[j];
1672             p[i+j]  = (u32)carry;
1673             carry >>= 32;
1674         }
1675         p[i+8] = (u32)carry;
1676     }
1677 }
1678 
static int is_above_l(const u32 x[8])
1680 {
1681     // We work with L directly, in a 2's complement encoding
1682     // (-L == ~L + 1)
1683     u64 carry = 1;
1684     FOR (i, 0, 8) {
1685         carry += (u64)x[i] + ~L[i];
1686         carry >>= 32;
1687     }
1688     return carry;
1689 }
1690 
1691 // Final reduction modulo L, by conditionally removing L.
// if x < L       , then r = x
// if L <= x < 2*L, then r = x - L
// otherwise the result will be wrong
static void remove_l(u32 r[8], const u32 x[8])
1696 {
1697     u64 carry = is_above_l(x);
1698     u32 mask  = ~(u32)carry + 1; // carry == 0 or 1
1699     FOR (i, 0, 8) {
1700         carry += (u64)x[i] + (~L[i] & mask);
1701         r[i]   = (u32)carry;
1702         carry >>= 32;
1703     }
1704 }
1705 
1706 // Full reduction modulo L (Barrett reduction)
static void mod_l(u8 reduced[32], const u32 x[16])
1708 {
1709     static const u32 r[9] = {0x0a2c131b,0xed9ce5a3,0x086329a7,0x2106215d,
1710                              0xffffffeb,0xffffffff,0xffffffff,0xffffffff,0xf,};
1711     // xr = x * r
1712     u32 xr[25] = {0};
1713     FOR (i, 0, 9) {
1714         u64 carry = 0;
1715         FOR (j, 0, 16) {
1716             carry  += xr[i+j] + (u64)r[i] * x[j];
1717             xr[i+j] = (u32)carry;
1718             carry >>= 32;
1719         }
1720         xr[i+16] = (u32)carry;
1721     }
1722     // xr = floor(xr / 2^512) * L
1723     // Since the result is guaranteed to be below 2*L,
1724     // it is enough to only compute the first 256 bits.
    // The division by 2^512 is implicit: we simply read xr[i+16]
    // (skipping the 16 low limbs: 16 * 32 = 512 bits).
1726     ZERO(xr, 8);
1727     FOR (i, 0, 8) {
1728         u64 carry = 0;
1729         FOR (j, 0, 8-i) {
1730             carry   += xr[i+j] + (u64)xr[i+16] * L[j];
1731             xr[i+j] = (u32)carry;
1732             carry >>= 32;
1733         }
1734     }
1735     // xr = x - xr
1736     u64 carry = 1;
1737     FOR (i, 0, 8) {
1738         carry  += (u64)x[i] + ~xr[i];
1739         xr[i]   = (u32)carry;
1740         carry >>= 32;
1741     }
1742     // Final reduction modulo L (conditional subtraction)
1743     remove_l(xr, xr);
1744     store32_le_buf(reduced, xr, 8);
1745 
1746     WIPE_BUFFER(xr);
1747 }
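
// Sketch of the Barrett reduction above, assuming the precomputed constant
// r is (approximately) floor(2^512 / L):
//   q = floor(x * r / 2^512)    -- read off as xr[i+16]
//   x - q*L is below 2*L        -- so the low 256 bits are enough
//   remove_l() then performs the final conditional subtraction.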
1748 
static void reduce(u8 r[64])
1750 {
1751     u32 x[16];
1752     load32_le_buf(x, r, 16);
1753     mod_l(r, x);
1754     WIPE_BUFFER(x);
1755 }
1756 
1757 // r = (a * b) + c
static void mul_add(u8 r[32], const u8 a[32], const u8 b[32], const u8 c[32])
1759 {
1760     u32 A[8];  load32_le_buf(A, a, 8);
1761     u32 B[8];  load32_le_buf(B, b, 8);
1762     u32 p[16];
1763     load32_le_buf(p, c, 8);
1764     ZERO(p + 8, 8);
1765     multiply(p, A, B);
1766     mod_l(r, p);
1767     WIPE_BUFFER(p);
1768     WIPE_BUFFER(A);
1769     WIPE_BUFFER(B);
1770 }
1771 
1772 ///////////////
1773 /// Ed25519 ///
1774 ///////////////
1775 
1776 // Point (group element, ge) in a twisted Edwards curve,
1777 // in extended projective coordinates.
1778 // ge        : x  = X/Z, y  = Y/Z, T  = XY/Z
1779 // ge_cached : Yp = X+Y, Ym = X-Y, T2 = T*D2
1780 // ge_precomp: Z  = 1
1781 typedef struct { fe X;  fe Y;  fe Z; fe T;  } ge;
1782 typedef struct { fe Yp; fe Ym; fe Z; fe T2; } ge_cached;
1783 typedef struct { fe Yp; fe Ym;       fe T2; } ge_precomp;
1784 
static void ge_zero(ge *p)
1786 {
1787     fe_0(p->X);
1788     fe_1(p->Y);
1789     fe_1(p->Z);
1790     fe_0(p->T);
1791 }
1792 
static void ge_tobytes(u8 s[32], const ge *h)
1794 {
1795     fe recip, x, y;
1796     fe_invert(recip, h->Z);
1797     fe_mul(x, h->X, recip);
1798     fe_mul(y, h->Y, recip);
1799     fe_tobytes(s, y);
1800     s[31] ^= fe_isodd(x) << 7;
1801 
1802     WIPE_BUFFER(recip);
1803     WIPE_BUFFER(x);
1804     WIPE_BUFFER(y);
1805 }
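
// The encoding above is standard Ed25519 point compression: the
// little-endian y coordinate in the low 255 bits, with the parity ("sign")
// of x stored in the most significant bit of the last byte.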
1806 
1807 // h = s, where s is a point encoded in 32 bytes
1808 //
1809 // Variable time!  Inputs must not be secret!
1810 // => Use only to *check* signatures.
1811 //
1812 // From the specifications:
1813 //   The encoding of s contains y and the sign of x
1814 //   x = sqrt((y^2 - 1) / (d*y^2 + 1))
1815 // In extended coordinates:
1816 //   X = x, Y = y, Z = 1, T = x*y
1817 //
1818 //    Note that num * den is a square iff num / den is a square
1819 //    If num * den is not a square, the point was not on the curve.
1820 // From the above:
1821 //   Let num =   y^2 - 1
1822 //   Let den = d*y^2 + 1
1823 //   x = sqrt((y^2 - 1) / (d*y^2 + 1))
1824 //   x = sqrt(num / den)
1825 //   x = sqrt(num^2 / (num * den))
1826 //   x = num * sqrt(1 / (num * den))
1827 //
1828 // Therefore, we can just compute:
1829 //   num =   y^2 - 1
1830 //   den = d*y^2 + 1
1831 //   isr = invsqrt(num * den)  // abort if not square
1832 //   x   = num * isr
1833 // Finally, negate x if its sign is not as specified.
static int ge_frombytes_vartime(ge *h, const u8 s[32])
1835 {
1836     fe_frombytes(h->Y, s);
1837     fe_1(h->Z);
1838     fe_sq (h->T, h->Y);        // t =   y^2
1839     fe_mul(h->X, h->T, d   );  // x = d*y^2
1840     fe_sub(h->T, h->T, h->Z);  // t =   y^2 - 1
1841     fe_add(h->X, h->X, h->Z);  // x = d*y^2 + 1
1842     fe_mul(h->X, h->T, h->X);  // x = (y^2 - 1) * (d*y^2 + 1)
1843     int is_square = invsqrt(h->X, h->X);
1844     if (!is_square) {
1845         return -1;             // Not on the curve, abort
1846     }
1847     fe_mul(h->X, h->T, h->X);  // x = sqrt((y^2 - 1) / (d*y^2 + 1))
1848     if (fe_isodd(h->X) != (s[31] >> 7)) {
1849         fe_neg(h->X, h->X);
1850     }
1851     fe_mul(h->T, h->X, h->Y);
1852     return 0;
1853 }
1854 
static void ge_cache(ge_cached *c, const ge *p)
1856 {
1857     fe_add (c->Yp, p->Y, p->X);
1858     fe_sub (c->Ym, p->Y, p->X);
1859     fe_copy(c->Z , p->Z      );
1860     fe_mul (c->T2, p->T, D2  );
1861 }
1862 
1863 // Internal buffers are not wiped! Inputs must not be secret!
1864 // => Use only to *check* signatures.
static void ge_add(ge *s, const ge *p, const ge_cached *q)
1866 {
1867     fe a, b;
1868     fe_add(a   , p->Y, p->X );
1869     fe_sub(b   , p->Y, p->X );
1870     fe_mul(a   , a   , q->Yp);
1871     fe_mul(b   , b   , q->Ym);
1872     fe_add(s->Y, a   , b    );
1873     fe_sub(s->X, a   , b    );
1874 
1875     fe_add(s->Z, p->Z, p->Z );
1876     fe_mul(s->Z, s->Z, q->Z );
1877     fe_mul(s->T, p->T, q->T2);
1878     fe_add(a   , s->Z, s->T );
1879     fe_sub(b   , s->Z, s->T );
1880 
1881     fe_mul(s->T, s->X, s->Y);
1882     fe_mul(s->X, s->X, b   );
1883     fe_mul(s->Y, s->Y, a   );
1884     fe_mul(s->Z, a   , b   );
1885 }
1886 
1887 // Internal buffers are not wiped! Inputs must not be secret!
1888 // => Use only to *check* signatures.
static void ge_sub(ge *s, const ge *p, const ge_cached *q)
1890 {
1891     ge_cached neg;
1892     fe_copy(neg.Ym, q->Yp);
1893     fe_copy(neg.Yp, q->Ym);
1894     fe_copy(neg.Z , q->Z );
1895     fe_neg (neg.T2, q->T2);
1896     ge_add(s, p, &neg);
1897 }
1898 
static void ge_madd(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
1900 {
1901     fe_add(a   , p->Y, p->X );
1902     fe_sub(b   , p->Y, p->X );
1903     fe_mul(a   , a   , q->Yp);
1904     fe_mul(b   , b   , q->Ym);
1905     fe_add(s->Y, a   , b    );
1906     fe_sub(s->X, a   , b    );
1907 
1908     fe_add(s->Z, p->Z, p->Z );
1909     fe_mul(s->T, p->T, q->T2);
1910     fe_add(a   , s->Z, s->T );
1911     fe_sub(b   , s->Z, s->T );
1912 
1913     fe_mul(s->T, s->X, s->Y);
1914     fe_mul(s->X, s->X, b   );
1915     fe_mul(s->Y, s->Y, a   );
1916     fe_mul(s->Z, a   , b   );
1917 }
1918 
static void ge_msub(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
1920 {
1921     fe_add(a   , p->Y, p->X );
1922     fe_sub(b   , p->Y, p->X );
1923     fe_mul(a   , a   , q->Ym);
1924     fe_mul(b   , b   , q->Yp);
1925     fe_add(s->Y, a   , b    );
1926     fe_sub(s->X, a   , b    );
1927 
1928     fe_add(s->Z, p->Z, p->Z );
1929     fe_mul(s->T, p->T, q->T2);
1930     fe_sub(a   , s->Z, s->T );
1931     fe_add(b   , s->Z, s->T );
1932 
1933     fe_mul(s->T, s->X, s->Y);
1934     fe_mul(s->X, s->X, b   );
1935     fe_mul(s->Y, s->Y, a   );
1936     fe_mul(s->Z, a   , b   );
1937 }
1938 
static void ge_double(ge *s, const ge *p, ge *q)
1940 {
1941     fe_sq (q->X, p->X);
1942     fe_sq (q->Y, p->Y);
1943     fe_sq2(q->Z, p->Z);
1944     fe_add(q->T, p->X, p->Y);
1945     fe_sq (s->T, q->T);
1946     fe_add(q->T, q->Y, q->X);
1947     fe_sub(q->Y, q->Y, q->X);
1948     fe_sub(q->X, s->T, q->T);
1949     fe_sub(q->Z, q->Z, q->Y);
1950 
1951     fe_mul(s->X, q->X , q->Z);
1952     fe_mul(s->Y, q->T , q->Y);
1953     fe_mul(s->Z, q->Y , q->Z);
1954     fe_mul(s->T, q->X , q->T);
1955 }
1956 
1957 // 5-bit signed window in cached format (Niels coordinates, Z=1)
1958 static const ge_precomp b_window[8] = {
1959     {{25967493,-14356035,29566456,3660896,-12694345,
1960       4014787,27544626,-11754271,-6079156,2047605,},
1961      {-12545711,934262,-2722910,3049990,-727428,
1962       9406986,12720692,5043384,19500929,-15469378,},
1963      {-8738181,4489570,9688441,-14785194,10184609,
1964       -12363380,29287919,11864899,-24514362,-4438546,},},
1965     {{15636291,-9688557,24204773,-7912398,616977,
1966       -16685262,27787600,-14772189,28944400,-1550024,},
1967      {16568933,4717097,-11556148,-1102322,15682896,
1968       -11807043,16354577,-11775962,7689662,11199574,},
1969      {30464156,-5976125,-11779434,-15670865,23220365,
1970       15915852,7512774,10017326,-17749093,-9920357,},},
1971     {{10861363,11473154,27284546,1981175,-30064349,
1972       12577861,32867885,14515107,-15438304,10819380,},
1973      {4708026,6336745,20377586,9066809,-11272109,
1974       6594696,-25653668,12483688,-12668491,5581306,},
1975      {19563160,16186464,-29386857,4097519,10237984,
1976       -4348115,28542350,13850243,-23678021,-15815942,},},
1977     {{5153746,9909285,1723747,-2777874,30523605,
1978       5516873,19480852,5230134,-23952439,-15175766,},
1979      {-30269007,-3463509,7665486,10083793,28475525,
1980       1649722,20654025,16520125,30598449,7715701,},
1981      {28881845,14381568,9657904,3680757,-20181635,
1982       7843316,-31400660,1370708,29794553,-1409300,},},
1983     {{-22518993,-6692182,14201702,-8745502,-23510406,
1984       8844726,18474211,-1361450,-13062696,13821877,},
1985      {-6455177,-7839871,3374702,-4740862,-27098617,
1986       -10571707,31655028,-7212327,18853322,-14220951,},
1987      {4566830,-12963868,-28974889,-12240689,-7602672,
1988       -2830569,-8514358,-10431137,2207753,-3209784,},},
1989     {{-25154831,-4185821,29681144,7868801,-6854661,
1990       -9423865,-12437364,-663000,-31111463,-16132436,},
1991      {25576264,-2703214,7349804,-11814844,16472782,
1992       9300885,3844789,15725684,171356,6466918,},
1993      {23103977,13316479,9739013,-16149481,817875,
1994       -15038942,8965339,-14088058,-30714912,16193877,},},
1995     {{-33521811,3180713,-2394130,14003687,-16903474,
1996       -16270840,17238398,4729455,-18074513,9256800,},
1997      {-25182317,-4174131,32336398,5036987,-21236817,
1998       11360617,22616405,9761698,-19827198,630305,},
1999      {-13720693,2639453,-24237460,-7406481,9494427,
2000       -5774029,-6554551,-15960994,-2449256,-14291300,},},
2001     {{-3151181,-5046075,9282714,6866145,-31907062,
2002       -863023,-18940575,15033784,25105118,-7894876,},
2003      {-24326370,15950226,-31801215,-14592823,-11662737,
2004       -5090925,1573892,-2625887,2198790,-15804619,},
2005      {-3099351,10324967,-2241613,7453183,-5446979,
2006       -2735503,-13812022,-16236442,-32461234,-12290683,},},
2007 };
2008 
2009 // Incremental sliding windows (left to right)
2010 // Based on Roberto Maria Avanzi[2005]
2011 typedef struct {
2012     i16 next_index; // position of the next signed digit
2013     i8  next_digit; // next signed digit (odd number below 2^window_width)
2014     u8  next_check; // point at which we must check for a new window
2015 } slide_ctx;
2016 
static void slide_init(slide_ctx *ctx, const u8 scalar[32])
2018 {
    // scalar is guaranteed to be below L, either because we checked (s),
    // or because we reduced it modulo L (h_ram). L is under 2^253, so
    // bits 253 to 255 are guaranteed to be zero. No need to test them.
2022     //
2023     // Note however that L is very close to 2^252, so bit 252 is almost
2024     // always zero.  If we were to start at bit 251, the tests wouldn't
2025     // catch the off-by-one error (constructing one that does would be
2026     // prohibitively expensive).
2027     //
2028     // We should still check bit 252, though.
2029     int i = 252;
2030     while (i > 0 && scalar_bit(scalar, i) == 0) {
2031         i--;
2032     }
2033     ctx->next_check = (u8)(i + 1);
2034     ctx->next_index = -1;
2035     ctx->next_digit = -1;
2036 }
2037 
static int slide_step(slide_ctx *ctx, int width, int i, const u8 scalar[32])
2039 {
2040     if (i == ctx->next_check) {
2041         if (scalar_bit(scalar, i) == scalar_bit(scalar, i - 1)) {
2042             ctx->next_check--;
2043         } else {
2044             // compute digit of next window
2045             int w = MIN(width, i + 1);
2046             int v = -(scalar_bit(scalar, i) << (w-1));
2047             FOR_T (int, j, 0, w-1) {
2048                 v += scalar_bit(scalar, i-(w-1)+j) << j;
2049             }
2050             v += scalar_bit(scalar, i-w);
2051             int lsb = v & (~v + 1);            // smallest bit of v
2052             int s   = (   ((lsb & 0xAA) != 0)  // log2(lsb)
2053                        | (((lsb & 0xCC) != 0) << 1)
2054                        | (((lsb & 0xF0) != 0) << 2));
2055             ctx->next_index  = (i16)(i-(w-1)+s);
2056             ctx->next_digit  = (i8) (v >> s   );
2057             ctx->next_check -= (u8) w;
2058         }
2059     }
2060     return i == ctx->next_index ? ctx->next_digit: 0;
2061 }
2062 
2063 #define P_W_WIDTH 3 // Affects the size of the stack
2064 #define B_W_WIDTH 5 // Affects the size of the binary
2065 #define P_W_SIZE  (1<<(P_W_WIDTH-2))
2066 
2067 // P = [b]B + [p]P, where B is the base point
2068 //
2069 // Variable time! Internal buffers are not wiped! Inputs must not be secret!
2070 // => Use only to *check* signatures.
static void ge_double_scalarmult_vartime(ge *P, const u8 p[32], const u8 b[32])
2072 {
    // cache the odd multiples of P for the additions below:
    // cP[i] = [2*i + 1]P  (P, 3P, 5P, ...)
2074     ge_cached cP[P_W_SIZE];
2075     {
2076         ge P2, tmp;
2077         ge_double(&P2, P, &tmp);
2078         ge_cache(&cP[0], P);
2079         FOR (i, 1, P_W_SIZE) {
2080             ge_add(&tmp, &P2, &cP[i-1]);
2081             ge_cache(&cP[i], &tmp);
2082         }
2083     }
2084 
2085     // Merged double and add ladder, fused with sliding
2086     slide_ctx p_slide;  slide_init(&p_slide, p);
2087     slide_ctx b_slide;  slide_init(&b_slide, b);
2088     int i = MAX(p_slide.next_check, b_slide.next_check);
2089     ge *sum = P;
2090     ge_zero(sum);
2091     while (i >= 0) {
2092         ge tmp;
2093         ge_double(sum, sum, &tmp);
2094         int p_digit = slide_step(&p_slide, P_W_WIDTH, i, p);
2095         int b_digit = slide_step(&b_slide, B_W_WIDTH, i, b);
2096         if (p_digit > 0) { ge_add(sum, sum, &cP[ p_digit / 2]); }
2097         if (p_digit < 0) { ge_sub(sum, sum, &cP[-p_digit / 2]); }
2098         fe t1, t2;
2099         if (b_digit > 0) { ge_madd(sum, sum, b_window +  b_digit/2, t1, t2); }
2100         if (b_digit < 0) { ge_msub(sum, sum, b_window + -b_digit/2, t1, t2); }
2101         i--;
2102     }
2103 }
2104 
// R_check = [s]B - [h_ram]pk, where B is the base point
2106 //
2107 // Variable time! Internal buffers are not wiped! Inputs must not be secret!
2108 // => Use only to *check* signatures.
static int ge_r_check(u8 R_check[32], u8 s[32], u8 h_ram[32], u8 pk[32])
2110 {
2111     ge  A;      // not secret, not wiped
2112     u32 s32[8]; // not secret, not wiped
2113     load32_le_buf(s32, s, 8);
2114     if (ge_frombytes_vartime(&A, pk) ||         // A = pk
2115         is_above_l(s32)) {                      // prevent s malleability
2116         return -1;
2117     }
2118     fe_neg(A.X, A.X);
2119     fe_neg(A.T, A.T);                           // A = -pk
2120     ge_double_scalarmult_vartime(&A, h_ram, s); // A = [s]B - [h_ram]pk
2121     ge_tobytes(R_check, &A);                    // R_check = A
2122     return 0;
2123 }
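
// This is the usual Ed25519 verification equation [s]B = R + [h_ram]A,
// rearranged as R = [s]B - [h_ram]A, so that crypto_check_final() only has
// to compare R_check against the R stored in the signature.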
2124 
2125 // 5-bit signed comb in cached format (Niels coordinates, Z=1)
2126 static const ge_precomp b_comb_low[8] = {
2127     {{-6816601,-2324159,-22559413,124364,18015490,
2128       8373481,19993724,1979872,-18549925,9085059,},
2129      {10306321,403248,14839893,9633706,8463310,
2130       -8354981,-14305673,14668847,26301366,2818560,},
2131      {-22701500,-3210264,-13831292,-2927732,-16326337,
2132       -14016360,12940910,177905,12165515,-2397893,},},
2133     {{-12282262,-7022066,9920413,-3064358,-32147467,
2134       2927790,22392436,-14852487,2719975,16402117,},
2135      {-7236961,-4729776,2685954,-6525055,-24242706,
2136       -15940211,-6238521,14082855,10047669,12228189,},
2137      {-30495588,-12893761,-11161261,3539405,-11502464,
2138       16491580,-27286798,-15030530,-7272871,-15934455,},},
2139     {{17650926,582297,-860412,-187745,-12072900,
2140       -10683391,-20352381,15557840,-31072141,-5019061,},
2141      {-6283632,-2259834,-4674247,-4598977,-4089240,
2142       12435688,-31278303,1060251,6256175,10480726,},
2143      {-13871026,2026300,-21928428,-2741605,-2406664,
2144       -8034988,7355518,15733500,-23379862,7489131,},},
2145     {{6883359,695140,23196907,9644202,-33430614,
2146       11354760,-20134606,6388313,-8263585,-8491918,},
2147      {-7716174,-13605463,-13646110,14757414,-19430591,
2148       -14967316,10359532,-11059670,-21935259,12082603,},
2149      {-11253345,-15943946,10046784,5414629,24840771,
2150       8086951,-6694742,9868723,15842692,-16224787,},},
2151     {{9639399,11810955,-24007778,-9320054,3912937,
2152       -9856959,996125,-8727907,-8919186,-14097242,},
2153      {7248867,14468564,25228636,-8795035,14346339,
2154       8224790,6388427,-7181107,6468218,-8720783,},
2155      {15513115,15439095,7342322,-10157390,18005294,
2156       -7265713,2186239,4884640,10826567,7135781,},},
2157     {{-14204238,5297536,-5862318,-6004934,28095835,
2158       4236101,-14203318,1958636,-16816875,3837147,},
2159      {-5511166,-13176782,-29588215,12339465,15325758,
2160       -15945770,-8813185,11075932,-19608050,-3776283,},
2161      {11728032,9603156,-4637821,-5304487,-7827751,
2162       2724948,31236191,-16760175,-7268616,14799772,},},
2163     {{-28842672,4840636,-12047946,-9101456,-1445464,
2164       381905,-30977094,-16523389,1290540,12798615,},
2165      {27246947,-10320914,14792098,-14518944,5302070,
2166       -8746152,-3403974,-4149637,-27061213,10749585,},
2167      {25572375,-6270368,-15353037,16037944,1146292,
2168       32198,23487090,9585613,24714571,-1418265,},},
2169     {{19844825,282124,-17583147,11004019,-32004269,
2170       -2716035,6105106,-1711007,-21010044,14338445,},
2171      {8027505,8191102,-18504907,-12335737,25173494,
2172       -5923905,15446145,7483684,-30440441,10009108,},
2173      {-14134701,-4174411,10246585,-14677495,33553567,
2174       -14012935,23366126,15080531,-7969992,7663473,},},
2175 };
2176 
2177 static const ge_precomp b_comb_high[8] = {
2178     {{33055887,-4431773,-521787,6654165,951411,
2179       -6266464,-5158124,6995613,-5397442,-6985227,},
2180      {4014062,6967095,-11977872,3960002,8001989,
2181       5130302,-2154812,-1899602,-31954493,-16173976,},
2182      {16271757,-9212948,23792794,731486,-25808309,
2183       -3546396,6964344,-4767590,10976593,10050757,},},
2184     {{2533007,-4288439,-24467768,-12387405,-13450051,
2185       14542280,12876301,13893535,15067764,8594792,},
2186      {20073501,-11623621,3165391,-13119866,13188608,
2187       -11540496,-10751437,-13482671,29588810,2197295,},
2188      {-1084082,11831693,6031797,14062724,14748428,
2189       -8159962,-20721760,11742548,31368706,13161200,},},
2190     {{2050412,-6457589,15321215,5273360,25484180,
2191       124590,-18187548,-7097255,-6691621,-14604792,},
2192      {9938196,2162889,-6158074,-1711248,4278932,
2193       -2598531,-22865792,-7168500,-24323168,11746309,},
2194      {-22691768,-14268164,5965485,9383325,20443693,
2195       5854192,28250679,-1381811,-10837134,13717818,},},
2196     {{-8495530,16382250,9548884,-4971523,-4491811,
2197       -3902147,6182256,-12832479,26628081,10395408,},
2198      {27329048,-15853735,7715764,8717446,-9215518,
2199       -14633480,28982250,-5668414,4227628,242148,},
2200      {-13279943,-7986904,-7100016,8764468,-27276630,
2201       3096719,29678419,-9141299,3906709,11265498,},},
2202     {{11918285,15686328,-17757323,-11217300,-27548967,
2203       4853165,-27168827,6807359,6871949,-1075745,},
2204      {-29002610,13984323,-27111812,-2713442,28107359,
2205       -13266203,6155126,15104658,3538727,-7513788,},
2206      {14103158,11233913,-33165269,9279850,31014152,
2207       4335090,-1827936,4590951,13960841,12787712,},},
2208     {{1469134,-16738009,33411928,13942824,8092558,
2209       -8778224,-11165065,1437842,22521552,-2792954,},
2210      {31352705,-4807352,-25327300,3962447,12541566,
2211       -9399651,-27425693,7964818,-23829869,5541287,},
2212      {-25732021,-6864887,23848984,3039395,-9147354,
2213       6022816,-27421653,10590137,25309915,-1584678,},},
2214     {{-22951376,5048948,31139401,-190316,-19542447,
2215       -626310,-17486305,-16511925,-18851313,-12985140,},
2216      {-9684890,14681754,30487568,7717771,-10829709,
2217       9630497,30290549,-10531496,-27798994,-13812825,},
2218      {5827835,16097107,-24501327,12094619,7413972,
2219       11447087,28057551,-1793987,-14056981,4359312,},},
2220     {{26323183,2342588,-21887793,-1623758,-6062284,
2221       2107090,-28724907,9036464,-19618351,-13055189,},
2222      {-29697200,14829398,-4596333,14220089,-30022969,
2223       2955645,12094100,-13693652,-5941445,7047569,},
2224      {-3201977,14413268,-12058324,-16417589,-9035655,
2225       -7224648,9258160,1399236,30397584,-5684634,},},
2226 };
2227 
static void lookup_add(ge *p, ge_precomp *tmp_c, fe tmp_a, fe tmp_b,
                       const ge_precomp comb[8], const u8 scalar[32], int i)
2230 {
2231     u8 teeth = (u8)((scalar_bit(scalar, i)          ) +
2232                     (scalar_bit(scalar, i + 32) << 1) +
2233                     (scalar_bit(scalar, i + 64) << 2) +
2234                     (scalar_bit(scalar, i + 96) << 3));
2235     u8 high  = teeth >> 3;
2236     u8 index = (teeth ^ (high - 1)) & 7;
2237     FOR (j, 0, 8) {
2238         i32 select = 1 & (((j ^ index) - 1) >> 8);
2239         fe_ccopy(tmp_c->Yp, comb[j].Yp, select);
2240         fe_ccopy(tmp_c->Ym, comb[j].Ym, select);
2241         fe_ccopy(tmp_c->T2, comb[j].T2, select);
2242     }
2243     fe_neg(tmp_a, tmp_c->T2);
2244     fe_cswap(tmp_c->T2, tmp_a    , high ^ 1);
2245     fe_cswap(tmp_c->Yp, tmp_c->Ym, high ^ 1);
2246     ge_madd(p, p, tmp_c, tmp_a, tmp_b);
2247 }
2248 
2249 // p = [scalar]B, where B is the base point
static void ge_scalarmult_base(ge *p, const u8 scalar[32])
2251 {
    // twin 4-bit signed combs, from Mike Hamburg's
2253     // Fast and compact elliptic-curve cryptography (2012)
2254     // 1 / 2 modulo L
2255     static const u8 half_mod_L[32] = {
2256         247,233,122,46,141,49,9,44,107,206,123,81,239,124,111,10,
2257         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8, };
2258     // (2^256 - 1) / 2 modulo L
2259     static const u8 half_ones[32] = {
2260         142,74,204,70,186,24,118,107,184,231,190,57,250,173,119,99,
2261         255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,7, };
2262 
    // All bits set form: 1 means 1, 0 means -1.
    // s_scalar = (scalar + (2^256 - 1)) / 2 (modulo L), so that
    // scalar == 2*s_scalar - (2^256 - 1) (modulo L): each bit of
    // s_scalar contributes +2^i or -2^i.
2264     u8 s_scalar[32];
2265     mul_add(s_scalar, scalar, half_mod_L, half_ones);
2266 
2267     // Double and add ladder
2268     fe tmp_a, tmp_b;  // temporaries for addition
2269     ge_precomp tmp_c; // temporary for comb lookup
2270     ge tmp_d;         // temporary for doubling
2271     fe_1(tmp_c.Yp);
2272     fe_1(tmp_c.Ym);
2273     fe_0(tmp_c.T2);
2274 
2275     // Save a double on the first iteration
2276     ge_zero(p);
2277     lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, 31);
2278     lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, 31+128);
2279     // Regular double & add for the rest
2280     for (int i = 30; i >= 0; i--) {
2281         ge_double(p, p, &tmp_d);
2282         lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, i);
2283         lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, i+128);
2284     }
    // Note: we could save one addition at the end if we assumed the
    // scalar fits in 252 bits, which it does in practice when it is
    // selected at random.  However, non-random, non-hashed scalars
    // *can* overflow 252 bits in practice.  Better to account for that
    // than to leave that kind of subtle corner case.
2290 
2291     WIPE_BUFFER(tmp_a);  WIPE_CTX(&tmp_d);
2292     WIPE_BUFFER(tmp_b);  WIPE_CTX(&tmp_c);
2293     WIPE_BUFFER(s_scalar);
2294 }
2295 
void crypto_sign_public_key_custom_hash(u8       public_key[32],
                                        const u8 secret_key[32],
                                        const crypto_sign_vtable *hash)
2299 {
2300     u8 a[64];
2301     hash->hash(a, secret_key, 32);
2302     trim_scalar(a);
2303     ge A;
2304     ge_scalarmult_base(&A, a);
2305     ge_tobytes(public_key, &A);
2306     WIPE_BUFFER(a);
2307     WIPE_CTX(&A);
2308 }
2309 
void crypto_sign_public_key(u8 public_key[32], const u8 secret_key[32])
2311 {
2312     crypto_sign_public_key_custom_hash(public_key, secret_key,
2313                                        &crypto_blake2b_vtable);
2314 }
2315 
void crypto_sign_init_first_pass_custom_hash(crypto_sign_ctx_abstract *ctx,
                                             const u8 secret_key[32],
                                             const u8 public_key[32],
                                             const crypto_sign_vtable *hash)
2320 {
2321     ctx->hash  = hash; // set vtable
2322     u8 *a      = ctx->buf;
2323     u8 *prefix = ctx->buf + 32;
2324     ctx->hash->hash(a, secret_key, 32);
2325     trim_scalar(a);
2326 
2327     if (public_key == 0) {
2328         crypto_sign_public_key_custom_hash(ctx->pk, secret_key, ctx->hash);
2329     } else {
2330         COPY(ctx->pk, public_key, 32);
2331     }
2332 
2333     // Deterministic part of EdDSA: Construct a nonce by hashing the message
2334     // instead of generating a random number.
2335     // An actual random number would work just fine, and would save us
2336     // the trouble of hashing the message twice.  If we did that
2337     // however, the user could fuck it up and reuse the nonce.
2338     ctx->hash->init  (ctx);
2339     ctx->hash->update(ctx, prefix , 32);
2340 }
2341 
void crypto_sign_init_first_pass(crypto_sign_ctx_abstract *ctx,
                                 const u8 secret_key[32],
                                 const u8 public_key[32])
2345 {
2346     crypto_sign_init_first_pass_custom_hash(ctx, secret_key, public_key,
2347                                             &crypto_blake2b_vtable);
2348 }
2349 
void crypto_sign_update(crypto_sign_ctx_abstract *ctx,
                        const u8 *msg, size_t msg_size)
2352 {
2353     ctx->hash->update(ctx, msg, msg_size);
2354 }
2355 
void crypto_sign_init_second_pass(crypto_sign_ctx_abstract *ctx)
2357 {
2358     u8 *r        = ctx->buf + 32;
2359     u8 *half_sig = ctx->buf + 64;
2360     ctx->hash->final(ctx, r);
2361     reduce(r);
2362 
2363     // first half of the signature = "random" nonce times the base point
2364     ge R;
2365     ge_scalarmult_base(&R, r);
2366     ge_tobytes(half_sig, &R);
2367     WIPE_CTX(&R);
2368 
2369     // Hash R, the public key, and the message together.
2370     // It cannot be done in parallel with the first hash.
2371     ctx->hash->init  (ctx);
2372     ctx->hash->update(ctx, half_sig, 32);
2373     ctx->hash->update(ctx, ctx->pk , 32);
2374 }
2375 
void crypto_sign_final(crypto_sign_ctx_abstract *ctx, u8 signature[64])
2377 {
2378     u8 *a        = ctx->buf;
2379     u8 *r        = ctx->buf + 32;
2380     u8 *half_sig = ctx->buf + 64;
2381     u8  h_ram[64];
2382     ctx->hash->final(ctx, h_ram);
2383     reduce(h_ram);
2384     COPY(signature, half_sig, 32);
2385     mul_add(signature + 32, h_ram, a, r); // s = h_ram * a + r
2386     WIPE_BUFFER(h_ram);
2387     crypto_wipe(ctx, ctx->hash->ctx_size);
2388 }
2389 
void crypto_sign(u8        signature[64],
                 const u8  secret_key[32],
                 const u8  public_key[32],
                 const u8 *message, size_t message_size)
2394 {
2395     crypto_sign_ctx ctx;
2396     crypto_sign_ctx_abstract *actx = (crypto_sign_ctx_abstract*)&ctx;
2397     crypto_sign_init_first_pass (actx, secret_key, public_key);
2398     crypto_sign_update          (actx, message, message_size);
2399     crypto_sign_init_second_pass(actx);
2400     crypto_sign_update          (actx, message, message_size);
2401     crypto_sign_final           (actx, signature);
2402 }
2403 
void crypto_check_init_custom_hash(crypto_check_ctx_abstract *ctx,
                                   const u8 signature[64],
                                   const u8 public_key[32],
                                   const crypto_sign_vtable *hash)
2408 {
2409     ctx->hash = hash; // set vtable
2410     COPY(ctx->buf, signature , 64);
2411     COPY(ctx->pk , public_key, 32);
2412     ctx->hash->init  (ctx);
2413     ctx->hash->update(ctx, signature , 32);
2414     ctx->hash->update(ctx, public_key, 32);
2415 }
2416 
void crypto_check_init(crypto_check_ctx_abstract *ctx, const u8 signature[64],
                       const u8 public_key[32])
2419 {
2420     crypto_check_init_custom_hash(ctx, signature, public_key,
2421                                   &crypto_blake2b_vtable);
2422 }
2423 
void crypto_check_update(crypto_check_ctx_abstract *ctx,
                         const u8 *msg, size_t msg_size)
2426 {
2427     ctx->hash->update(ctx, msg, msg_size);
2428 }
2429 
int crypto_check_final(crypto_check_ctx_abstract *ctx)
2431 {
2432     u8 h_ram[64];
2433     ctx->hash->final(ctx, h_ram);
2434     reduce(h_ram);
2435     u8 *R       = ctx->buf;      // R
2436     u8 *s       = ctx->buf + 32; // s
2437     u8 *R_check = ctx->pk;       // overwrite ctx->pk to save stack space
2438     if (ge_r_check(R_check, s, h_ram, ctx->pk)) {
2439         return -1;
2440     }
2441     return crypto_verify32(R, R_check); // R == R_check ? OK : fail
2442 }
2443 
int crypto_check(const u8  signature[64], const u8 public_key[32],
                 const u8 *message, size_t message_size)
2446 {
2447     crypto_check_ctx ctx;
2448     crypto_check_ctx_abstract *actx = (crypto_check_ctx_abstract*)&ctx;
2449     crypto_check_init  (actx, signature, public_key);
2450     crypto_check_update(actx, message, message_size);
2451     return crypto_check_final(actx);
2452 }
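
// Usage sketch (illustrative only, not part of the library): sign a
// message, then verify it.  Returns 0 when the signature checks out, -1
// otherwise.  The names are hypothetical and the secret key is assumed to
// be 32 random bytes from a CSPRNG (not shown).
int example_sign_then_check(const u8  secret_key[32],
                            const u8 *message, size_t message_size)
{
    u8 public_key[32];
    u8 signature [64];
    crypto_sign_public_key(public_key, secret_key);
    crypto_sign(signature, secret_key, public_key, message, message_size);
    return crypto_check(signature, public_key, message, message_size);
}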
2453 
2454 ///////////////////////
2455 /// EdDSA to X25519 ///
2456 ///////////////////////
void crypto_from_eddsa_private(u8 x25519[32], const u8 eddsa[32])
2458 {
2459     u8 a[64];
2460     crypto_blake2b(a, eddsa, 32);
2461     COPY(x25519, a, 32);
2462     WIPE_BUFFER(a);
2463 }
2464 
void crypto_from_eddsa_public(u8 x25519[32], const u8 eddsa[32])
2466 {
2467     fe t1, t2;
2468     fe_frombytes(t2, eddsa);
2469     fe_add(t1, fe_one, t2);
2470     fe_sub(t2, fe_one, t2);
2471     fe_invert(t2, t2);
2472     fe_mul(t1, t1, t2);
2473     fe_tobytes(x25519, t1);
2474     WIPE_BUFFER(t1);
2475     WIPE_BUFFER(t2);
2476 }
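
// Usage sketch (illustrative only, not part of the library): reuse an
// existing EdDSA identity for X25519 key exchange by converting both
// halves of the key pair.  The names are hypothetical.
void example_eddsa_to_x25519(u8       x_secret[32],
                             u8       x_public[32],
                             const u8 ed_secret[32],
                             const u8 ed_public[32])
{
    crypto_from_eddsa_private(x_secret, ed_secret);
    crypto_from_eddsa_public (x_public, ed_public);
}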
2477 
2478 /////////////////////////////////////////////
2479 /// Dirty ephemeral public key generation ///
2480 /////////////////////////////////////////////
2481 
// These functions generate a public key *without* clearing the
// cofactor.  Sending that key over the network leaks 3 bits of the
// private key.  Use them only to generate ephemeral keys that will be
// hidden with crypto_curve_to_hidden().
2486 //
2487 // The public key is otherwise compatible with crypto_x25519() and
2488 // crypto_key_exchange() (those properly clear the cofactor).
2489 //
// Note that the distribution of the resulting public keys is almost
// uniform.  Flipping the sign of the v coordinate (not provided by
// these functions) covers the entire key space almost perfectly, where
// "almost" means a 2^-128 bias (undetectable).  This uniformity is
// needed to ensure the proper randomness of the resulting
// representatives (once we apply crypto_curve_to_hidden()).
2496 //
2497 // Recall that Curve25519 has order C = 2^255 + e, with e < 2^128 (not
2498 // to be confused with the prime order of the main subgroup, L, which is
2499 // 8 times less than that).
2500 //
2501 // Generating all points would require us to multiply a point of order C
2502 // (the base point plus any point of order 8) by all scalars from 0 to
2503 // C-1.  Clamping limits us to scalars between 2^254 and 2^255 - 1. But
2504 // by negating the resulting point at random, we also cover scalars from
2505 // -2^255 + 1 to -2^254 (which modulo C is congruent to e+1 to 2^254 + e).
2506 //
2507 // In practice:
2508 // - Scalars from 0         to e + 1     are never generated
2509 // - Scalars from 2^255     to 2^255 + e are never generated
2510 // - Scalars from 2^254 + 1 to 2^254 + e are generated twice
2511 //
2512 // Since e < 2^128, detecting this bias requires observing over 2^100
2513 // representatives from a given source (this will never happen), *and*
2514 // recovering enough of the private key to determine that they do, or do
2515 // not, belong to the biased set (this practically requires solving
2516 // discrete logarithm, which is conjecturally intractable).
2517 //
2518 // In practice, this means the bias is impossible to detect.
2519 
// s += (x % 8) * L
// Guaranteed to fit in 256 bits if s fits in 255 bits.
2522 //   L             < 2^253
2523 //   x%8           < 2^3
2524 //   L * (x%8)     < 2^255
2525 //   s             < 2^255
2526 //   s + L * (x%8) < 2^256
static void add_xl(u8 s[32], u8 x)
2528 {
2529     u64 mod8  = x & 7;
2530     u64 carry = 0;
2531     FOR (i , 0, 8) {
2532         carry = carry + load32_le(s + 4*i) + L[i] * mod8;
2533         store32_le(s + 4*i, (u32)carry);
2534         carry >>= 32;
2535     }
2536 }
2537 
2538 // "Small" dirty ephemeral key.
2539 // Use if you need to shrink the size of the binary, and can afford to
2540 // slow down by a factor of two (compared to the fast version)
2541 //
2542 // This version works by decoupling the cofactor from the main factor.
2543 //
2544 // - The trimmed scalar determines the main factor
2545 // - The clamped bits of the scalar determine the cofactor.
2546 //
2547 // Cofactor and main factor are combined into a single scalar, which is
2548 // then multiplied by a point of order 8*L (unlike the base point, which
2549 // has prime order).  That "dirty" base point is the addition of the
2550 // regular base point (9), and a point of order 8.
void crypto_x25519_dirty_small(u8 public_key[32], const u8 secret_key[32])
2552 {
2553     // Base point of order 8*L
2554     // Raw scalar multiplication with it does not clear the cofactor,
2555     // and the resulting public key will reveal 3 bits of the scalar.
2556     static const u8 dirty_base_point[32] = {
2557         0x34, 0xfc, 0x6c, 0xb7, 0xc8, 0xde, 0x58, 0x97, 0x77, 0x70, 0xd9, 0x52,
2558         0x16, 0xcc, 0xdc, 0x6c, 0x85, 0x90, 0xbe, 0xcd, 0x91, 0x9c, 0x07, 0x59,
2559         0x94, 0x14, 0x56, 0x3b, 0x4b, 0xa4, 0x47, 0x0f, };
2560     // separate the main factor & the cofactor of the scalar
2561     u8 scalar[32];
2562     COPY(scalar, secret_key, 32);
2563     trim_scalar(scalar);
2564 
2565     // Separate the main factor and the cofactor
2566     //
2567     // The scalar is trimmed, so its cofactor is cleared.  The three
2568     // least significant bits however still have a main factor.  We must
2569     // remove it for X25519 compatibility.
2570     //
    // We exploit the fact that 5*L = 1 (modulo 8)
    // (indeed, L = 5 (modulo 8), and 5*5 = 25 = 1 (modulo 8)):
2572     //   cofactor = lsb * 5 * L             (modulo 8*L)
2573     //   combined = scalar + cofactor       (modulo 8*L)
2574     //   combined = scalar + (lsb * 5 * L)  (modulo 8*L)
2575     add_xl(scalar, secret_key[0] * 5);
2576     scalarmult(public_key, scalar, dirty_base_point, 256);
2577     WIPE_BUFFER(scalar);
2578 }

// "Fast" dirty ephemeral key
// We use this one by default.
//
// This version works by performing a regular scalar multiplication,
// then adding a low order point.  The scalar multiplication is done in
// Edwards space for more speed (roughly twice as fast as the "small"
// version).  The cost is a bigger binary for programs that don't also
// sign messages.
void crypto_x25519_dirty_fast(u8 public_key[32], const u8 secret_key[32])
{
    u8 scalar[32];
    ge pk;
    COPY(scalar, secret_key, 32);
    trim_scalar(scalar);
    ge_scalarmult_base(&pk, scalar);

    // Select low order point
    // We're computing the [cofactor]lop scalar multiplication, where:
    //   cofactor = secret_key[0] & 7.
    //   lop      = (lop_x, lop_y)
    //   lop_x    = sqrt((sqrt(d + 1) + 1) / d)
    //   lop_y    = -lop_x * sqrtm1
    // Notes:
    // - A (single) Montgomery ladder would be twice as slow.
    // - An actual scalar multiplication would hurt performance.
    // - A full table lookup would take more code.
    u8 cofactor = secret_key[0] & 7;
    int a = (cofactor >> 2) & 1;
    int b = (cofactor >> 1) & 1;
    int c = (cofactor >> 0) & 1;
    fe t1, t2, t3;
    fe_0(t1);
    fe_ccopy(t1, sqrtm1, b);
    fe_ccopy(t1, lop_x , c);
    fe_neg  (t3, t1);
    fe_ccopy(t1, t3, a);
    fe_1(t2);
    fe_0(t3);
    fe_ccopy(t2, t3   , b);
    fe_ccopy(t2, lop_y, c);
    fe_neg  (t3, t2);
    fe_ccopy(t2, t3, a^b);
    ge_precomp low_order_point;
    fe_add(low_order_point.Yp, t2, t1);
    fe_sub(low_order_point.Ym, t2, t1);
    fe_mul(low_order_point.T2, t2, t1);
    fe_mul(low_order_point.T2, low_order_point.T2, D2);

    // Add low order point to the public key
    ge_madd(&pk, &pk, &low_order_point, t1, t2);

    // Convert to Montgomery u coordinate (we ignore the sign)
    fe_add(t1, pk.Z, pk.Y);
    fe_sub(t2, pk.Z, pk.Y);
    fe_invert(t2, t2);
    fe_mul(t1, t1, t2);

    fe_tobytes(public_key, t1);

    WIPE_BUFFER(t1);  WIPE_BUFFER(scalar);
    WIPE_BUFFER(t2);  WIPE_CTX(&pk);
    WIPE_BUFFER(t3);  WIPE_CTX(&low_order_point);
}
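
// Example (sketch, not part of the library): "dirty" public keys remain
// compatible with regular X25519.  The peer's trimmed scalar is a multiple
// of 8, so the low order component added above vanishes during the peer's
// scalar multiplication and both sides agree on the shared secret.  Assumes
// alice_sk and bob_sk are filled by a CSPRNG; crypto_x25519_public_key()
// is defined elsewhere in this file.
//
//     u8 alice_sk[32], alice_pk[32];  // random secret key
//     u8 bob_sk  [32], bob_pk  [32];  // random secret key
//     crypto_x25519_dirty_fast(alice_pk, alice_sk); // covers the whole curve
//     crypto_x25519_public_key(bob_pk  , bob_sk);   // regular public key
//
//     u8 shared_a[32], shared_b[32];
//     crypto_x25519(shared_a, alice_sk, bob_pk);
//     crypto_x25519(shared_b, bob_sk  , alice_pk);  // shared_b == shared_a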

///////////////////
/// Elligator 2 ///
///////////////////
static const fe A = {486662};

// Elligator direct map
//
// Computes the point corresponding to a representative, encoded in 32
// bytes (little endian).  Since positive representatives fit in 254
// bits, the two most significant bits are ignored.
//
// From the paper:
// w = -A / (fe(1) + non_square * r^2)
// e = chi(w^3 + A*w^2 + w)
// u = e*w - (fe(1)-e)*(A//2)
// v = -e * sqrt(u^3 + A*u^2 + u)
//
// We ignore v because we don't need it for X25519 (the Montgomery
// ladder only uses u).
//
// Note that e is either 0, 1 or -1
// if e = 0    u = 0  and v = 0
// if e = 1    u = w
// if e = -1   u = -w - A = w * non_square * r^2
//
// Let r1 = non_square * r^2
// Let r2 = 1 + r1
// Note that r2 cannot be zero: -1/non_square is not a square.
// We can (tediously) verify that:
//   w^3 + A*w^2 + w = (A^2*r1 - r2^2) * A / r2^3
// Therefore:
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3))
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * 1
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * chi(r2^6)
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)  *     r2^6)
//   chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) *  A * r2^3)
// Corollary:
//   e =  1 if (A^2*r1 - r2^2) * A * r2^3 is a non-zero square
//   e = -1 if (A^2*r1 - r2^2) * A * r2^3 is not a square
//   Note that w^3 + A*w^2 + w (and therefore e) can never be zero:
//     w^3 + A*w^2 + w = w * (w^2 + A*w + 1)
//     w^3 + A*w^2 + w = w * (w^2 + A*w + A^2/4 - A^2/4 + 1)
//     w^3 + A*w^2 + w = w * ((w + A/2)^2 - A^2/4 + 1)
//     which is zero only if:
//       w = 0                   (impossible)
//       (w + A/2)^2 = A^2/4 - 1 (impossible, because A^2/4-1 is not a square)
//
// Let isr   = invsqrt((A^2*r1 - r2^2) *  A * r2^3)
//     isr   = sqrt(1        / ((A^2*r1 - r2^2) *  A * r2^3)) if e =  1
//     isr   = sqrt(sqrt(-1) / ((A^2*r1 - r2^2) *  A * r2^3)) if e = -1
//
// if e = 1
//   let u1 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2
//       u1 = w
//       u1 = u
//
// if e = -1
//   let ufactor = -non_square * sqrt(-1) * r^2
//   let vfactor = sqrt(ufactor)
//   let u2 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2 * ufactor
//       u2 = w * -1 * -non_square * r^2
//       u2 = w * non_square * r^2
//       u2 = u
void crypto_hidden_to_curve(uint8_t curve[32], const uint8_t hidden[32])
{
    // Representatives are encoded in 254 bits.
    // The two most significant ones are random padding that must be ignored.
    u8 clamped[32];
    COPY(clamped, hidden, 32);
    clamped[31] &= 0x3f;

    fe r, u, t1, t2, t3;
    fe_frombytes(r, clamped);
    fe_sq2(t1, r);
    fe_add(u, t1, fe_one);
    fe_sq (t2, u);
    fe_mul(t3, A2, t1);
    fe_sub(t3, t3, t2);
    fe_mul(t3, t3, A);
    fe_mul(t1, t2, u);
    fe_mul(t1, t3, t1);
    int is_square = invsqrt(t1, t1);
    fe_sq(u, r);
    fe_mul(u, u, ufactor);
    fe_ccopy(u, fe_one, is_square);
    fe_sq (t1, t1);
    fe_mul(u, u, A);
    fe_mul(u, u, t3);
    fe_mul(u, u, t2);
    fe_mul(u, u, t1);
    fe_neg(u, u);
    fe_tobytes(curve, u);

    WIPE_BUFFER(t1);  WIPE_BUFFER(r);
    WIPE_BUFFER(t2);  WIPE_BUFFER(u);
    WIPE_BUFFER(t3);  WIPE_BUFFER(clamped);
}

// Elligator inverse map
//
// Computes the representative of a point, if possible.  If not, it does
// nothing and returns -1.  Note that the success of the operation
// depends only on the point (more precisely its u coordinate).  The
// tweak parameter is used only upon success.
//
// The tweak should be a random byte.  Beyond that, its contents are an
// implementation detail.  Currently, the tweak comprises:
// - Bit  1  : sign of the v coordinate (0 if positive, 1 if negative)
// - Bits 2-5: not used
// - Bits 6-7: random padding
//
// From the paper:
// Let sq = -non_square * u * (u+A)
// if sq is not a square, or u = -A, there is no mapping
// Assuming there is a mapping:
//   if v is positive: r = sqrt(-(u+A) / u)
//   if v is negative: r = sqrt(-u / (u+A))
//
// We compute isr = invsqrt(-non_square * u * (u+A))
// if it wasn't a non-zero square, abort.
// else, isr = sqrt(-1 / (non_square * u * (u+A)))
//
// This causes us to abort if u is zero, even though we shouldn't.  This
// never happens in practice, because (i) a random point on the curve has
// a negligible chance of being zero, and (ii) scalar multiplication with
// a trimmed scalar *never* yields zero.
//
// Since:
//   isr * (u+A) = sqrt(-1     / (non_square * u * (u+A))) * (u+A)
//   isr * (u+A) = sqrt(-(u+A) / (non_square * u))
// and:
//   isr * u = sqrt(-1 / (non_square * u * (u+A))) * u
//   isr * u = sqrt(-u / (non_square * (u+A)))
// Therefore:
//   if v is positive: r = isr * (u+A)
//   if v is negative: r = isr * u
int crypto_curve_to_hidden(u8 hidden[32], const u8 public_key[32], u8 tweak)
{
    fe t1, t2, t3;
    fe_frombytes(t1, public_key);

    fe_add(t2, t1, A);
    fe_mul(t3, t1, t2);
    fe_mul_small(t3, t3, -2);
    int is_square = invsqrt(t3, t3);
    if (!is_square) {
        // The only variable time bit.  This ultimately reveals how many
        // tries it took us to find a representable key.
        // This does not affect security as long as we try keys at random.
        WIPE_BUFFER(t1);
        WIPE_BUFFER(t2);
        WIPE_BUFFER(t3);
        return -1;
    }
    fe_ccopy    (t1, t2, tweak & 1);
    fe_mul      (t3, t1, t3);
    fe_mul_small(t1, t3, 2);
    fe_neg      (t2, t3);
    fe_ccopy    (t3, t2, fe_isodd(t1));
    fe_tobytes(hidden, t3);

    // Pad with two random bits
    hidden[31] |= tweak & 0xc0;

    WIPE_BUFFER(t1);
    WIPE_BUFFER(t2);
    WIPE_BUFFER(t3);
    return 0;
}

void crypto_hidden_key_pair(u8 hidden[32], u8 secret_key[32], u8 seed[32])
{
    u8 pk [32]; // public key
    u8 buf[64]; // seed + representative
    COPY(buf + 32, seed, 32);
    do {
        crypto_chacha20(buf, 0, 64, buf+32, zero);
        crypto_x25519_dirty_fast(pk, buf); // or the "small" version
    } while(crypto_curve_to_hidden(buf+32, pk, buf[32]));
    // Note that the return value of crypto_curve_to_hidden() is
    // independent from its tweak parameter.
    // Therefore, buf[32] is not actually reused.  Either we loop one
    // more time and buf[32] is used for the new seed, or we succeeded,
    // and buf[32] becomes the tweak parameter.

    crypto_wipe(seed, 32);
    COPY(hidden    , buf + 32, 32);
    COPY(secret_key, buf     , 32);
    WIPE_BUFFER(buf);
    WIPE_BUFFER(pk);
}
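
// Example (sketch, not part of the library): hiding a key exchange.
// Assumes "seed" and "bob_sk" are filled by a CSPRNG, and that Bob's
// public key is known to Alice.
//
//     u8 seed[32], hidden[32], alice_sk[32];
//     crypto_hidden_key_pair(hidden, alice_sk, seed);   // wipes seed
//     // "hidden" can be sent in the clear; it is indistinguishable from
//     // 32 random bytes.  The recipient recovers the real public key:
//     u8 alice_pk[32];
//     crypto_hidden_to_curve(alice_pk, hidden);
//     u8 shared[32];
//     crypto_x25519(shared, bob_sk, alice_pk);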

////////////////////
/// Key exchange ///
////////////////////
void crypto_key_exchange(u8       shared_key[32],
                         const u8 your_secret_key [32],
                         const u8 their_public_key[32])
{
    crypto_x25519(shared_key, your_secret_key, their_public_key);
    crypto_hchacha20(shared_key, shared_key, zero);
}
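
// Example (sketch, not part of the library): both parties derive the same
// shared key.  Assumes the secret keys are random and the public keys have
// already been exchanged; crypto_x25519_public_key() is defined elsewhere
// in this file.
//
//     u8 alice_sk[32], alice_pk[32], bob_sk[32], bob_pk[32];
//     crypto_x25519_public_key(alice_pk, alice_sk);
//     crypto_x25519_public_key(bob_pk  , bob_sk);
//
//     u8 k1[32], k2[32];
//     crypto_key_exchange(k1, alice_sk, bob_pk);
//     crypto_key_exchange(k2, bob_sk  , alice_pk);   // k1 == k2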

///////////////////////
/// Scalar division ///
///////////////////////

// Montgomery reduction.
// Divides x by (2^256), and reduces the result modulo L
//
// Precondition:
//   x < L * 2^256
// Constants:
//   r = 2^256                 (makes division by r trivial)
//   k = (r * (1/r) - 1) // L  (1/r is computed modulo L   )
// Algorithm:
//   s = (x * k) % r
//   t = x + s*L      (t is always a multiple of r)
//   u = (t/r) % L    (u is always below 2*L, conditional subtraction is enough)
static void redc(u32 u[8], u32 x[16])
{
    static const u32 k[8]  = { 0x12547e1b, 0xd2b51da3, 0xfdba84ff, 0xb1a206f2,
                               0xffa36bea, 0x14e75438, 0x6fe91836, 0x9db6c6f2,};
    static const u32 l[8]  = { 0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
                               0x00000000, 0x00000000, 0x00000000, 0x10000000,};
    // s = x * k (modulo 2^256)
    // This is cheaper than the full multiplication.
    u32 s[8] = {0};
    FOR (i, 0, 8) {
        u64 carry = 0;
        FOR (j, 0, 8-i) {
            carry  += s[i+j] + (u64)x[i] * k[j];
            s[i+j]  = (u32)carry;
            carry >>= 32;
        }
    }
    u32 t[16] = {0};
    multiply(t, s, l);

    // t = t + x
    u64 carry = 0;
    FOR (i, 0, 16) {
        carry  += (u64)t[i] + x[i];
        t[i]    = (u32)carry;
        carry >>= 32;
    }

    // u = (t / 2^256) % L
    // Note that t / 2^256 is always below 2*L,
    // so a constant time conditional subtraction is enough.
    // We work with L directly, in a 2's complement encoding
    // (-L == ~L + 1)
    remove_l(u, t+8);

    WIPE_BUFFER(s);
    WIPE_BUFFER(t);
}

void crypto_x25519_inverse(u8 blind_salt [32], const u8 private_key[32],
                           const u8 curve_point[32])
{
    static const  u8 Lm2[32] = { // L - 2
        0xeb, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, 0xd6, 0x9c, 0xf7, 0xa2,
        0xde, 0xf9, 0xde, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, };
    // 1 in Montgomery form
    u32 m_inv [8] = {0x8d98951d, 0xd6ec3174, 0x737dcf70, 0xc6ef5bf4,
                     0xfffffffe, 0xffffffff, 0xffffffff, 0x0fffffff,};

    u8 scalar[32];
    COPY(scalar, private_key, 32);
    trim_scalar(scalar);

    // Convert the scalar to Montgomery form
    // m_scl = scalar * 2^256 (modulo L)
    u32 m_scl[8];
    {
        u32 tmp[16];
        ZERO(tmp, 8);
        load32_le_buf(tmp+8, scalar, 8);
        mod_l(scalar, tmp);
        load32_le_buf(m_scl, scalar, 8);
        WIPE_BUFFER(tmp); // Wipe ASAP to save stack space
    }

    u32 product[16];
    for (int i = 252; i >= 0; i--) {
        ZERO(product, 16);
        multiply(product, m_inv, m_inv);
        redc(m_inv, product);
        if (scalar_bit(Lm2, i)) {
            ZERO(product, 16);
            multiply(product, m_inv, m_scl);
            redc(m_inv, product);
        }
    }
    // Convert the inverse *out* of Montgomery form
    // scalar = m_inv / 2^256 (modulo L)
    COPY(product, m_inv, 8);
    ZERO(product + 8, 8);
    redc(m_inv, product);
    store32_le_buf(scalar, m_inv, 8); // the *inverse* of the scalar

    // Clear the cofactor of scalar:
    //   cleared = scalar * (3*L + 1)      (modulo 8*L)
    //   cleared = scalar + scalar * 3 * L (modulo 8*L)
    // Note that (scalar * 3) is reduced modulo 8, so we only need the
    // first byte.
    add_xl(scalar, scalar[0] * 3);

    // Recall that 8*L < 2^256. However it is also very close to
    // 2^255. If we spanned the ladder over 255 bits, random tests
    // wouldn't catch the off-by-one error.
    scalarmult(blind_salt, scalar, curve_point, 256);

    WIPE_BUFFER(scalar);   WIPE_BUFFER(m_scl);
    WIPE_BUFFER(product);  WIPE_BUFFER(m_inv);
}
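
// Example (sketch, not part of the library): crypto_x25519_inverse() undoes
// a previous crypto_x25519() made with the same private key, which is the
// basis of blind Diffie-Hellman (exponential blinding).  Assumes P is a
// regular X25519 public key (a point in the prime order subgroup).
//
//     u8 blinded[32], unblinded[32];
//     crypto_x25519(blinded, private_key, P);                  // blind
//     crypto_x25519_inverse(unblinded, private_key, blinded);  // unblind
//     // unblinded is now equal to P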

////////////////////////////////
/// Authenticated encryption ///
////////////////////////////////
static void lock_auth(u8 mac[16], const u8  auth_key[32],
                      const u8 *ad         , size_t ad_size,
                      const u8 *cipher_text, size_t text_size)
{
    u8 sizes[16]; // Not secret, not wiped
    store64_le(sizes + 0, ad_size);
    store64_le(sizes + 8, text_size);
    crypto_poly1305_ctx poly_ctx;           // auto wiped...
    crypto_poly1305_init  (&poly_ctx, auth_key);
    crypto_poly1305_update(&poly_ctx, ad         , ad_size);
    crypto_poly1305_update(&poly_ctx, zero       , align(ad_size, 16));
    crypto_poly1305_update(&poly_ctx, cipher_text, text_size);
    crypto_poly1305_update(&poly_ctx, zero       , align(text_size, 16));
    crypto_poly1305_update(&poly_ctx, sizes      , 16);
    crypto_poly1305_final (&poly_ctx, mac); // ...here
}

void crypto_lock_aead(u8 mac[16], u8 *cipher_text,
                      const u8  key[32], const u8  nonce[24],
                      const u8 *ad        , size_t ad_size,
                      const u8 *plain_text, size_t text_size)
{
    u8 sub_key[32];
    u8 auth_key[64]; // "Wasting" the whole Chacha block is faster
    crypto_hchacha20(sub_key, key, nonce);
    crypto_chacha20(auth_key, 0, 64, sub_key, nonce + 16);
    crypto_chacha20_ctr(cipher_text, plain_text, text_size,
                        sub_key, nonce + 16, 1);
    lock_auth(mac, auth_key, ad, ad_size, cipher_text, text_size);
    WIPE_BUFFER(sub_key);
    WIPE_BUFFER(auth_key);
}

int crypto_unlock_aead(u8 *plain_text, const u8 key[32], const u8 nonce[24],
                       const u8  mac[16],
                       const u8 *ad         , size_t ad_size,
                       const u8 *cipher_text, size_t text_size)
{
    u8 sub_key[32];
    u8 auth_key[64]; // "Wasting" the whole Chacha block is faster
    crypto_hchacha20(sub_key, key, nonce);
    crypto_chacha20(auth_key, 0, 64, sub_key, nonce + 16);
    u8 real_mac[16];
    lock_auth(real_mac, auth_key, ad, ad_size, cipher_text, text_size);
    WIPE_BUFFER(auth_key);
    if (crypto_verify16(mac, real_mac)) {
        WIPE_BUFFER(sub_key);
        WIPE_BUFFER(real_mac);
        return -1;
    }
    crypto_chacha20_ctr(plain_text, cipher_text, text_size,
                        sub_key, nonce + 16, 1);
    WIPE_BUFFER(sub_key);
    WIPE_BUFFER(real_mac);
    return 0;
}

void crypto_lock(u8 mac[16], u8 *cipher_text,
                 const u8 key[32], const u8 nonce[24],
                 const u8 *plain_text, size_t text_size)
{
    crypto_lock_aead(mac, cipher_text, key, nonce, 0, 0, plain_text, text_size);
}

int crypto_unlock(u8 *plain_text,
                  const u8 key[32], const u8 nonce[24], const u8 mac[16],
                  const u8 *cipher_text, size_t text_size)
{
    return crypto_unlock_aead(plain_text, key, nonce, mac, 0, 0,
                              cipher_text, text_size);
}
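
// Example (sketch, not part of the library): authenticated encryption
// round trip.  Assumes key and nonce come from a CSPRNG; a nonce must
// never be reused with the same key.  Encryption and decryption may work
// in place, as shown here.
//
//     u8 key[32], nonce[24];               // random, nonce never reused
//     u8 mac[16];
//     u8 text[12] = "Lorem ipsum";
//     crypto_lock(mac, text, key, nonce, text, sizeof(text)); // encrypt in place
//     if (crypto_unlock(text, key, nonce, mac, text, sizeof(text))) {
//         // corrupted: the MAC check failed, text still holds the cipher text
//     }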