1 /* ===================================================================
2  *
3  * Copyright (c) 2018, Helder Eijs <helderijs@gmail.com>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in
14  *    the documentation and/or other materials provided with the
15  *    distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21  * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  * ===================================================================
30  */
31 
32 #include "common.h"
33 #include "endianess.h"
34 
35 FAKE_INIT(poly1305)
36 
37 typedef struct mac_state_t {
38     uint32_t r[4], rr[4];   /** first key - variable in polynomial **/
39     uint32_t s[5];          /** second key - fixed term in polynomial **/
40     uint32_t h[5];          /** state **/
41 
42     uint8_t buffer[16];     /** temp input **/
43     unsigned buffer_used;
44 } mac_state;
45 
46 /*
47  * Load 16 bytes as the secret r, which is the value we evaluate the polynomial
48  * with, modulo 2^130-5.
49  *
50  * The secret gets encoded into four 32-bit words (r[]), after appropriate clamping
51  * (reset) is applied to 22 of its bits.
52  *
53  * Additionaly, reduce modulo 2^130-5 the value 2^130*r into rr[], which we can
54  * reuse several times later during each multiplication.
55  *
56  * @param[out]  r:      The 4-word array with the r value (little-endian)
57  * @param[out]  rr:     The 4-word array with the value (r * 2^130) modulo 2^130-5 (little-endian)
58  * @param[in] secret:   The 16 bytes encoding r (not necessarily clamped already)
59  */
poly1305_load_r(uint32_t r[4],uint32_t rr[4],const uint8_t secret[16])60 STATIC void poly1305_load_r(uint32_t r[4], uint32_t rr[4], const uint8_t secret[16])
61 {
62     unsigned i;
63     uint32_t mask;
64 
65     for (i=0; i<4; i++) {
66         /**
67          * The 4 most significant bits in a word are reset.
68          * The 2 least significant bits in a word are reset, except for r[0]
69          */
70         mask = (i==0) ? 0x0FFFFFFFU : 0x0FFFFFFCU;
71         r[i] = LOAD_U32_LITTLE(secret+i*4) & mask;
72         rr[i] = (r[i] >> 2)*5;
73     }
74 }
75 
76 /*
77  * Load the next chunk of message as an integer.
78  *
79  * @param[out] m:       The 5-word array the chunk will be read into (little-endian)
80  * @param[in]  data:    The next chunk of message, at most 16 bytes. It is
81  *                      smaller than 16 only if it is the last chunk.
82  * @param[in]  len:     The length of the chunk (<=16)
83  */
poly1305_load_m(uint32_t m[5],const uint8_t data[],size_t len)84 STATIC void poly1305_load_m(uint32_t m[5], const uint8_t data[], size_t len)
85 {
86     uint8_t copy[sizeof(uint32_t)*5];
87 
88     assert(len<=16);
89 
90     memset(copy, 0, sizeof(copy));
91     memcpy(copy, data, len);
92     copy[len] = 1;  /** 2^128 or 2^{8*(l mod 16)} **/
93 
94     m[0] = LOAD_U32_LITTLE(copy);
95     m[1] = LOAD_U32_LITTLE(copy+4);
96     m[2] = LOAD_U32_LITTLE(copy+8);
97     m[3] = LOAD_U32_LITTLE(copy+12);
98     m[4] = LOAD_U32_LITTLE(copy+16);
99 }
100 
101 /*
102  * Load 16 bytes as the secret s, which is the fixed term for the polynomial, modulo 2^130-5.
103  *
104  * @param[out] m:   The 5-word array that will contain the secret s (little-endian)
105  * @param[in] s:    The 16 bytes that encode the value s. It is typically the
106  *                  result of an AES of ChaCha20 encryption.
107  */
poly1305_load_s(uint32_t m[5],const uint8_t s[16])108 static void poly1305_load_s(uint32_t m[5], const uint8_t s[16])
109 {
110     m[0] = LOAD_U32_LITTLE(s);
111     m[1] = LOAD_U32_LITTLE(s+4);
112     m[2] = LOAD_U32_LITTLE(s+8);
113     m[3] = LOAD_U32_LITTLE(s+12);
114     m[4] = 0;
115 }
116 
117 /**
118  * Multiply a value by the secret r, "almost" modulo 2^130-5.
119  *
120  * @param[in,out] h:    The 5-word array with the value to multiply (little-endian).
121  *                      The result is stored back here.
122  *                      The result is guaranteed to be smaller than 2^131 (not 2^130-5,
123  *                      hence the "almost" modulo) for any value of h[] in input.
124  * @param[in] r:        The 4-word array with the multiplier, as generated by
125  *                      poly1305_load_r() (little-endian).
126  * @param[in] rr:       The 4-word array with the other value generated by
127  *                      poly1305__load_r() for the same multipler (little-endian).
128  */
poly1305_multiply(uint32_t h[5],const uint32_t r[4],const uint32_t rr[4])129 STATIC void poly1305_multiply(uint32_t h[5], const uint32_t r[4], const uint32_t rr[4])
130 {
131     uint64_t a0, a1, a2, a3;
132     uint64_t aa0, aa1, aa2, aa3;
133     uint64_t x0, x1, x2, x3, x4;
134     uint64_t carry;
135 
136     /*
137      * Boundaries
138      * - h[0..4]  <   2^32
139      * - r[0..3]  <   2^28 < 5*2^26
140      * - rr[0..3] < 5*2^26
141      */
142 
143     a0 = r[0];
144     a1 = r[1];
145     a2 = r[2];
146     a3 = r[3];
147     aa0 = rr[0];
148     aa1 = rr[1];
149     aa2 = rr[2];
150     aa3 = rr[3];
151 
152     /**
153      * Schoolbook multiplication between h[] and  r[], with the caveat that
154      * the components exceeding 2^130 are folded back with a right shift and
155      * a multiplication by 5 (already precomputed in rr[]).
156      *
157      * Each sum is guaranteed to be smaller than 2^63 (x0 being the worst case).
158      */
159     x0 = a0*h[0] + aa0*h[4] + aa1*h[3] + aa2*h[2] + aa3*h[1];
160     x1 = a0*h[1] +  a1*h[0] + aa1*h[4] + aa2*h[3] + aa3*h[2];
161     x2 = a0*h[2] +  a1*h[1] +  a2*h[0] + aa2*h[4] + aa3*h[3];
162     x3 = a0*h[3] +  a1*h[2] +  a2*h[1] +  a3*h[0] + aa3*h[4];
163     x4 = (a0 & 3)*h[4]; /** < 2^34 **/
164 
165     /** Clear upper half of x3 **/
166     x4 += x3 >> 32;
167     x3 &= UINT32_MAX;
168 
169     /** Clear the 62 most significant bits of x4 and
170      *  create carry for x0 **/
171     carry = (x4 >> 2)*5;    /** < 2^35 **/
172     x4 &= 3;
173 
174     /** Reduce x0 to 32 bits and store into h0 **/
175     x0 += carry;
176     h[0] = x0 & UINT32_MAX;
177     carry = x0 >> 32;
178 
179     /** Reduce x1 to 32 bits and store into h1 **/
180     x1 += carry;
181     h[1] = x1 & UINT32_MAX;
182     carry = x1 >> 32;
183 
184     /** Reduce x2 to 32 bits and store into h2 **/
185     x2 += carry;
186     h[2] = x2 & UINT32_MAX;
187     carry = x2 >> 32;
188 
189     /** Reduce x3 to 32 bits and store into h3 **/
190     x3 += carry;
191     h[3] = x3 & UINT32_MAX;
192     carry = x3 >> 32;   /** < 1 **/
193 
194     /** Reduce x4 to 32 bits and store into h4 **/
195     x4 += carry;    /** < 2^3 **/
196     assert(x4 < 8);
197     h[4] = (uint32_t)x4;
198 }
199 
200 /*
201  * Reduce a value h[] modulo 2^130-5.
202  *
203  * @param[in,out] h:    The 5-word array with the value to reduce (little-endian).
204  *                      The result is stored back here and it is guaranteed to
205  *                      be smaller than 2^130- 5.
206  *                      The incoming value h must be smaller than 2^131.
207  */
poly1305_reduce(uint32_t h[5])208 STATIC void poly1305_reduce(uint32_t h[5])
209 {
210     unsigned i;
211 
212     assert(h[4]<8);
213 
214     for (i=0; i<2; i++) {
215         uint32_t mask, carry;
216         uint32_t g[5];
217 
218         /** Compute h+(-p) by adding and removing 2^130 **/
219         g[0] = h[0] + 5;     carry = g[0] < h[0];
220         g[1] = h[1] + carry; carry = g[1] < h[1];
221         g[2] = h[2] + carry; carry = g[2] < h[2];
222         g[3] = h[3] + carry; carry = g[3] < h[3];
223         g[4] = h[4] + carry - 4;
224 
225         mask = (g[4] >> 31) - 1U;    /** All 1s if g[] is a valid reduction **/
226         h[0] = (h[0] & ~mask) ^ (g[0] & mask);
227         h[1] = (h[1] & ~mask) ^ (g[1] & mask);
228         h[2] = (h[2] & ~mask) ^ (g[2] & mask);
229         h[3] = (h[3] & ~mask) ^ (g[3] & mask);
230         h[4] = (h[4] & ~mask) ^ (g[4] & mask);
231     }
232 }
233 
234 /**
235  * Add two values.
236  *
237  * It must be assured that the sum does not exceed 2^160.
238  *
239  * @param[in,out] h:    The 5-word variable to accumulate into (little-endian).
240  * @param[in]     m:    The other 5-word term to add (little-endian).
241  */
poly1305_accumulate(uint32_t h[5],const uint32_t m[5])242 STATIC void poly1305_accumulate(uint32_t h[5], const uint32_t m[5])
243 {
244 #if 0
245     // 128-bit type exist and little-endian
246     uint32_t carry;
247     __uint128_t a, b, c;
248 
249     memcpy(&a, h, 16);
250     memcpy(&b, m, 16);
251     c = a + b; carry = c < a;
252     memcpy(h, &c, 16);
253     h[4] += m[4] + carry;
254 #else
255     uint8_t carry;
256     uint64_t tmp;
257 
258     h[0] += m[0];
259     carry = h[0] < m[0];
260 
261     tmp = (uint64_t)h[1] + m[1] + carry;
262     h[1] = (uint32_t) tmp;
263     carry = (tmp >> 32) & 1;
264 
265     tmp = (uint64_t)h[2] + m[2] + carry;
266     h[2] = (uint32_t) tmp;
267     carry = (tmp >> 32) & 1;
268 
269     tmp = (uint64_t)h[3] + m[3] + carry;
270     h[3] = (uint32_t) tmp;
271     carry = (tmp >> 32) & 1;
272 
273     tmp = (uint64_t)h[4] + m[4] + carry;
274     h[4] = (uint32_t) tmp;
275 
276     assert((tmp >> 32) == 0);
277 #endif
278 }
279 
280 /**
281  * Process the next chunk of the message.
282  *
283  * This procedure performs the following operation (assuming that msg is 16 byte long):
284  *
285  *     h = r * (h + (2^128 + little_endian_int(msg))) quasi-modulo 2^130-5
286  *
287  * Quasi-modulo means that the computations are performed modulo 2^130-5 but the
288  * result is still only guaranteed to be smaller than 2^131.
289  *
290  * @param[in,out] h:    The 5-word variable to accumulate into.
291  *                      In input, it must be smaller than 2^131.
292  *                      In output, it is guranteed to remain smaller than 2^131.
293  * @param[in] r:        The 4-word array with the multiplier, as generated by
294  *                      poly1305_load_r()
295  * @param[in] rr:       The 4-word array with the other value generated by
296  *                      poly1305__load_r() for the same multipler.
297  * @param[in]  data:    The next chunk of message, at most 16 bytes. It is
298  *                      smaller than 16 only if it is the last chunk.
299  * @param[in]  len:     The length of chunk (<=16)
300  */
poly1305_process(uint32_t h[5],uint32_t r[4],uint32_t rr[4],uint8_t msg[],size_t len)301 static void poly1305_process(uint32_t h[5], uint32_t r[4], uint32_t rr[4], uint8_t msg[], size_t len)
302 {
303     uint32_t m[5];
304 
305     if (len == 0)
306         return;
307 
308     poly1305_load_m(m, msg, len);
309     poly1305_accumulate(h, m);      /** We add two values that don't exceed 2^131, so
310                                       * this addition will not overflow 2^160.
311                                       */
312     poly1305_multiply(h, r, rr);
313 }
314 
315 /*
316  * Terminate processing of the message and create the final MAC tag.
317  *
318  * @param[in,out] h:    The 5-word variable where the resulting MAC must be put into,
319  *                      truncated to 128 bits.
320  *                      In input, it contains the value the polynomial has been evaluated at,
321  *                      without the fixed term. The input is smaller than 2^131.
322  * @param[in] s:        The 5-word value s, that is, the fixed term of the
323  *                      polynomial, as created by poly1305_load_s().
324  */
poly1305_finalize(uint32_t h[5],const uint32_t s[5])325 static void poly1305_finalize(uint32_t h[5], const uint32_t s[5])
326 {
327     poly1305_reduce(h);
328     poly1305_accumulate(h, s);
329     h[4] = 0;   /** modulo 2**128 **/
330 }
331 
332 /* --------------------------------------------------------- */
333 
poly1305_init(mac_state ** pState,const uint8_t r[16],size_t r_len,const uint8_t s[16],size_t s_len)334 EXPORT_SYM int poly1305_init(mac_state **pState,
335                              const uint8_t r[16],
336                              size_t r_len,
337                              const uint8_t s[16],
338                              size_t s_len)
339 {
340     mac_state *ms;
341 
342     if (NULL == pState || NULL == r || NULL == s)
343         return ERR_NULL;
344 
345     if (r_len != 16 || s_len != 16)
346         return ERR_KEY_SIZE;
347 
348     *pState = ms = (mac_state*) calloc(1, sizeof(mac_state));
349     if (NULL == ms)
350         return ERR_MEMORY;
351 
352     poly1305_load_r(ms->r, ms->rr, r);
353     poly1305_load_s(ms->s, s);
354 
355     return 0;
356 }
357 
poly1305_destroy(mac_state * state)358 EXPORT_SYM int poly1305_destroy(mac_state *state)
359 {
360     if (NULL == state)
361         return ERR_NULL;
362     free(state);
363     return 0;
364 }
365 
poly1305_update(mac_state * state,const uint8_t * in,size_t len)366 EXPORT_SYM int poly1305_update(mac_state *state,
367                                const uint8_t *in,
368                                size_t len)
369 {
370     if (NULL == state || NULL == in)
371         return ERR_NULL;
372 
373     while (len>0) {
374         unsigned btc;
375 
376         btc = (unsigned)MIN(len, 16 - state->buffer_used);
377         memcpy(state->buffer + state->buffer_used, in, btc);
378         state->buffer_used += btc;
379         in += btc;
380         len -= btc;
381 
382         if (state->buffer_used == 16) {
383             poly1305_process(state->h, state->r, state->rr, state->buffer, 16);
384             state->buffer_used = 0;
385         }
386     }
387 
388     return 0;
389 }
390 
poly1305_digest(const mac_state * state,uint8_t digest[16],size_t len)391 EXPORT_SYM int poly1305_digest(const mac_state *state,
392                                uint8_t digest[16],
393                                size_t len)
394 {
395     mac_state temp;
396     unsigned i;
397 
398     if (NULL == state || NULL == digest) {
399         return ERR_NULL;
400     }
401 
402     if (len != 16)
403         return ERR_DIGEST_SIZE;
404 
405     temp = *state;
406 
407     if (temp.buffer_used > 0) {
408         poly1305_process(temp.h, temp.r, temp.rr, temp.buffer, temp.buffer_used);
409     }
410 
411     poly1305_finalize(temp.h, temp.s);
412 
413     for (i=0; i<4; i++) {
414         STORE_U32_LITTLE(digest+i*4, temp.h[i]);
415     }
416 
417     return 0;
418 }
419 
420 #ifdef PROFILE
main(void)421 int main(void)
422 {
423     const unsigned data_size = 1024*1024;
424     mac_state *state;
425     const uint8_t r[16] = "1234567890123456";
426     const uint8_t s[16] = "1234567890123456";
427     uint8_t *data;
428 
429     data = malloc(data_size);
430     for (int i=0; i<data_size; i++) {
431         data[i] = (uint8_t) i;
432     }
433 
434     poly1305_init(&state, r, 16, s, 16);
435 
436     for (int i=0; i<1024; i++)
437         poly1305_update(state, data, 1024*1024);
438 
439     poly1305_destroy(state);
440     free(data);
441 }
442 #endif
443