1 /* ===================================================================
2 *
3 * Copyright (c) 2018, Helder Eijs <helderijs@gmail.com>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 * ===================================================================
30 */
31
32 #include "common.h"
33 #include "endianess.h"
34
35 FAKE_INIT(poly1305)
36
37 typedef struct mac_state_t {
38 uint32_t r[4], rr[4]; /** first key - variable in polynomial **/
39 uint32_t s[5]; /** second key - fixed term in polynomial **/
40 uint32_t h[5]; /** state **/
41
42 uint8_t buffer[16]; /** temp input **/
43 unsigned buffer_used;
44 } mac_state;
45
46 /*
47 * Load 16 bytes as the secret r, which is the value we evaluate the polynomial
48 * with, modulo 2^130-5.
49 *
50 * The secret gets encoded into four 32-bit words (r[]), after appropriate clamping
51 * (reset) is applied to 22 of its bits.
52 *
53 * Additionaly, reduce modulo 2^130-5 the value 2^130*r into rr[], which we can
54 * reuse several times later during each multiplication.
55 *
56 * @param[out] r: The 4-word array with the r value (little-endian)
57 * @param[out] rr: The 4-word array with the value (r * 2^130) modulo 2^130-5 (little-endian)
58 * @param[in] secret: The 16 bytes encoding r (not necessarily clamped already)
59 */
poly1305_load_r(uint32_t r[4],uint32_t rr[4],const uint8_t secret[16])60 STATIC void poly1305_load_r(uint32_t r[4], uint32_t rr[4], const uint8_t secret[16])
61 {
62 unsigned i;
63 uint32_t mask;
64
65 for (i=0; i<4; i++) {
66 /**
67 * The 4 most significant bits in a word are reset.
68 * The 2 least significant bits in a word are reset, except for r[0]
69 */
70 mask = (i==0) ? 0x0FFFFFFFU : 0x0FFFFFFCU;
71 r[i] = LOAD_U32_LITTLE(secret+i*4) & mask;
72 rr[i] = (r[i] >> 2)*5;
73 }
74 }
75
76 /*
77 * Load the next chunk of message as an integer.
78 *
79 * @param[out] m: The 5-word array the chunk will be read into (little-endian)
80 * @param[in] data: The next chunk of message, at most 16 bytes. It is
81 * smaller than 16 only if it is the last chunk.
82 * @param[in] len: The length of the chunk (<=16)
83 */
poly1305_load_m(uint32_t m[5],const uint8_t data[],size_t len)84 STATIC void poly1305_load_m(uint32_t m[5], const uint8_t data[], size_t len)
85 {
86 uint8_t copy[sizeof(uint32_t)*5];
87
88 assert(len<=16);
89
90 memset(copy, 0, sizeof(copy));
91 memcpy(copy, data, len);
92 copy[len] = 1; /** 2^128 or 2^{8*(l mod 16)} **/
93
94 m[0] = LOAD_U32_LITTLE(copy);
95 m[1] = LOAD_U32_LITTLE(copy+4);
96 m[2] = LOAD_U32_LITTLE(copy+8);
97 m[3] = LOAD_U32_LITTLE(copy+12);
98 m[4] = LOAD_U32_LITTLE(copy+16);
99 }
100
101 /*
102 * Load 16 bytes as the secret s, which is the fixed term for the polynomial, modulo 2^130-5.
103 *
104 * @param[out] m: The 5-word array that will contain the secret s (little-endian)
105 * @param[in] s: The 16 bytes that encode the value s. It is typically the
106 * result of an AES of ChaCha20 encryption.
107 */
poly1305_load_s(uint32_t m[5],const uint8_t s[16])108 static void poly1305_load_s(uint32_t m[5], const uint8_t s[16])
109 {
110 m[0] = LOAD_U32_LITTLE(s);
111 m[1] = LOAD_U32_LITTLE(s+4);
112 m[2] = LOAD_U32_LITTLE(s+8);
113 m[3] = LOAD_U32_LITTLE(s+12);
114 m[4] = 0;
115 }
116
117 /**
118 * Multiply a value by the secret r, "almost" modulo 2^130-5.
119 *
120 * @param[in,out] h: The 5-word array with the value to multiply (little-endian).
121 * The result is stored back here.
122 * The result is guaranteed to be smaller than 2^131 (not 2^130-5,
123 * hence the "almost" modulo) for any value of h[] in input.
124 * @param[in] r: The 4-word array with the multiplier, as generated by
125 * poly1305_load_r() (little-endian).
126 * @param[in] rr: The 4-word array with the other value generated by
127 * poly1305__load_r() for the same multipler (little-endian).
128 */
poly1305_multiply(uint32_t h[5],const uint32_t r[4],const uint32_t rr[4])129 STATIC void poly1305_multiply(uint32_t h[5], const uint32_t r[4], const uint32_t rr[4])
130 {
131 uint64_t a0, a1, a2, a3;
132 uint64_t aa0, aa1, aa2, aa3;
133 uint64_t x0, x1, x2, x3, x4;
134 uint64_t carry;
135
136 /*
137 * Boundaries
138 * - h[0..4] < 2^32
139 * - r[0..3] < 2^28 < 5*2^26
140 * - rr[0..3] < 5*2^26
141 */
142
143 a0 = r[0];
144 a1 = r[1];
145 a2 = r[2];
146 a3 = r[3];
147 aa0 = rr[0];
148 aa1 = rr[1];
149 aa2 = rr[2];
150 aa3 = rr[3];
151
152 /**
153 * Schoolbook multiplication between h[] and r[], with the caveat that
154 * the components exceeding 2^130 are folded back with a right shift and
155 * a multiplication by 5 (already precomputed in rr[]).
156 *
157 * Each sum is guaranteed to be smaller than 2^63 (x0 being the worst case).
158 */
159 x0 = a0*h[0] + aa0*h[4] + aa1*h[3] + aa2*h[2] + aa3*h[1];
160 x1 = a0*h[1] + a1*h[0] + aa1*h[4] + aa2*h[3] + aa3*h[2];
161 x2 = a0*h[2] + a1*h[1] + a2*h[0] + aa2*h[4] + aa3*h[3];
162 x3 = a0*h[3] + a1*h[2] + a2*h[1] + a3*h[0] + aa3*h[4];
163 x4 = (a0 & 3)*h[4]; /** < 2^34 **/
164
165 /** Clear upper half of x3 **/
166 x4 += x3 >> 32;
167 x3 &= UINT32_MAX;
168
169 /** Clear the 62 most significant bits of x4 and
170 * create carry for x0 **/
171 carry = (x4 >> 2)*5; /** < 2^35 **/
172 x4 &= 3;
173
174 /** Reduce x0 to 32 bits and store into h0 **/
175 x0 += carry;
176 h[0] = x0 & UINT32_MAX;
177 carry = x0 >> 32;
178
179 /** Reduce x1 to 32 bits and store into h1 **/
180 x1 += carry;
181 h[1] = x1 & UINT32_MAX;
182 carry = x1 >> 32;
183
184 /** Reduce x2 to 32 bits and store into h2 **/
185 x2 += carry;
186 h[2] = x2 & UINT32_MAX;
187 carry = x2 >> 32;
188
189 /** Reduce x3 to 32 bits and store into h3 **/
190 x3 += carry;
191 h[3] = x3 & UINT32_MAX;
192 carry = x3 >> 32; /** < 1 **/
193
194 /** Reduce x4 to 32 bits and store into h4 **/
195 x4 += carry; /** < 2^3 **/
196 assert(x4 < 8);
197 h[4] = (uint32_t)x4;
198 }
199
200 /*
201 * Reduce a value h[] modulo 2^130-5.
202 *
203 * @param[in,out] h: The 5-word array with the value to reduce (little-endian).
204 * The result is stored back here and it is guaranteed to
205 * be smaller than 2^130- 5.
206 * The incoming value h must be smaller than 2^131.
207 */
poly1305_reduce(uint32_t h[5])208 STATIC void poly1305_reduce(uint32_t h[5])
209 {
210 unsigned i;
211
212 assert(h[4]<8);
213
214 for (i=0; i<2; i++) {
215 uint32_t mask, carry;
216 uint32_t g[5];
217
218 /** Compute h+(-p) by adding and removing 2^130 **/
219 g[0] = h[0] + 5; carry = g[0] < h[0];
220 g[1] = h[1] + carry; carry = g[1] < h[1];
221 g[2] = h[2] + carry; carry = g[2] < h[2];
222 g[3] = h[3] + carry; carry = g[3] < h[3];
223 g[4] = h[4] + carry - 4;
224
225 mask = (g[4] >> 31) - 1U; /** All 1s if g[] is a valid reduction **/
226 h[0] = (h[0] & ~mask) ^ (g[0] & mask);
227 h[1] = (h[1] & ~mask) ^ (g[1] & mask);
228 h[2] = (h[2] & ~mask) ^ (g[2] & mask);
229 h[3] = (h[3] & ~mask) ^ (g[3] & mask);
230 h[4] = (h[4] & ~mask) ^ (g[4] & mask);
231 }
232 }
233
234 /**
235 * Add two values.
236 *
237 * It must be assured that the sum does not exceed 2^160.
238 *
239 * @param[in,out] h: The 5-word variable to accumulate into (little-endian).
240 * @param[in] m: The other 5-word term to add (little-endian).
241 */
poly1305_accumulate(uint32_t h[5],const uint32_t m[5])242 STATIC void poly1305_accumulate(uint32_t h[5], const uint32_t m[5])
243 {
244 #if 0
245 // 128-bit type exist and little-endian
246 uint32_t carry;
247 __uint128_t a, b, c;
248
249 memcpy(&a, h, 16);
250 memcpy(&b, m, 16);
251 c = a + b; carry = c < a;
252 memcpy(h, &c, 16);
253 h[4] += m[4] + carry;
254 #else
255 uint8_t carry;
256 uint64_t tmp;
257
258 h[0] += m[0];
259 carry = h[0] < m[0];
260
261 tmp = (uint64_t)h[1] + m[1] + carry;
262 h[1] = (uint32_t) tmp;
263 carry = (tmp >> 32) & 1;
264
265 tmp = (uint64_t)h[2] + m[2] + carry;
266 h[2] = (uint32_t) tmp;
267 carry = (tmp >> 32) & 1;
268
269 tmp = (uint64_t)h[3] + m[3] + carry;
270 h[3] = (uint32_t) tmp;
271 carry = (tmp >> 32) & 1;
272
273 tmp = (uint64_t)h[4] + m[4] + carry;
274 h[4] = (uint32_t) tmp;
275
276 assert((tmp >> 32) == 0);
277 #endif
278 }
279
280 /**
281 * Process the next chunk of the message.
282 *
283 * This procedure performs the following operation (assuming that msg is 16 byte long):
284 *
285 * h = r * (h + (2^128 + little_endian_int(msg))) quasi-modulo 2^130-5
286 *
287 * Quasi-modulo means that the computations are performed modulo 2^130-5 but the
288 * result is still only guaranteed to be smaller than 2^131.
289 *
290 * @param[in,out] h: The 5-word variable to accumulate into.
291 * In input, it must be smaller than 2^131.
292 * In output, it is guranteed to remain smaller than 2^131.
293 * @param[in] r: The 4-word array with the multiplier, as generated by
294 * poly1305_load_r()
295 * @param[in] rr: The 4-word array with the other value generated by
296 * poly1305__load_r() for the same multipler.
297 * @param[in] data: The next chunk of message, at most 16 bytes. It is
298 * smaller than 16 only if it is the last chunk.
299 * @param[in] len: The length of chunk (<=16)
300 */
poly1305_process(uint32_t h[5],uint32_t r[4],uint32_t rr[4],uint8_t msg[],size_t len)301 static void poly1305_process(uint32_t h[5], uint32_t r[4], uint32_t rr[4], uint8_t msg[], size_t len)
302 {
303 uint32_t m[5];
304
305 if (len == 0)
306 return;
307
308 poly1305_load_m(m, msg, len);
309 poly1305_accumulate(h, m); /** We add two values that don't exceed 2^131, so
310 * this addition will not overflow 2^160.
311 */
312 poly1305_multiply(h, r, rr);
313 }
314
315 /*
316 * Terminate processing of the message and create the final MAC tag.
317 *
318 * @param[in,out] h: The 5-word variable where the resulting MAC must be put into,
319 * truncated to 128 bits.
320 * In input, it contains the value the polynomial has been evaluated at,
321 * without the fixed term. The input is smaller than 2^131.
322 * @param[in] s: The 5-word value s, that is, the fixed term of the
323 * polynomial, as created by poly1305_load_s().
324 */
poly1305_finalize(uint32_t h[5],const uint32_t s[5])325 static void poly1305_finalize(uint32_t h[5], const uint32_t s[5])
326 {
327 poly1305_reduce(h);
328 poly1305_accumulate(h, s);
329 h[4] = 0; /** modulo 2**128 **/
330 }
331
332 /* --------------------------------------------------------- */
333
poly1305_init(mac_state ** pState,const uint8_t r[16],size_t r_len,const uint8_t s[16],size_t s_len)334 EXPORT_SYM int poly1305_init(mac_state **pState,
335 const uint8_t r[16],
336 size_t r_len,
337 const uint8_t s[16],
338 size_t s_len)
339 {
340 mac_state *ms;
341
342 if (NULL == pState || NULL == r || NULL == s)
343 return ERR_NULL;
344
345 if (r_len != 16 || s_len != 16)
346 return ERR_KEY_SIZE;
347
348 *pState = ms = (mac_state*) calloc(1, sizeof(mac_state));
349 if (NULL == ms)
350 return ERR_MEMORY;
351
352 poly1305_load_r(ms->r, ms->rr, r);
353 poly1305_load_s(ms->s, s);
354
355 return 0;
356 }
357
poly1305_destroy(mac_state * state)358 EXPORT_SYM int poly1305_destroy(mac_state *state)
359 {
360 if (NULL == state)
361 return ERR_NULL;
362 free(state);
363 return 0;
364 }
365
poly1305_update(mac_state * state,const uint8_t * in,size_t len)366 EXPORT_SYM int poly1305_update(mac_state *state,
367 const uint8_t *in,
368 size_t len)
369 {
370 if (NULL == state || NULL == in)
371 return ERR_NULL;
372
373 while (len>0) {
374 unsigned btc;
375
376 btc = (unsigned)MIN(len, 16 - state->buffer_used);
377 memcpy(state->buffer + state->buffer_used, in, btc);
378 state->buffer_used += btc;
379 in += btc;
380 len -= btc;
381
382 if (state->buffer_used == 16) {
383 poly1305_process(state->h, state->r, state->rr, state->buffer, 16);
384 state->buffer_used = 0;
385 }
386 }
387
388 return 0;
389 }
390
poly1305_digest(const mac_state * state,uint8_t digest[16],size_t len)391 EXPORT_SYM int poly1305_digest(const mac_state *state,
392 uint8_t digest[16],
393 size_t len)
394 {
395 mac_state temp;
396 unsigned i;
397
398 if (NULL == state || NULL == digest) {
399 return ERR_NULL;
400 }
401
402 if (len != 16)
403 return ERR_DIGEST_SIZE;
404
405 temp = *state;
406
407 if (temp.buffer_used > 0) {
408 poly1305_process(temp.h, temp.r, temp.rr, temp.buffer, temp.buffer_used);
409 }
410
411 poly1305_finalize(temp.h, temp.s);
412
413 for (i=0; i<4; i++) {
414 STORE_U32_LITTLE(digest+i*4, temp.h[i]);
415 }
416
417 return 0;
418 }
419
420 #ifdef PROFILE
main(void)421 int main(void)
422 {
423 const unsigned data_size = 1024*1024;
424 mac_state *state;
425 const uint8_t r[16] = "1234567890123456";
426 const uint8_t s[16] = "1234567890123456";
427 uint8_t *data;
428
429 data = malloc(data_size);
430 for (int i=0; i<data_size; i++) {
431 data[i] = (uint8_t) i;
432 }
433
434 poly1305_init(&state, r, 16, s, 16);
435
436 for (int i=0; i<1024; i++)
437 poly1305_update(state, data, 1024*1024);
438
439 poly1305_destroy(state);
440 free(data);
441 }
442 #endif
443