/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24*0957b409SSimon J. Gerraty 
25*0957b409SSimon J. Gerraty #include "inner.h"
26*0957b409SSimon J. Gerraty 
/*
 * Perform the inner processing of blocks for Poly1305.
 *
 * For each 16-byte block of data[], the block value (with an extra
 * "high bit" set at bit position 128) is added to the accumulator,
 * and the accumulator is then multiplied by 'r' modulo 2^130-5.
 *
 *   a      accumulator, as ten 13-bit words in little-endian order;
 *          updated in place. On output, all words fit on 13 bits
 *          except a[1], which may be slightly larger.
 *   r      19-word array: r[9..18] are the ten 13-bit words of the
 *          'r' key value, and r[0..8] are the pre-multiplied (5x)
 *          copies used for the wrap-around terms.
 *   data   input bytes (read only).
 *   len    input length, in bytes. If the length is not a multiple
 *          of 16, the final partial block is right-padded with zeros
 *          before being processed. If len is 0, nothing is done.
 */
static void
poly1305_inner(uint32_t *a, const uint32_t *r, const void *data, size_t len)
{
	/*
	 * Implementation notes: we split the 130-bit values into ten
	 * 13-bit words. This gives us some space for carries and allows
	 * using only 32x32->32 multiplications, which are way faster than
	 * 32x32->64 multiplications on the ARM Cortex-M0/M0+, and also
	 * help in making constant-time code on the Cortex-M3.
	 *
	 * Since we compute modulo 2^130-5, the "upper words" become
	 * low words with a factor of 5; that is, x*2^130 = x*5 mod p.
	 * This has already been integrated in the r[] array, which
	 * is extended to the 0..18 range.
	 *
	 * In each loop iteration, a[] and r[] words are 13-bit each,
	 * except a[1] which may use 14 bits.
	 */
	const unsigned char *buf;

	buf = data;
	while (len > 0) {
		unsigned char tmp[16];
		uint32_t b[10];
		unsigned u;
		uint32_t v, z, cc1, cc2;

		/*
		 * If there is a partial block, right-pad it with zeros.
		 * Since buf then points into tmp[] and len is forced to
		 * 16, the loop ends after this iteration.
		 */
		if (len < 16) {
			memset(tmp, 0, sizeof tmp);
			memcpy(tmp, buf, len);
			buf = tmp;
			len = 16;
		}

		/*
		 * Decode next block and apply the "high bit"; that value
		 * is added to the accumulator.
		 *
		 * 'v' is a sliding bit window over the block: bytes are
		 * OR-ed in above the leftover bits, and 13 bits are
		 * extracted at a time. It holds up to 20 significant
		 * bits, hence the explicit 32-bit type and casts.
		 */
		v = br_dec16le(buf);
		a[0] += v & 0x01FFF;
		v >>= 13;
		v |= (uint32_t)buf[2] << 3;
		v |= (uint32_t)buf[3] << 11;
		a[1] += v & 0x01FFF;
		v >>= 13;
		v |= (uint32_t)buf[4] << 6;
		a[2] += v & 0x01FFF;
		v >>= 13;
		v |= (uint32_t)buf[5] << 1;
		v |= (uint32_t)buf[6] << 9;
		a[3] += v & 0x01FFF;
		v >>= 13;
		v |= (uint32_t)buf[7] << 4;
		v |= (uint32_t)buf[8] << 12;
		a[4] += v & 0x01FFF;
		v >>= 13;
		v |= (uint32_t)buf[9] << 7;
		a[5] += v & 0x01FFF;
		v >>= 13;
		v |= (uint32_t)buf[10] << 2;
		v |= (uint32_t)buf[11] << 10;
		a[6] += v & 0x01FFF;
		v >>= 13;
		v |= (uint32_t)buf[12] << 5;
		a[7] += v & 0x01FFF;
		/*
		 * The first eight words consumed exactly 13 bytes (104
		 * bits), so decoding restarts on a byte boundary.
		 */
		v = br_dec16le(buf + 13);
		a[8] += v & 0x01FFF;
		v >>= 13;
		v |= (uint32_t)buf[15] << 3;
		/*
		 * Word 9 starts at bit 117; bit 11 of that word is thus
		 * bit 128 of the block value (the "high bit").
		 */
		a[9] += v | 0x00800;

		/*
		 * At that point, all a[] values fit on 14 bits, while
		 * all r[] values fit on 13 bits. Thus products fit on
		 * 27 bits, and we can accumulate up to 31 of them in
		 * a 32-bit word and still have some room for carries.
		 */

		/*
		 * Now a[] contains words with values up to 14 bits each.
		 * We perform the multiplication with r[].
		 *
		 * The extended words of r[] may be larger than 13 bits
		 * (they are 5 times a 13-bit word) so the full summation
		 * may yield values up to 46 times a 27-bit word, which
		 * does not fit on a 32-bit word. To avoid that issue, we
		 * must split the loop below in two, with a carry
		 * propagation operation in the middle.
		 */

		/*
		 * First half: contributions of a[0..4]. For output word
		 * u and input word i, the multiplier is r[u + 9 - i]:
		 * indices 9..18 are the plain words of 'r', indices 0..8
		 * are the wrapped-around words with the 5x factor.
		 */
		cc1 = 0;
		for (u = 0; u < 10; u ++) {
			uint32_t s;

			s = cc1
				+ MUL15(a[0], r[u + 9 - 0])
				+ MUL15(a[1], r[u + 9 - 1])
				+ MUL15(a[2], r[u + 9 - 2])
				+ MUL15(a[3], r[u + 9 - 3])
				+ MUL15(a[4], r[u + 9 - 4]);
			b[u] = s & 0x1FFF;
			cc1 = s >> 13;
		}

		/*
		 * Second half: contributions of a[5..9], added to the
		 * partial result in b[].
		 */
		cc2 = 0;
		for (u = 0; u < 10; u ++) {
			uint32_t s;

			s = b[u] + cc2
				+ MUL15(a[5], r[u + 9 - 5])
				+ MUL15(a[6], r[u + 9 - 6])
				+ MUL15(a[7], r[u + 9 - 7])
				+ MUL15(a[8], r[u + 9 - 8])
				+ MUL15(a[9], r[u + 9 - 9]);
			b[u] = s & 0x1FFF;
			cc2 = s >> 13;
		}
		memcpy(a, b, sizeof b);

		/*
		 * The two carries "loop back" with a factor of 5. We
		 * propagate them into a[0] and a[1].
		 */
		z = cc1 + cc2;
		z += (z << 2) + a[0];
		a[0] = z & 0x1FFF;
		a[1] += z >> 13;

		buf += 16;
		len -= 16;
	}
}
163*0957b409SSimon J. Gerraty 
164*0957b409SSimon J. Gerraty /* see bearssl_block.h */
165*0957b409SSimon J. Gerraty void
br_poly1305_ctmul32_run(const void * key,const void * iv,void * data,size_t len,const void * aad,size_t aad_len,void * tag,br_chacha20_run ichacha,int encrypt)166*0957b409SSimon J. Gerraty br_poly1305_ctmul32_run(const void *key, const void *iv,
167*0957b409SSimon J. Gerraty 	void *data, size_t len, const void *aad, size_t aad_len,
168*0957b409SSimon J. Gerraty 	void *tag, br_chacha20_run ichacha, int encrypt)
169*0957b409SSimon J. Gerraty {
170*0957b409SSimon J. Gerraty 	unsigned char pkey[32], foot[16];
171*0957b409SSimon J. Gerraty 	uint32_t z, r[19], acc[10], cc, ctl;
172*0957b409SSimon J. Gerraty 	int i;
173*0957b409SSimon J. Gerraty 
174*0957b409SSimon J. Gerraty 	/*
175*0957b409SSimon J. Gerraty 	 * Compute the MAC key. The 'r' value is the first 16 bytes of
176*0957b409SSimon J. Gerraty 	 * pkey[].
177*0957b409SSimon J. Gerraty 	 */
178*0957b409SSimon J. Gerraty 	memset(pkey, 0, sizeof pkey);
179*0957b409SSimon J. Gerraty 	ichacha(key, iv, 0, pkey, sizeof pkey);
180*0957b409SSimon J. Gerraty 
181*0957b409SSimon J. Gerraty 	/*
182*0957b409SSimon J. Gerraty 	 * If encrypting, ChaCha20 must run first, followed by Poly1305.
183*0957b409SSimon J. Gerraty 	 * When decrypting, the operations are reversed.
184*0957b409SSimon J. Gerraty 	 */
185*0957b409SSimon J. Gerraty 	if (encrypt) {
186*0957b409SSimon J. Gerraty 		ichacha(key, iv, 1, data, len);
187*0957b409SSimon J. Gerraty 	}
188*0957b409SSimon J. Gerraty 
189*0957b409SSimon J. Gerraty 	/*
190*0957b409SSimon J. Gerraty 	 * Run Poly1305. We must process the AAD, then ciphertext, then
191*0957b409SSimon J. Gerraty 	 * the footer (with the lengths). Note that the AAD and ciphertext
192*0957b409SSimon J. Gerraty 	 * are meant to be padded with zeros up to the next multiple of 16,
193*0957b409SSimon J. Gerraty 	 * and the length of the footer is 16 bytes as well.
194*0957b409SSimon J. Gerraty 	 */
195*0957b409SSimon J. Gerraty 
196*0957b409SSimon J. Gerraty 	/*
197*0957b409SSimon J. Gerraty 	 * Decode the 'r' value into 13-bit words, with the "clamping"
198*0957b409SSimon J. Gerraty 	 * operation applied.
199*0957b409SSimon J. Gerraty 	 */
200*0957b409SSimon J. Gerraty 	z = br_dec32le(pkey) & 0x03FFFFFF;
201*0957b409SSimon J. Gerraty 	r[9] = z & 0x1FFF;
202*0957b409SSimon J. Gerraty 	r[10] = z >> 13;
203*0957b409SSimon J. Gerraty 	z = (br_dec32le(pkey +  3) >> 2) & 0x03FFFF03;
204*0957b409SSimon J. Gerraty 	r[11] = z & 0x1FFF;
205*0957b409SSimon J. Gerraty 	r[12] = z >> 13;
206*0957b409SSimon J. Gerraty 	z = (br_dec32le(pkey +  6) >> 4) & 0x03FFC0FF;
207*0957b409SSimon J. Gerraty 	r[13] = z & 0x1FFF;
208*0957b409SSimon J. Gerraty 	r[14] = z >> 13;
209*0957b409SSimon J. Gerraty 	z = (br_dec32le(pkey +  9) >> 6) & 0x03F03FFF;
210*0957b409SSimon J. Gerraty 	r[15] = z & 0x1FFF;
211*0957b409SSimon J. Gerraty 	r[16] = z >> 13;
212*0957b409SSimon J. Gerraty 	z = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF;
213*0957b409SSimon J. Gerraty 	r[17] = z & 0x1FFF;
214*0957b409SSimon J. Gerraty 	r[18] = z >> 13;
215*0957b409SSimon J. Gerraty 
216*0957b409SSimon J. Gerraty 	/*
217*0957b409SSimon J. Gerraty 	 * Extend r[] with the 5x factor pre-applied.
218*0957b409SSimon J. Gerraty 	 */
219*0957b409SSimon J. Gerraty 	for (i = 0; i < 9; i ++) {
220*0957b409SSimon J. Gerraty 		r[i] = MUL15(5, r[i + 10]);
221*0957b409SSimon J. Gerraty 	}
222*0957b409SSimon J. Gerraty 
223*0957b409SSimon J. Gerraty 	/*
224*0957b409SSimon J. Gerraty 	 * Accumulator is 0.
225*0957b409SSimon J. Gerraty 	 */
226*0957b409SSimon J. Gerraty 	memset(acc, 0, sizeof acc);
227*0957b409SSimon J. Gerraty 
228*0957b409SSimon J. Gerraty 	/*
229*0957b409SSimon J. Gerraty 	 * Process the additional authenticated data, ciphertext, and
230*0957b409SSimon J. Gerraty 	 * footer in due order.
231*0957b409SSimon J. Gerraty 	 */
232*0957b409SSimon J. Gerraty 	br_enc64le(foot, (uint64_t)aad_len);
233*0957b409SSimon J. Gerraty 	br_enc64le(foot + 8, (uint64_t)len);
234*0957b409SSimon J. Gerraty 	poly1305_inner(acc, r, aad, aad_len);
235*0957b409SSimon J. Gerraty 	poly1305_inner(acc, r, data, len);
236*0957b409SSimon J. Gerraty 	poly1305_inner(acc, r, foot, sizeof foot);
237*0957b409SSimon J. Gerraty 
238*0957b409SSimon J. Gerraty 	/*
239*0957b409SSimon J. Gerraty 	 * Finalise modular reduction. This is done with carry propagation
240*0957b409SSimon J. Gerraty 	 * and applying the '2^130 = -5 mod p' rule. Note that the output
241*0957b409SSimon J. Gerraty 	 * of poly1035_inner() is already mostly reduced, since only
242*0957b409SSimon J. Gerraty 	 * acc[1] may be (very slightly) above 2^13. A single loop back
243*0957b409SSimon J. Gerraty 	 * to acc[1] will be enough to make the value fit in 130 bits.
244*0957b409SSimon J. Gerraty 	 */
245*0957b409SSimon J. Gerraty 	cc = 0;
246*0957b409SSimon J. Gerraty 	for (i = 1; i < 10; i ++) {
247*0957b409SSimon J. Gerraty 		z = acc[i] + cc;
248*0957b409SSimon J. Gerraty 		acc[i] = z & 0x1FFF;
249*0957b409SSimon J. Gerraty 		cc = z >> 13;
250*0957b409SSimon J. Gerraty 	}
251*0957b409SSimon J. Gerraty 	z = acc[0] + cc + (cc << 2);
252*0957b409SSimon J. Gerraty 	acc[0] = z & 0x1FFF;
253*0957b409SSimon J. Gerraty 	acc[1] += z >> 13;
254*0957b409SSimon J. Gerraty 
255*0957b409SSimon J. Gerraty 	/*
256*0957b409SSimon J. Gerraty 	 * We may still have a value in the 2^130-5..2^130-1 range, in
257*0957b409SSimon J. Gerraty 	 * which case we must reduce it again. The code below selects,
258*0957b409SSimon J. Gerraty 	 * in constant-time, between 'acc' and 'acc-p',
259*0957b409SSimon J. Gerraty 	 */
260*0957b409SSimon J. Gerraty 	ctl = GT(acc[0], 0x1FFA);
261*0957b409SSimon J. Gerraty 	for (i = 1; i < 10; i ++) {
262*0957b409SSimon J. Gerraty 		ctl &= EQ(acc[i], 0x1FFF);
263*0957b409SSimon J. Gerraty 	}
264*0957b409SSimon J. Gerraty 	acc[0] = MUX(ctl, acc[0] - 0x1FFB, acc[0]);
265*0957b409SSimon J. Gerraty 	for (i = 1; i < 10; i ++) {
266*0957b409SSimon J. Gerraty 		acc[i] &= ~(-ctl);
267*0957b409SSimon J. Gerraty 	}
268*0957b409SSimon J. Gerraty 
269*0957b409SSimon J. Gerraty 	/*
270*0957b409SSimon J. Gerraty 	 * Convert back the accumulator to 32-bit words, and add the
271*0957b409SSimon J. Gerraty 	 * 's' value (second half of pkey[]). That addition is done
272*0957b409SSimon J. Gerraty 	 * modulo 2^128.
273*0957b409SSimon J. Gerraty 	 */
274*0957b409SSimon J. Gerraty 	z = acc[0] + (acc[1] << 13) + br_dec16le(pkey + 16);
275*0957b409SSimon J. Gerraty 	br_enc16le((unsigned char *)tag, z & 0xFFFF);
276*0957b409SSimon J. Gerraty 	z = (z >> 16) + (acc[2] << 10) + br_dec16le(pkey + 18);
277*0957b409SSimon J. Gerraty 	br_enc16le((unsigned char *)tag + 2, z & 0xFFFF);
278*0957b409SSimon J. Gerraty 	z = (z >> 16) + (acc[3] << 7) + br_dec16le(pkey + 20);
279*0957b409SSimon J. Gerraty 	br_enc16le((unsigned char *)tag + 4, z & 0xFFFF);
280*0957b409SSimon J. Gerraty 	z = (z >> 16) + (acc[4] << 4) + br_dec16le(pkey + 22);
281*0957b409SSimon J. Gerraty 	br_enc16le((unsigned char *)tag + 6, z & 0xFFFF);
282*0957b409SSimon J. Gerraty 	z = (z >> 16) + (acc[5] << 1) + (acc[6] << 14) + br_dec16le(pkey + 24);
283*0957b409SSimon J. Gerraty 	br_enc16le((unsigned char *)tag + 8, z & 0xFFFF);
284*0957b409SSimon J. Gerraty 	z = (z >> 16) + (acc[7] << 11) + br_dec16le(pkey + 26);
285*0957b409SSimon J. Gerraty 	br_enc16le((unsigned char *)tag + 10, z & 0xFFFF);
286*0957b409SSimon J. Gerraty 	z = (z >> 16) + (acc[8] << 8) + br_dec16le(pkey + 28);
287*0957b409SSimon J. Gerraty 	br_enc16le((unsigned char *)tag + 12, z & 0xFFFF);
288*0957b409SSimon J. Gerraty 	z = (z >> 16) + (acc[9] << 5) + br_dec16le(pkey + 30);
289*0957b409SSimon J. Gerraty 	br_enc16le((unsigned char *)tag + 14, z & 0xFFFF);
290*0957b409SSimon J. Gerraty 
291*0957b409SSimon J. Gerraty 	/*
292*0957b409SSimon J. Gerraty 	 * If decrypting, then ChaCha20 runs _after_ Poly1305.
293*0957b409SSimon J. Gerraty 	 */
294*0957b409SSimon J. Gerraty 	if (!encrypt) {
295*0957b409SSimon J. Gerraty 		ichacha(key, iv, 1, data, len);
296*0957b409SSimon J. Gerraty 	}
297*0957b409SSimon J. Gerraty }
298