/*
	Public domain by Andrew M. <liquidsun@gmail.com>
*/
4 
5 
/*
	Arithmetic modulo the group order n = 2^252 + 27742317777372353535851937790883648493 = 7237005577332262213973186563042994240857116359379907606001950938285454250989

	k = 32
	b = 1 << 8 = 256
	m = 2^252 + 27742317777372353535851937790883648493 = 0x1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed
	mu = floor( b^(k*2) / m ) = 0xfffffffffffffffffffffffffffffffeb2106215d086329a7ed9ce5a30a2c131b
*/
14 
/*
	Scalar representation: five 64-bit words, least significant first.
	Limbs 0..3 carry 56 bits each; limb 4 carries whatever is left over.
*/
#define bignum256modm_bits_per_limb 56
#define bignum256modm_limb_size 5

typedef uint64_t bignum256modm_element_t;
typedef bignum256modm_element_t bignum256modm[5];

/* the modulus m (see the header comment), split into 56-bit limbs */
static const bignum256modm modm_m = {
	0x12631a5cf5d3ed,
	0xf9dea2f79cd658,
	0x000000000014de,
	0x00000000000000,
	0x00000010000000
};

/* the Barrett constant mu = floor(b^(2k) / m), split into 56-bit limbs */
static const bignum256modm modm_mu = {
	0x9ce5a30a2c131b,
	0x215d086329a7ed,
	0xffffffffeb2106,
	0xffffffffffffff,
	0x00000fffffffff
};
36 
37 static bignum256modm_element_t
lt_modm(bignum256modm_element_t a,bignum256modm_element_t b)38 lt_modm(bignum256modm_element_t a, bignum256modm_element_t b) {
39 	return (a - b) >> 63;
40 }
41 
42 static void
reduce256_modm(bignum256modm r)43 reduce256_modm(bignum256modm r) {
44 	bignum256modm t;
45 	bignum256modm_element_t b = 0, pb, mask;
46 
47 	/* t = r - m */
48 	pb = 0;
49 	pb += modm_m[0]; b = lt_modm(r[0], pb); t[0] = (r[0] - pb + (b << 56)); pb = b;
50 	pb += modm_m[1]; b = lt_modm(r[1], pb); t[1] = (r[1] - pb + (b << 56)); pb = b;
51 	pb += modm_m[2]; b = lt_modm(r[2], pb); t[2] = (r[2] - pb + (b << 56)); pb = b;
52 	pb += modm_m[3]; b = lt_modm(r[3], pb); t[3] = (r[3] - pb + (b << 56)); pb = b;
53 	pb += modm_m[4]; b = lt_modm(r[4], pb); t[4] = (r[4] - pb + (b << 32));
54 
55 	/* keep r if r was smaller than m */
56 	mask = b - 1;
57 
58 	r[0] ^= mask & (r[0] ^ t[0]);
59 	r[1] ^= mask & (r[1] ^ t[1]);
60 	r[2] ^= mask & (r[2] ^ t[2]);
61 	r[3] ^= mask & (r[3] ^ t[3]);
62 	r[4] ^= mask & (r[4] ^ t[4]);
63 }
64 
65 static void
barrett_reduce256_modm(bignum256modm r,const bignum256modm q1,const bignum256modm r1)66 barrett_reduce256_modm(bignum256modm r, const bignum256modm q1, const bignum256modm r1) {
67 	bignum256modm q3, r2;
68 	uint128_t c, mul;
69 	bignum256modm_element_t f, b, pb;
70 
71 	/* q1 = x >> 248 = 264 bits = 5 56 bit elements
72 	   q2 = mu * q1
73 	   q3 = (q2 / 256(32+1)) = q2 / (2^8)^(32+1) = q2 >> 264 */
74 	mul64x64_128(c, modm_mu[0], q1[3])                 mul64x64_128(mul, modm_mu[3], q1[0]) add128(c, mul) mul64x64_128(mul, modm_mu[1], q1[2]) add128(c, mul) mul64x64_128(mul, modm_mu[2], q1[1]) add128(c, mul) shr128(f, c, 56);
75 	mul64x64_128(c, modm_mu[0], q1[4]) add128_64(c, f) mul64x64_128(mul, modm_mu[4], q1[0]) add128(c, mul) mul64x64_128(mul, modm_mu[3], q1[1]) add128(c, mul) mul64x64_128(mul, modm_mu[1], q1[3]) add128(c, mul) mul64x64_128(mul, modm_mu[2], q1[2]) add128(c, mul)
76 	f = lo128(c); q3[0] = (f >> 40) & 0xffff; shr128(f, c, 56);
77 	mul64x64_128(c, modm_mu[4], q1[1]) add128_64(c, f) mul64x64_128(mul, modm_mu[1], q1[4]) add128(c, mul) mul64x64_128(mul, modm_mu[2], q1[3]) add128(c, mul) mul64x64_128(mul, modm_mu[3], q1[2]) add128(c, mul)
78 	f = lo128(c); q3[0] |= (f << 16) & 0xffffffffffffff; q3[1] = (f >> 40) & 0xffff; shr128(f, c, 56);
79 	mul64x64_128(c, modm_mu[4], q1[2]) add128_64(c, f) mul64x64_128(mul, modm_mu[2], q1[4]) add128(c, mul) mul64x64_128(mul, modm_mu[3], q1[3]) add128(c, mul)
80 	f = lo128(c); q3[1] |= (f << 16) & 0xffffffffffffff; q3[2] = (f >> 40) & 0xffff; shr128(f, c, 56);
81 	mul64x64_128(c, modm_mu[4], q1[3]) add128_64(c, f) mul64x64_128(mul, modm_mu[3], q1[4]) add128(c, mul)
82 	f = lo128(c); q3[2] |= (f << 16) & 0xffffffffffffff; q3[3] = (f >> 40) & 0xffff; shr128(f, c, 56);
83 	mul64x64_128(c, modm_mu[4], q1[4]) add128_64(c, f)
84 	f = lo128(c); q3[3] |= (f << 16) & 0xffffffffffffff; q3[4] = (f >> 40) & 0xffff; shr128(f, c, 56);
85 	q3[4] |= (f << 16);
86 
87 	mul64x64_128(c, modm_m[0], q3[0])
88 	r2[0] = lo128(c) & 0xffffffffffffff; shr128(f, c, 56);
89 	mul64x64_128(c, modm_m[0], q3[1]) add128_64(c, f) mul64x64_128(mul, modm_m[1], q3[0]) add128(c, mul)
90 	r2[1] = lo128(c) & 0xffffffffffffff; shr128(f, c, 56);
91 	mul64x64_128(c, modm_m[0], q3[2]) add128_64(c, f) mul64x64_128(mul, modm_m[2], q3[0]) add128(c, mul) mul64x64_128(mul, modm_m[1], q3[1]) add128(c, mul)
92 	r2[2] = lo128(c) & 0xffffffffffffff; shr128(f, c, 56);
93 	mul64x64_128(c, modm_m[0], q3[3]) add128_64(c, f) mul64x64_128(mul, modm_m[3], q3[0]) add128(c, mul) mul64x64_128(mul, modm_m[1], q3[2]) add128(c, mul) mul64x64_128(mul, modm_m[2], q3[1]) add128(c, mul)
94 	r2[3] = lo128(c) & 0xffffffffffffff; shr128(f, c, 56);
95 	mul64x64_128(c, modm_m[0], q3[4]) add128_64(c, f) mul64x64_128(mul, modm_m[4], q3[0]) add128(c, mul) mul64x64_128(mul, modm_m[3], q3[1]) add128(c, mul) mul64x64_128(mul, modm_m[1], q3[3]) add128(c, mul) mul64x64_128(mul, modm_m[2], q3[2]) add128(c, mul)
96 	r2[4] = lo128(c) & 0x0000ffffffffff;
97 
98 	pb = 0;
99 	pb += r2[0]; b = lt_modm(r1[0], pb); r[0] = (r1[0] - pb + (b << 56)); pb = b;
100 	pb += r2[1]; b = lt_modm(r1[1], pb); r[1] = (r1[1] - pb + (b << 56)); pb = b;
101 	pb += r2[2]; b = lt_modm(r1[2], pb); r[2] = (r1[2] - pb + (b << 56)); pb = b;
102 	pb += r2[3]; b = lt_modm(r1[3], pb); r[3] = (r1[3] - pb + (b << 56)); pb = b;
103 	pb += r2[4]; b = lt_modm(r1[4], pb); r[4] = (r1[4] - pb + (b << 40));
104 
105 	reduce256_modm(r);
106 	reduce256_modm(r);
107 }
108 
109 
110 static void
add256_modm(bignum256modm r,const bignum256modm x,const bignum256modm y)111 add256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
112 	bignum256modm_element_t c;
113 
114 	c  = x[0] + y[0]; r[0] = c & 0xffffffffffffff; c >>= 56;
115 	c += x[1] + y[1]; r[1] = c & 0xffffffffffffff; c >>= 56;
116 	c += x[2] + y[2]; r[2] = c & 0xffffffffffffff; c >>= 56;
117 	c += x[3] + y[3]; r[3] = c & 0xffffffffffffff; c >>= 56;
118 	c += x[4] + y[4]; r[4] = c;
119 
120 	reduce256_modm(r);
121 }
122 
123 static void
mul256_modm(bignum256modm r,const bignum256modm x,const bignum256modm y)124 mul256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
125 	bignum256modm q1, r1;
126 	uint128_t c, mul;
127 	bignum256modm_element_t f;
128 
129 	mul64x64_128(c, x[0], y[0])
130 	f = lo128(c); r1[0] = f & 0xffffffffffffff; shr128(f, c, 56);
131 	mul64x64_128(c, x[0], y[1]) add128_64(c, f) mul64x64_128(mul, x[1], y[0]) add128(c, mul)
132 	f = lo128(c); r1[1] = f & 0xffffffffffffff; shr128(f, c, 56);
133 	mul64x64_128(c, x[0], y[2]) add128_64(c, f) mul64x64_128(mul, x[2], y[0]) add128(c, mul) mul64x64_128(mul, x[1], y[1]) add128(c, mul)
134 	f = lo128(c); r1[2] = f & 0xffffffffffffff; shr128(f, c, 56);
135 	mul64x64_128(c, x[0], y[3]) add128_64(c, f) mul64x64_128(mul, x[3], y[0]) add128(c, mul) mul64x64_128(mul, x[1], y[2]) add128(c, mul) mul64x64_128(mul, x[2], y[1]) add128(c, mul)
136 	f = lo128(c); r1[3] = f & 0xffffffffffffff; shr128(f, c, 56);
137 	mul64x64_128(c, x[0], y[4]) add128_64(c, f) mul64x64_128(mul, x[4], y[0]) add128(c, mul) mul64x64_128(mul, x[3], y[1]) add128(c, mul) mul64x64_128(mul, x[1], y[3]) add128(c, mul) mul64x64_128(mul, x[2], y[2]) add128(c, mul)
138 	f = lo128(c); r1[4] = f & 0x0000ffffffffff; q1[0] = (f >> 24) & 0xffffffff; shr128(f, c, 56);
139 	mul64x64_128(c, x[4], y[1]) add128_64(c, f) mul64x64_128(mul, x[1], y[4]) add128(c, mul) mul64x64_128(mul, x[2], y[3]) add128(c, mul) mul64x64_128(mul, x[3], y[2]) add128(c, mul)
140 	f = lo128(c); q1[0] |= (f << 32) & 0xffffffffffffff; q1[1] = (f >> 24) & 0xffffffff; shr128(f, c, 56);
141 	mul64x64_128(c, x[4], y[2]) add128_64(c, f) mul64x64_128(mul, x[2], y[4]) add128(c, mul) mul64x64_128(mul, x[3], y[3]) add128(c, mul)
142 	f = lo128(c); q1[1] |= (f << 32) & 0xffffffffffffff; q1[2] = (f >> 24) & 0xffffffff; shr128(f, c, 56);
143 	mul64x64_128(c, x[4], y[3]) add128_64(c, f) mul64x64_128(mul, x[3], y[4]) add128(c, mul)
144 	f = lo128(c); q1[2] |= (f << 32) & 0xffffffffffffff; q1[3] = (f >> 24) & 0xffffffff; shr128(f, c, 56);
145 	mul64x64_128(c, x[4], y[4]) add128_64(c, f)
146 	f = lo128(c); q1[3] |= (f << 32) & 0xffffffffffffff; q1[4] = (f >> 24) & 0xffffffff; shr128(f, c, 56);
147 	q1[4] |= (f << 32);
148 
149 	barrett_reduce256_modm(r, q1, r1);
150 }
151 
152 static void
expand256_modm(bignum256modm out,const unsigned char * in,size_t len)153 expand256_modm(bignum256modm out, const unsigned char *in, size_t len) {
154 	unsigned char work[64] = {0};
155 	bignum256modm_element_t x[16];
156 	bignum256modm q1;
157 
158 	memcpy(work, in, len);
159 	x[0] = U8TO64_LE(work +  0);
160 	x[1] = U8TO64_LE(work +  8);
161 	x[2] = U8TO64_LE(work + 16);
162 	x[3] = U8TO64_LE(work + 24);
163 	x[4] = U8TO64_LE(work + 32);
164 	x[5] = U8TO64_LE(work + 40);
165 	x[6] = U8TO64_LE(work + 48);
166 	x[7] = U8TO64_LE(work + 56);
167 
168 	/* r1 = (x mod 256^(32+1)) = x mod (2^8)(31+1) = x & ((1 << 264) - 1) */
169 	out[0] = (                         x[0]) & 0xffffffffffffff;
170 	out[1] = ((x[ 0] >> 56) | (x[ 1] <<  8)) & 0xffffffffffffff;
171 	out[2] = ((x[ 1] >> 48) | (x[ 2] << 16)) & 0xffffffffffffff;
172 	out[3] = ((x[ 2] >> 40) | (x[ 3] << 24)) & 0xffffffffffffff;
173 	out[4] = ((x[ 3] >> 32) | (x[ 4] << 32)) & 0x0000ffffffffff;
174 
175 	/* under 252 bits, no need to reduce */
176 	if (len < 32)
177 		return;
178 
179 	/* q1 = x >> 248 = 264 bits */
180 	q1[0] = ((x[ 3] >> 56) | (x[ 4] <<  8)) & 0xffffffffffffff;
181 	q1[1] = ((x[ 4] >> 48) | (x[ 5] << 16)) & 0xffffffffffffff;
182 	q1[2] = ((x[ 5] >> 40) | (x[ 6] << 24)) & 0xffffffffffffff;
183 	q1[3] = ((x[ 6] >> 32) | (x[ 7] << 32)) & 0xffffffffffffff;
184 	q1[4] = ((x[ 7] >> 24)                );
185 
186 	barrett_reduce256_modm(out, q1, out);
187 }
188 
189 static void
expand_raw256_modm(bignum256modm out,const unsigned char in[32])190 expand_raw256_modm(bignum256modm out, const unsigned char in[32]) {
191 	bignum256modm_element_t x[4];
192 
193 	x[0] = U8TO64_LE(in +  0);
194 	x[1] = U8TO64_LE(in +  8);
195 	x[2] = U8TO64_LE(in + 16);
196 	x[3] = U8TO64_LE(in + 24);
197 
198 	out[0] = (                         x[0]) & 0xffffffffffffff;
199 	out[1] = ((x[ 0] >> 56) | (x[ 1] <<  8)) & 0xffffffffffffff;
200 	out[2] = ((x[ 1] >> 48) | (x[ 2] << 16)) & 0xffffffffffffff;
201 	out[3] = ((x[ 2] >> 40) | (x[ 3] << 24)) & 0xffffffffffffff;
202 	out[4] = ((x[ 3] >> 32)                ) & 0x000000ffffffff;
203 }
204 
205 static void
contract256_modm(unsigned char out[32],const bignum256modm in)206 contract256_modm(unsigned char out[32], const bignum256modm in) {
207 	U64TO8_LE(out +  0, (in[0]      ) | (in[1] << 56));
208 	U64TO8_LE(out +  8, (in[1] >>  8) | (in[2] << 48));
209 	U64TO8_LE(out + 16, (in[2] >> 16) | (in[3] << 40));
210 	U64TO8_LE(out + 24, (in[3] >> 24) | (in[4] << 32));
211 }
212 
213 static void
contract256_window4_modm(signed char r[64],const bignum256modm in)214 contract256_window4_modm(signed char r[64], const bignum256modm in) {
215 	char carry;
216 	signed char *quads = r;
217 	bignum256modm_element_t i, j, v, m;
218 
219 	for (i = 0; i < 5; i++) {
220 		v = in[i];
221 		m = (i == 4) ? 8 : 14;
222 		for (j = 0; j < m; j++) {
223 			*quads++ = (v & 15);
224 			v >>= 4;
225 		}
226 	}
227 
228 	/* making it signed */
229 	carry = 0;
230 	for(i = 0; i < 63; i++) {
231 		r[i] += carry;
232 		r[i+1] += (r[i] >> 4);
233 		r[i] &= 15;
234 		carry = (r[i] >> 3);
235 		r[i] -= (carry << 4);
236 	}
237 	r[63] += carry;
238 }
239 
240 static void
contract256_slidingwindow_modm(signed char r[256],const bignum256modm s,int windowsize)241 contract256_slidingwindow_modm(signed char r[256], const bignum256modm s, int windowsize) {
242 	int i,j,k,b;
243 	int m = (1 << (windowsize - 1)) - 1, soplen = 256;
244 	signed char *bits = r;
245 	bignum256modm_element_t v;
246 
247 	/* first put the binary expansion into r  */
248 	for (i = 0; i < 4; i++) {
249 		v = s[i];
250 		for (j = 0; j < 56; j++, v >>= 1)
251 			*bits++ = (v & 1);
252 	}
253 	v = s[4];
254 	for (j = 0; j < 32; j++, v >>= 1)
255 		*bits++ = (v & 1);
256 
257 	/* Making it sliding window */
258 	for (j = 0; j < soplen; j++) {
259 		if (!r[j])
260 			continue;
261 
262 		for (b = 1; (b < (soplen - j)) && (b <= 6); b++) {
263 			if ((r[j] + (r[j + b] << b)) <= m) {
264 				r[j] += r[j + b] << b;
265 				r[j + b] = 0;
266 			} else if ((r[j] - (r[j + b] << b)) >= -m) {
267 				r[j] -= r[j + b] << b;
268 				for (k = j + b; k < soplen; k++) {
269 					if (!r[k]) {
270 						r[k] = 1;
271 						break;
272 					}
273 					r[k] = 0;
274 				}
275 			} else if (r[j + b]) {
276 				break;
277 			}
278 		}
279 	}
280 }
281 
/*
	helpers for batch verification, are allowed to be vartime
*/
285 #if 0
286 /* out = a - b, a must be larger than b */
287 static void
288 sub256_modm_batch(bignum256modm out, const bignum256modm a, const bignum256modm b, size_t limbsize) {
289 	size_t i = 0;
290 	bignum256modm_element_t carry = 0;
291 	switch (limbsize) {
292 		case 4: out[i] = (a[i] - b[i])        ; carry = (out[i] >> 63); out[i] &= 0xffffffffffffff; i++;
293 		case 3: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 63); out[i] &= 0xffffffffffffff; i++;
294 		case 2: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 63); out[i] &= 0xffffffffffffff; i++;
295 		case 1: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 63); out[i] &= 0xffffffffffffff; i++;
296 		case 0:
297 		default: out[i] = (a[i] - b[i]) - carry;
298 	}
299 }
300 
301 
302 /* is a < b */
303 static int
304 lt256_modm_batch(const bignum256modm a, const bignum256modm b, size_t limbsize) {
305 	size_t i = 0;
306 	bignum256modm_element_t t, carry = 0;
307 	switch (limbsize) {
308 		case 4: t = (a[i] - b[i])        ; carry = (t >> 63); i++;
309 		case 3: t = (a[i] - b[i]) - carry; carry = (t >> 63); i++;
310 		case 2: t = (a[i] - b[i]) - carry; carry = (t >> 63); i++;
311 		case 1: t = (a[i] - b[i]) - carry; carry = (t >> 63); i++;
312 		case 0: t = (a[i] - b[i]) - carry; carry = (t >> 63);
313 	}
314 	return (int)carry;
315 }
316 
317 /* is a <= b */
318 static int
319 lte256_modm_batch(const bignum256modm a, const bignum256modm b, size_t limbsize) {
320 	size_t i = 0;
321 	bignum256modm_element_t t, carry = 0;
322 	switch (limbsize) {
323 		case 4: t = (b[i] - a[i])        ; carry = (t >> 63); i++;
324 		case 3: t = (b[i] - a[i]) - carry; carry = (t >> 63); i++;
325 		case 2: t = (b[i] - a[i]) - carry; carry = (t >> 63); i++;
326 		case 1: t = (b[i] - a[i]) - carry; carry = (t >> 63); i++;
327 		case 0: t = (b[i] - a[i]) - carry; carry = (t >> 63);
328 	}
329 	return (int)!carry;
330 }
331 
332 /* is a == 0 */
333 static int
334 iszero256_modm_batch(const bignum256modm a) {
335 	size_t i;
336 	for (i = 0; i < 5; i++)
337 		if (a[i])
338 			return 0;
339 	return 1;
340 }
341 
342 /* is a == 1 */
343 static int
344 isone256_modm_batch(const bignum256modm a) {
345 	size_t i;
346 	for (i = 0; i < 5; i++)
347 		if (a[i] != ((i) ? 0 : 1))
348 			return 0;
349 	return 1;
350 }
351 
352 /* can a fit in to (at most) 128 bits */
353 static int
354 isatmost128bits256_modm_batch(const bignum256modm a) {
355 	uint64_t mask =
356 		((a[4]                   )  | /*  32 */
357 		 (a[3]                   )  | /*  88 */
358 		 (a[2] & 0xffffffffff0000));
359 
360 	return (mask == 0);
361 }
362 #endif
363