1 /*
2 	Public domain by Andrew M. <liquidsun@gmail.com>
3 */
4 
5 
6 /*
7 	Arithmetic modulo the group order n = 2^252 +  27742317777372353535851937790883648493 = 7237005577332262213973186563042994240857116359379907606001950938285454250989
8 
9 	k = 32
10 	b = 1 << 8 = 256
11 	m = 2^252 + 27742317777372353535851937790883648493 = 0x1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed
12 	mu = floor( b^(k*2) / m ) = 0xfffffffffffffffffffffffffffffffeb2106215d086329a7ed9ce5a30a2c131b
13 */
14 
15 #define bignum256modm_bits_per_limb 30
16 #define bignum256modm_limb_size 9
17 
18 typedef uint32_t bignum256modm_element_t;
19 typedef bignum256modm_element_t bignum256modm[9];
20 
21 static const bignum256modm modm_m = {
22 	0x1cf5d3ed, 0x20498c69, 0x2f79cd65, 0x37be77a8,
23 	0x00000014,	0x00000000, 0x00000000,	0x00000000,
24 	0x00001000
25 };
26 
27 static const bignum256modm modm_mu = {
28 	0x0a2c131b, 0x3673968c, 0x06329a7e, 0x01885742,
29 	0x3fffeb21, 0x3fffffff, 0x3fffffff, 0x3fffffff,
30 	0x000fffff
31 };
32 
33 static bignum256modm_element_t
lt_modm(bignum256modm_element_t a,bignum256modm_element_t b)34 lt_modm(bignum256modm_element_t a, bignum256modm_element_t b) {
35 	return (a - b) >> 31;
36 }
37 
38 /* see HAC, Alg. 14.42 Step 4 */
39 static void
reduce256_modm(bignum256modm r)40 reduce256_modm(bignum256modm r) {
41 	bignum256modm t;
42 	bignum256modm_element_t b = 0, pb, mask;
43 
44 	/* t = r - m */
45 	pb = 0;
46 	pb += modm_m[0]; b = lt_modm(r[0], pb); t[0] = (r[0] - pb + (b << 30)); pb = b;
47 	pb += modm_m[1]; b = lt_modm(r[1], pb); t[1] = (r[1] - pb + (b << 30)); pb = b;
48 	pb += modm_m[2]; b = lt_modm(r[2], pb); t[2] = (r[2] - pb + (b << 30)); pb = b;
49 	pb += modm_m[3]; b = lt_modm(r[3], pb); t[3] = (r[3] - pb + (b << 30)); pb = b;
50 	pb += modm_m[4]; b = lt_modm(r[4], pb); t[4] = (r[4] - pb + (b << 30)); pb = b;
51 	pb += modm_m[5]; b = lt_modm(r[5], pb); t[5] = (r[5] - pb + (b << 30)); pb = b;
52 	pb += modm_m[6]; b = lt_modm(r[6], pb); t[6] = (r[6] - pb + (b << 30)); pb = b;
53 	pb += modm_m[7]; b = lt_modm(r[7], pb); t[7] = (r[7] - pb + (b << 30)); pb = b;
54 	pb += modm_m[8]; b = lt_modm(r[8], pb); t[8] = (r[8] - pb + (b << 16));
55 
56 	/* keep r if r was smaller than m */
57 	mask = b - 1;
58 	r[0] ^= mask & (r[0] ^ t[0]);
59 	r[1] ^= mask & (r[1] ^ t[1]);
60 	r[2] ^= mask & (r[2] ^ t[2]);
61 	r[3] ^= mask & (r[3] ^ t[3]);
62 	r[4] ^= mask & (r[4] ^ t[4]);
63 	r[5] ^= mask & (r[5] ^ t[5]);
64 	r[6] ^= mask & (r[6] ^ t[6]);
65 	r[7] ^= mask & (r[7] ^ t[7]);
66 	r[8] ^= mask & (r[8] ^ t[8]);
67 }
68 
69 /*
70 	Barrett reduction,  see HAC, Alg. 14.42
71 
72 	Instead of passing in x, pre-process in to q1 and r1 for efficiency
73 */
74 static void
barrett_reduce256_modm(bignum256modm r,const bignum256modm q1,const bignum256modm r1)75 barrett_reduce256_modm(bignum256modm r, const bignum256modm q1, const bignum256modm r1) {
76 	bignum256modm q3, r2;
77 	uint64_t c;
78 	bignum256modm_element_t f, b, pb;
79 
80 	/* q1 = x >> 248 = 264 bits = 9 30 bit elements
81 	   q2 = mu * q1
82 	   q3 = (q2 / 256(32+1)) = q2 / (2^8)^(32+1) = q2 >> 264 */
83 	c  = mul32x32_64(modm_mu[0], q1[7]) + mul32x32_64(modm_mu[1], q1[6]) + mul32x32_64(modm_mu[2], q1[5]) + mul32x32_64(modm_mu[3], q1[4]) + mul32x32_64(modm_mu[4], q1[3]) + mul32x32_64(modm_mu[5], q1[2]) + mul32x32_64(modm_mu[6], q1[1]) + mul32x32_64(modm_mu[7], q1[0]);
84 	c >>= 30;
85 	c += mul32x32_64(modm_mu[0], q1[8]) + mul32x32_64(modm_mu[1], q1[7]) + mul32x32_64(modm_mu[2], q1[6]) + mul32x32_64(modm_mu[3], q1[5]) + mul32x32_64(modm_mu[4], q1[4]) + mul32x32_64(modm_mu[5], q1[3]) + mul32x32_64(modm_mu[6], q1[2]) + mul32x32_64(modm_mu[7], q1[1]) + mul32x32_64(modm_mu[8], q1[0]);
86 	f = (bignum256modm_element_t)c; q3[0] = (f >> 24) & 0x3f; c >>= 30;
87 	c += mul32x32_64(modm_mu[1], q1[8]) + mul32x32_64(modm_mu[2], q1[7]) + mul32x32_64(modm_mu[3], q1[6]) + mul32x32_64(modm_mu[4], q1[5]) + mul32x32_64(modm_mu[5], q1[4]) + mul32x32_64(modm_mu[6], q1[3]) + mul32x32_64(modm_mu[7], q1[2]) + mul32x32_64(modm_mu[8], q1[1]);
88 	f = (bignum256modm_element_t)c; q3[0] |= (f << 6) & 0x3fffffff; q3[1] = (f >> 24) & 0x3f; c >>= 30;
89 	c += mul32x32_64(modm_mu[2], q1[8]) + mul32x32_64(modm_mu[3], q1[7]) + mul32x32_64(modm_mu[4], q1[6]) + mul32x32_64(modm_mu[5], q1[5]) + mul32x32_64(modm_mu[6], q1[4]) + mul32x32_64(modm_mu[7], q1[3]) + mul32x32_64(modm_mu[8], q1[2]);
90 	f = (bignum256modm_element_t)c; q3[1] |= (f << 6) & 0x3fffffff; q3[2] = (f >> 24) & 0x3f; c >>= 30;
91 	c += mul32x32_64(modm_mu[3], q1[8]) + mul32x32_64(modm_mu[4], q1[7]) + mul32x32_64(modm_mu[5], q1[6]) + mul32x32_64(modm_mu[6], q1[5]) + mul32x32_64(modm_mu[7], q1[4]) + mul32x32_64(modm_mu[8], q1[3]);
92 	f = (bignum256modm_element_t)c; q3[2] |= (f << 6) & 0x3fffffff; q3[3] = (f >> 24) & 0x3f; c >>= 30;
93 	c += mul32x32_64(modm_mu[4], q1[8]) + mul32x32_64(modm_mu[5], q1[7]) + mul32x32_64(modm_mu[6], q1[6]) + mul32x32_64(modm_mu[7], q1[5]) + mul32x32_64(modm_mu[8], q1[4]);
94 	f = (bignum256modm_element_t)c; q3[3] |= (f << 6) & 0x3fffffff; q3[4] = (f >> 24) & 0x3f; c >>= 30;
95 	c += mul32x32_64(modm_mu[5], q1[8]) + mul32x32_64(modm_mu[6], q1[7]) + mul32x32_64(modm_mu[7], q1[6]) + mul32x32_64(modm_mu[8], q1[5]);
96 	f = (bignum256modm_element_t)c; q3[4] |= (f << 6) & 0x3fffffff; q3[5] = (f >> 24) & 0x3f; c >>= 30;
97 	c += mul32x32_64(modm_mu[6], q1[8]) + mul32x32_64(modm_mu[7], q1[7]) + mul32x32_64(modm_mu[8], q1[6]);
98 	f = (bignum256modm_element_t)c; q3[5] |= (f << 6) & 0x3fffffff; q3[6] = (f >> 24) & 0x3f; c >>= 30;
99 	c += mul32x32_64(modm_mu[7], q1[8]) + mul32x32_64(modm_mu[8], q1[7]);
100 	f = (bignum256modm_element_t)c; q3[6] |= (f << 6) & 0x3fffffff; q3[7] = (f >> 24) & 0x3f; c >>= 30;
101 	c += mul32x32_64(modm_mu[8], q1[8]);
102 	f = (bignum256modm_element_t)c; q3[7] |= (f << 6) & 0x3fffffff; q3[8] = (bignum256modm_element_t)(c >> 24);
103 
104 	/* r1 = (x mod 256^(32+1)) = x mod (2^8)(31+1) = x & ((1 << 264) - 1)
105 	   r2 = (q3 * m) mod (256^(32+1)) = (q3 * m) & ((1 << 264) - 1) */
106 	c = mul32x32_64(modm_m[0], q3[0]);
107 	r2[0] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
108 	c += mul32x32_64(modm_m[0], q3[1]) + mul32x32_64(modm_m[1], q3[0]);
109 	r2[1] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
110 	c += mul32x32_64(modm_m[0], q3[2]) + mul32x32_64(modm_m[1], q3[1]) + mul32x32_64(modm_m[2], q3[0]);
111 	r2[2] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
112 	c += mul32x32_64(modm_m[0], q3[3]) + mul32x32_64(modm_m[1], q3[2]) + mul32x32_64(modm_m[2], q3[1]) + mul32x32_64(modm_m[3], q3[0]);
113 	r2[3] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
114 	c += mul32x32_64(modm_m[0], q3[4]) + mul32x32_64(modm_m[1], q3[3]) + mul32x32_64(modm_m[2], q3[2]) + mul32x32_64(modm_m[3], q3[1]) + mul32x32_64(modm_m[4], q3[0]);
115 	r2[4] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
116 	c += mul32x32_64(modm_m[0], q3[5]) + mul32x32_64(modm_m[1], q3[4]) + mul32x32_64(modm_m[2], q3[3]) + mul32x32_64(modm_m[3], q3[2]) + mul32x32_64(modm_m[4], q3[1]) + mul32x32_64(modm_m[5], q3[0]);
117 	r2[5] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
118 	c += mul32x32_64(modm_m[0], q3[6]) + mul32x32_64(modm_m[1], q3[5]) + mul32x32_64(modm_m[2], q3[4]) + mul32x32_64(modm_m[3], q3[3]) + mul32x32_64(modm_m[4], q3[2]) + mul32x32_64(modm_m[5], q3[1]) + mul32x32_64(modm_m[6], q3[0]);
119 	r2[6] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
120 	c += mul32x32_64(modm_m[0], q3[7]) + mul32x32_64(modm_m[1], q3[6]) + mul32x32_64(modm_m[2], q3[5]) + mul32x32_64(modm_m[3], q3[4]) + mul32x32_64(modm_m[4], q3[3]) + mul32x32_64(modm_m[5], q3[2]) + mul32x32_64(modm_m[6], q3[1]) + mul32x32_64(modm_m[7], q3[0]);
121 	r2[7] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
122 	c += mul32x32_64(modm_m[0], q3[8]) + mul32x32_64(modm_m[1], q3[7]) + mul32x32_64(modm_m[2], q3[6]) + mul32x32_64(modm_m[3], q3[5]) + mul32x32_64(modm_m[4], q3[4]) + mul32x32_64(modm_m[5], q3[3]) + mul32x32_64(modm_m[6], q3[2]) + mul32x32_64(modm_m[7], q3[1]) + mul32x32_64(modm_m[8], q3[0]);
123 	r2[8] = (bignum256modm_element_t)(c & 0xffffff);
124 
125 	/* r = r1 - r2
126 	   if (r < 0) r += (1 << 264) */
127 	pb = 0;
128 	pb += r2[0]; b = lt_modm(r1[0], pb); r[0] = (r1[0] - pb + (b << 30)); pb = b;
129 	pb += r2[1]; b = lt_modm(r1[1], pb); r[1] = (r1[1] - pb + (b << 30)); pb = b;
130 	pb += r2[2]; b = lt_modm(r1[2], pb); r[2] = (r1[2] - pb + (b << 30)); pb = b;
131 	pb += r2[3]; b = lt_modm(r1[3], pb); r[3] = (r1[3] - pb + (b << 30)); pb = b;
132 	pb += r2[4]; b = lt_modm(r1[4], pb); r[4] = (r1[4] - pb + (b << 30)); pb = b;
133 	pb += r2[5]; b = lt_modm(r1[5], pb); r[5] = (r1[5] - pb + (b << 30)); pb = b;
134 	pb += r2[6]; b = lt_modm(r1[6], pb); r[6] = (r1[6] - pb + (b << 30)); pb = b;
135 	pb += r2[7]; b = lt_modm(r1[7], pb); r[7] = (r1[7] - pb + (b << 30)); pb = b;
136 	pb += r2[8]; b = lt_modm(r1[8], pb); r[8] = (r1[8] - pb + (b << 24));
137 
138 	reduce256_modm(r);
139 	reduce256_modm(r);
140 }
141 
142 /* addition modulo m */
143 static void
add256_modm(bignum256modm r,const bignum256modm x,const bignum256modm y)144 add256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
145 	bignum256modm_element_t c;
146 
147 	c  = x[0] + y[0]; r[0] = c & 0x3fffffff; c >>= 30;
148 	c += x[1] + y[1]; r[1] = c & 0x3fffffff; c >>= 30;
149 	c += x[2] + y[2]; r[2] = c & 0x3fffffff; c >>= 30;
150 	c += x[3] + y[3]; r[3] = c & 0x3fffffff; c >>= 30;
151 	c += x[4] + y[4]; r[4] = c & 0x3fffffff; c >>= 30;
152 	c += x[5] + y[5]; r[5] = c & 0x3fffffff; c >>= 30;
153 	c += x[6] + y[6]; r[6] = c & 0x3fffffff; c >>= 30;
154 	c += x[7] + y[7]; r[7] = c & 0x3fffffff; c >>= 30;
155 	c += x[8] + y[8]; r[8] = c;
156 
157 	reduce256_modm(r);
158 }
159 
160 /* multiplication modulo m */
161 static void
mul256_modm(bignum256modm r,const bignum256modm x,const bignum256modm y)162 mul256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
163 	bignum256modm r1, q1;
164 	uint64_t c;
165 	bignum256modm_element_t f;
166 
167 	/* r1 = (x mod 256^(32+1)) = x mod (2^8)(31+1) = x & ((1 << 264) - 1)
168 	   q1 = x >> 248 = 264 bits = 9 30 bit elements */
169 	c = mul32x32_64(x[0], y[0]);
170 	f = (bignum256modm_element_t)c; r1[0] = (f & 0x3fffffff); c >>= 30;
171 	c += mul32x32_64(x[0], y[1]) + mul32x32_64(x[1], y[0]);
172 	f = (bignum256modm_element_t)c; r1[1] = (f & 0x3fffffff); c >>= 30;
173 	c += mul32x32_64(x[0], y[2]) + mul32x32_64(x[1], y[1]) + mul32x32_64(x[2], y[0]);
174 	f = (bignum256modm_element_t)c; r1[2] = (f & 0x3fffffff); c >>= 30;
175 	c += mul32x32_64(x[0], y[3]) + mul32x32_64(x[1], y[2]) + mul32x32_64(x[2], y[1]) + mul32x32_64(x[3], y[0]);
176 	f = (bignum256modm_element_t)c; r1[3] = (f & 0x3fffffff); c >>= 30;
177 	c += mul32x32_64(x[0], y[4]) + mul32x32_64(x[1], y[3]) + mul32x32_64(x[2], y[2]) + mul32x32_64(x[3], y[1]) + mul32x32_64(x[4], y[0]);
178 	f = (bignum256modm_element_t)c; r1[4] = (f & 0x3fffffff); c >>= 30;
179 	c += mul32x32_64(x[0], y[5]) + mul32x32_64(x[1], y[4]) + mul32x32_64(x[2], y[3]) + mul32x32_64(x[3], y[2]) + mul32x32_64(x[4], y[1]) + mul32x32_64(x[5], y[0]);
180 	f = (bignum256modm_element_t)c; r1[5] = (f & 0x3fffffff); c >>= 30;
181 	c += mul32x32_64(x[0], y[6]) + mul32x32_64(x[1], y[5]) + mul32x32_64(x[2], y[4]) + mul32x32_64(x[3], y[3]) + mul32x32_64(x[4], y[2]) + mul32x32_64(x[5], y[1]) + mul32x32_64(x[6], y[0]);
182 	f = (bignum256modm_element_t)c; r1[6] = (f & 0x3fffffff); c >>= 30;
183 	c += mul32x32_64(x[0], y[7]) + mul32x32_64(x[1], y[6]) + mul32x32_64(x[2], y[5]) + mul32x32_64(x[3], y[4]) + mul32x32_64(x[4], y[3]) + mul32x32_64(x[5], y[2]) + mul32x32_64(x[6], y[1]) + mul32x32_64(x[7], y[0]);
184 	f = (bignum256modm_element_t)c; r1[7] = (f & 0x3fffffff); c >>= 30;
185 	c += mul32x32_64(x[0], y[8]) + mul32x32_64(x[1], y[7]) + mul32x32_64(x[2], y[6]) + mul32x32_64(x[3], y[5]) + mul32x32_64(x[4], y[4]) + mul32x32_64(x[5], y[3]) + mul32x32_64(x[6], y[2]) + mul32x32_64(x[7], y[1]) + mul32x32_64(x[8], y[0]);
186 	f = (bignum256modm_element_t)c; r1[8] = (f & 0x00ffffff); q1[0] = (f >> 8) & 0x3fffff; c >>= 30;
187 	c += mul32x32_64(x[1], y[8]) + mul32x32_64(x[2], y[7]) + mul32x32_64(x[3], y[6]) + mul32x32_64(x[4], y[5]) + mul32x32_64(x[5], y[4]) + mul32x32_64(x[6], y[3]) + mul32x32_64(x[7], y[2]) + mul32x32_64(x[8], y[1]);
188 	f = (bignum256modm_element_t)c; q1[0] = (q1[0] | (f << 22)) & 0x3fffffff; q1[1] = (f >> 8) & 0x3fffff; c >>= 30;
189 	c += mul32x32_64(x[2], y[8]) + mul32x32_64(x[3], y[7]) + mul32x32_64(x[4], y[6]) + mul32x32_64(x[5], y[5]) + mul32x32_64(x[6], y[4]) + mul32x32_64(x[7], y[3]) + mul32x32_64(x[8], y[2]);
190 	f = (bignum256modm_element_t)c; q1[1] = (q1[1] | (f << 22)) & 0x3fffffff; q1[2] = (f >> 8) & 0x3fffff; c >>= 30;
191 	c += mul32x32_64(x[3], y[8]) + mul32x32_64(x[4], y[7]) + mul32x32_64(x[5], y[6]) + mul32x32_64(x[6], y[5]) + mul32x32_64(x[7], y[4]) + mul32x32_64(x[8], y[3]);
192 	f = (bignum256modm_element_t)c; q1[2] = (q1[2] | (f << 22)) & 0x3fffffff; q1[3] = (f >> 8) & 0x3fffff; c >>= 30;
193 	c += mul32x32_64(x[4], y[8]) + mul32x32_64(x[5], y[7]) + mul32x32_64(x[6], y[6]) + mul32x32_64(x[7], y[5]) + mul32x32_64(x[8], y[4]);
194 	f = (bignum256modm_element_t)c; q1[3] = (q1[3] | (f << 22)) & 0x3fffffff; q1[4] = (f >> 8) & 0x3fffff; c >>= 30;
195 	c += mul32x32_64(x[5], y[8]) + mul32x32_64(x[6], y[7]) + mul32x32_64(x[7], y[6]) + mul32x32_64(x[8], y[5]);
196 	f = (bignum256modm_element_t)c; q1[4] = (q1[4] | (f << 22)) & 0x3fffffff; q1[5] = (f >> 8) & 0x3fffff; c >>= 30;
197 	c += mul32x32_64(x[6], y[8]) + mul32x32_64(x[7], y[7]) + mul32x32_64(x[8], y[6]);
198 	f = (bignum256modm_element_t)c; q1[5] = (q1[5] | (f << 22)) & 0x3fffffff; q1[6] = (f >> 8) & 0x3fffff; c >>= 30;
199 	c += mul32x32_64(x[7], y[8]) + mul32x32_64(x[8], y[7]);
200 	f = (bignum256modm_element_t)c; q1[6] = (q1[6] | (f << 22)) & 0x3fffffff; q1[7] = (f >> 8) & 0x3fffff; c >>= 30;
201 	c += mul32x32_64(x[8], y[8]);
202 	f = (bignum256modm_element_t)c; q1[7] = (q1[7] | (f << 22)) & 0x3fffffff; q1[8] = (f >> 8) & 0x3fffff;
203 
204 	barrett_reduce256_modm(r, q1, r1);
205 }
206 
207 static void
expand256_modm(bignum256modm out,const unsigned char * in,size_t len)208 expand256_modm(bignum256modm out, const unsigned char *in, size_t len) {
209 	unsigned char work[64] = {0};
210 	bignum256modm_element_t x[16];
211 	bignum256modm q1;
212 
213 	memcpy(work, in, len);
214 	x[0] = U8TO32_LE(work +  0);
215 	x[1] = U8TO32_LE(work +  4);
216 	x[2] = U8TO32_LE(work +  8);
217 	x[3] = U8TO32_LE(work + 12);
218 	x[4] = U8TO32_LE(work + 16);
219 	x[5] = U8TO32_LE(work + 20);
220 	x[6] = U8TO32_LE(work + 24);
221 	x[7] = U8TO32_LE(work + 28);
222 	x[8] = U8TO32_LE(work + 32);
223 	x[9] = U8TO32_LE(work + 36);
224 	x[10] = U8TO32_LE(work + 40);
225 	x[11] = U8TO32_LE(work + 44);
226 	x[12] = U8TO32_LE(work + 48);
227 	x[13] = U8TO32_LE(work + 52);
228 	x[14] = U8TO32_LE(work + 56);
229 	x[15] = U8TO32_LE(work + 60);
230 
231 	/* r1 = (x mod 256^(32+1)) = x mod (2^8)(31+1) = x & ((1 << 264) - 1) */
232 	out[0] = (                         x[0]) & 0x3fffffff;
233 	out[1] = ((x[ 0] >> 30) | (x[ 1] <<  2)) & 0x3fffffff;
234 	out[2] = ((x[ 1] >> 28) | (x[ 2] <<  4)) & 0x3fffffff;
235 	out[3] = ((x[ 2] >> 26) | (x[ 3] <<  6)) & 0x3fffffff;
236 	out[4] = ((x[ 3] >> 24) | (x[ 4] <<  8)) & 0x3fffffff;
237 	out[5] = ((x[ 4] >> 22) | (x[ 5] << 10)) & 0x3fffffff;
238 	out[6] = ((x[ 5] >> 20) | (x[ 6] << 12)) & 0x3fffffff;
239 	out[7] = ((x[ 6] >> 18) | (x[ 7] << 14)) & 0x3fffffff;
240 	out[8] = ((x[ 7] >> 16) | (x[ 8] << 16)) & 0x00ffffff;
241 
242 	/* 8*31 = 248 bits, no need to reduce */
243 	if (len < 32)
244 		return;
245 
246 	/* q1 = x >> 248 = 264 bits = 9 30 bit elements */
247 	q1[0] = ((x[ 7] >> 24) | (x[ 8] <<  8)) & 0x3fffffff;
248 	q1[1] = ((x[ 8] >> 22) | (x[ 9] << 10)) & 0x3fffffff;
249 	q1[2] = ((x[ 9] >> 20) | (x[10] << 12)) & 0x3fffffff;
250 	q1[3] = ((x[10] >> 18) | (x[11] << 14)) & 0x3fffffff;
251 	q1[4] = ((x[11] >> 16) | (x[12] << 16)) & 0x3fffffff;
252 	q1[5] = ((x[12] >> 14) | (x[13] << 18)) & 0x3fffffff;
253 	q1[6] = ((x[13] >> 12) | (x[14] << 20)) & 0x3fffffff;
254 	q1[7] = ((x[14] >> 10) | (x[15] << 22)) & 0x3fffffff;
255 	q1[8] = ((x[15] >>  8)                );
256 
257 	barrett_reduce256_modm(out, q1, out);
258 }
259 
260 static void
expand_raw256_modm(bignum256modm out,const unsigned char in[32])261 expand_raw256_modm(bignum256modm out, const unsigned char in[32]) {
262 	bignum256modm_element_t x[8];
263 
264 	x[0] = U8TO32_LE(in +  0);
265 	x[1] = U8TO32_LE(in +  4);
266 	x[2] = U8TO32_LE(in +  8);
267 	x[3] = U8TO32_LE(in + 12);
268 	x[4] = U8TO32_LE(in + 16);
269 	x[5] = U8TO32_LE(in + 20);
270 	x[6] = U8TO32_LE(in + 24);
271 	x[7] = U8TO32_LE(in + 28);
272 
273 	out[0] = (                         x[0]) & 0x3fffffff;
274 	out[1] = ((x[ 0] >> 30) | (x[ 1] <<  2)) & 0x3fffffff;
275 	out[2] = ((x[ 1] >> 28) | (x[ 2] <<  4)) & 0x3fffffff;
276 	out[3] = ((x[ 2] >> 26) | (x[ 3] <<  6)) & 0x3fffffff;
277 	out[4] = ((x[ 3] >> 24) | (x[ 4] <<  8)) & 0x3fffffff;
278 	out[5] = ((x[ 4] >> 22) | (x[ 5] << 10)) & 0x3fffffff;
279 	out[6] = ((x[ 5] >> 20) | (x[ 6] << 12)) & 0x3fffffff;
280 	out[7] = ((x[ 6] >> 18) | (x[ 7] << 14)) & 0x3fffffff;
281 	out[8] = ((x[ 7] >> 16)                ) & 0x0000ffff;
282 }
283 
284 static void
contract256_modm(unsigned char out[32],const bignum256modm in)285 contract256_modm(unsigned char out[32], const bignum256modm in) {
286 	U32TO8_LE(out +  0, (in[0]      ) | (in[1] << 30));
287 	U32TO8_LE(out +  4, (in[1] >>  2) | (in[2] << 28));
288 	U32TO8_LE(out +  8, (in[2] >>  4) | (in[3] << 26));
289 	U32TO8_LE(out + 12, (in[3] >>  6) | (in[4] << 24));
290 	U32TO8_LE(out + 16, (in[4] >>  8) | (in[5] << 22));
291 	U32TO8_LE(out + 20, (in[5] >> 10) | (in[6] << 20));
292 	U32TO8_LE(out + 24, (in[6] >> 12) | (in[7] << 18));
293 	U32TO8_LE(out + 28, (in[7] >> 14) | (in[8] << 16));
294 }
295 
296 
297 
298 static void
contract256_window4_modm(signed char r[64],const bignum256modm in)299 contract256_window4_modm(signed char r[64], const bignum256modm in) {
300 	char carry;
301 	signed char *quads = r;
302 	bignum256modm_element_t i, j, v;
303 
304 	for (i = 0; i < 8; i += 2) {
305 		v = in[i];
306 		for (j = 0; j < 7; j++) {
307 			*quads++ = (v & 15);
308 			v >>= 4;
309 		}
310 		v |= (in[i+1] << 2);
311 		for (j = 0; j < 8; j++) {
312 			*quads++ = (v & 15);
313 			v >>= 4;
314 		}
315 	}
316 	v = in[8];
317 	*quads++ = (v & 15); v >>= 4;
318 	*quads++ = (v & 15); v >>= 4;
319 	*quads++ = (v & 15); v >>= 4;
320 	*quads++ = (v & 15); v >>= 4;
321 
322 	/* making it signed */
323 	carry = 0;
324 	for(i = 0; i < 63; i++) {
325 		r[i] += carry;
326 		r[i+1] += (r[i] >> 4);
327 		r[i] &= 15;
328 		carry = (r[i] >> 3);
329 		r[i] -= (carry << 4);
330 	}
331 	r[63] += carry;
332 }
333 
334 static void
contract256_slidingwindow_modm(signed char r[256],const bignum256modm s,int windowsize)335 contract256_slidingwindow_modm(signed char r[256], const bignum256modm s, int windowsize) {
336 	int i,j,k,b;
337 	int m = (1 << (windowsize - 1)) - 1, soplen = 256;
338 	signed char *bits = r;
339 	bignum256modm_element_t v;
340 
341 	/* first put the binary expansion into r  */
342 	for (i = 0; i < 8; i++) {
343 		v = s[i];
344 		for (j = 0; j < 30; j++, v >>= 1)
345 			*bits++ = (v & 1);
346 	}
347 	v = s[8];
348 	for (j = 0; j < 16; j++, v >>= 1)
349 		*bits++ = (v & 1);
350 
351 	/* Making it sliding window */
352 	for (j = 0; j < soplen; j++) {
353 		if (!r[j])
354 			continue;
355 
356 		for (b = 1; (b < (soplen - j)) && (b <= 6); b++) {
357 			if ((r[j] + (r[j + b] << b)) <= m) {
358 				r[j] += r[j + b] << b;
359 				r[j + b] = 0;
360 			} else if ((r[j] - (r[j + b] << b)) >= -m) {
361 				r[j] -= r[j + b] << b;
362 				for (k = j + b; k < soplen; k++) {
363 					if (!r[k]) {
364 						r[k] = 1;
365 						break;
366 					}
367 					r[k] = 0;
368 				}
369 			} else if (r[j + b]) {
370 				break;
371 			}
372 		}
373 	}
374 }
375 
376 
377 /*
378 	helpers for batch verifcation, are allowed to be vartime
379 */
380 
381 /* out = a - b, a must be larger than b */
382 static void
sub256_modm_batch(bignum256modm out,const bignum256modm a,const bignum256modm b,size_t limbsize)383 sub256_modm_batch(bignum256modm out, const bignum256modm a, const bignum256modm b, size_t limbsize) {
384 	size_t i = 0;
385 	bignum256modm_element_t carry = 0;
386 	switch (limbsize) {
387 		case 8: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++;
388 		case 7: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++;
389 		case 6: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++;
390 		case 5: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++;
391 		case 4: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++;
392 		case 3: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++;
393 		case 2: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++;
394 		case 1: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++;
395 		case 0:
396 		default: out[i] = (a[i] - b[i]) - carry;
397 	}
398 }
399 
400 
401 /* is a < b */
402 static int
lt256_modm_batch(const bignum256modm a,const bignum256modm b,size_t limbsize)403 lt256_modm_batch(const bignum256modm a, const bignum256modm b, size_t limbsize) {
404 	switch (limbsize) {
405 		case 8: if (a[8] > b[8]) return 0; if (a[8] < b[8]) return 1;
406 		case 7: if (a[7] > b[7]) return 0; if (a[7] < b[7]) return 1;
407 		case 6: if (a[6] > b[6]) return 0; if (a[6] < b[6]) return 1;
408 		case 5: if (a[5] > b[5]) return 0; if (a[5] < b[5]) return 1;
409 		case 4: if (a[4] > b[4]) return 0; if (a[4] < b[4]) return 1;
410 		case 3: if (a[3] > b[3]) return 0; if (a[3] < b[3]) return 1;
411 		case 2: if (a[2] > b[2]) return 0; if (a[2] < b[2]) return 1;
412 		case 1: if (a[1] > b[1]) return 0; if (a[1] < b[1]) return 1;
413 		case 0: if (a[0] > b[0]) return 0; if (a[0] < b[0]) return 1;
414 	}
415 	return 0;
416 }
417 
418 /* is a <= b */
419 static int
lte256_modm_batch(const bignum256modm a,const bignum256modm b,size_t limbsize)420 lte256_modm_batch(const bignum256modm a, const bignum256modm b, size_t limbsize) {
421 	switch (limbsize) {
422 		case 8: if (a[8] > b[8]) return 0; if (a[8] < b[8]) return 1;
423 		case 7: if (a[7] > b[7]) return 0; if (a[7] < b[7]) return 1;
424 		case 6: if (a[6] > b[6]) return 0; if (a[6] < b[6]) return 1;
425 		case 5: if (a[5] > b[5]) return 0; if (a[5] < b[5]) return 1;
426 		case 4: if (a[4] > b[4]) return 0; if (a[4] < b[4]) return 1;
427 		case 3: if (a[3] > b[3]) return 0; if (a[3] < b[3]) return 1;
428 		case 2: if (a[2] > b[2]) return 0; if (a[2] < b[2]) return 1;
429 		case 1: if (a[1] > b[1]) return 0; if (a[1] < b[1]) return 1;
430 		case 0: if (a[0] > b[0]) return 0; if (a[0] < b[0]) return 1;
431 	}
432 	return 1;
433 }
434 
435 
436 /* is a == 0 */
437 static int
iszero256_modm_batch(const bignum256modm a)438 iszero256_modm_batch(const bignum256modm a) {
439 	size_t i;
440 	for (i = 0; i < 9; i++)
441 		if (a[i])
442 			return 0;
443 	return 1;
444 }
445 
446 /* is a == 1 */
447 static int
isone256_modm_batch(const bignum256modm a)448 isone256_modm_batch(const bignum256modm a) {
449 	size_t i;
450 	if (a[0] != 1)
451 		return 0;
452 	for (i = 1; i < 9; i++)
453 		if (a[i])
454 			return 0;
455 	return 1;
456 }
457 
458 /* can a fit in to (at most) 128 bits */
459 static int
isatmost128bits256_modm_batch(const bignum256modm a)460 isatmost128bits256_modm_batch(const bignum256modm a) {
461 	uint32_t mask =
462 		((a[8]             )  | /*  16 */
463 		 (a[7]             )  | /*  46 */
464 		 (a[6]             )  | /*  76 */
465 		 (a[5]             )  | /* 106 */
466 		 (a[4] & 0x3fffff00));  /* 128 */
467 
468 	return (mask == 0);
469 }
470