xref: /linux/arch/x86/crypto/curve25519-x86_64.c (revision 9a6b55ac)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 /*
3  * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved.
4  * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
5  * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
6  */
7 
8 #include <crypto/curve25519.h>
9 #include <crypto/internal/kpp.h>
10 
11 #include <linux/types.h>
12 #include <linux/jump_label.h>
13 #include <linux/kernel.h>
14 #include <linux/module.h>
15 
16 #include <asm/cpufeature.h>
17 #include <asm/processor.h>
18 
/*
 * Two static branch keys, both defined in the "false" state and placed in
 * __ro_after_init storage.
 * NOTE(review): judging by the names they select the BMI2 and ADX code
 * paths; the code that enables and tests them is outside this chunk.
 */
19 static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2);
20 static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_adx);
21 
/*
 * NUM_WORDS_ELTFP25519: number of u64 words in one element (4).
 * eltfp25519_1w:        array of 4 u64 words, declared __aligned(32).
 * eltfp25519_1w_buffer: array of 2 * 4 = 8 u64 words, declared __aligned(32).
 * NOTE(review): the double-width buffer is presumably meant to hold an
 * unreduced product before reduction; confirm against its users.
 */
22 enum { NUM_WORDS_ELTFP25519 = 4 };
23 typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519];
24 typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519];
25 
/*
 * c = a * b, in two steps: mul_256x256_integer_adx() writes the product of
 * a and b into m.buffer, then red_eltfp25519_1w_adx() turns m.buffer into c.
 * The macro does not take the scratch space as an argument: a variable named
 * `m` with a `buffer` member must be in scope at the expansion site.
 * NOTE(review): red_* is presumably the modular reduction; it is not visible
 * in this chunk.
 */
26 #define mul_eltfp25519_1w_adx(c, a, b) do { \
27 	mul_256x256_integer_adx(m.buffer, a, b); \
28 	red_eltfp25519_1w_adx(c, m.buffer); \
29 } while (0)
30 
/*
 * Same shape as the _adx multiply macro but built on the _bmi2 helpers:
 * mul_256x256_integer_bmi2() writes the product of a and b into m.buffer,
 * then red_eltfp25519_1w_bmi2() turns m.buffer into c.
 * Requires a variable `m` with a `buffer` member in the caller's scope.
 */
31 #define mul_eltfp25519_1w_bmi2(c, a, b) do { \
32 	mul_256x256_integer_bmi2(m.buffer, a, b); \
33 	red_eltfp25519_1w_bmi2(c, m.buffer); \
34 } while (0)
35 
/*
 * In-place square: sqr_256x256_integer_adx() writes the square of a into
 * m.buffer, then red_eltfp25519_1w_adx() writes the result back over a.
 * Requires a variable `m` with a `buffer` member in the caller's scope.
 */
36 #define sqr_eltfp25519_1w_adx(a) do { \
37 	sqr_256x256_integer_adx(m.buffer, a); \
38 	red_eltfp25519_1w_adx(a, m.buffer); \
39 } while (0)
40 
/*
 * In-place square using the _bmi2 helpers: sqr_256x256_integer_bmi2() writes
 * the square of a into m.buffer, then red_eltfp25519_1w_bmi2() writes the
 * result back over a.
 * Requires a variable `m` with a `buffer` member in the caller's scope.
 */
41 #define sqr_eltfp25519_1w_bmi2(a) do { \
42 	sqr_256x256_integer_bmi2(m.buffer, a); \
43 	red_eltfp25519_1w_bmi2(a, m.buffer); \
44 } while (0)
45 
/*
 * Two-operand-pair ("2w") multiply: mul2_256x256_integer_adx() takes a and b
 * as two 256-bit integers each and writes two 512-bit products into m.buffer
 * (see the comment on that function); red_eltfp25519_2w_adx() then turns
 * m.buffer into c.
 * Requires a variable `m` with a `buffer` member in the caller's scope.
 */
46 #define mul_eltfp25519_2w_adx(c, a, b) do { \
47 	mul2_256x256_integer_adx(m.buffer, a, b); \
48 	red_eltfp25519_2w_adx(c, m.buffer); \
49 } while (0)
50 
/*
 * "2w" multiply built on the _bmi2 helpers: mul2_256x256_integer_bmi2()
 * writes into m.buffer from a and b, then red_eltfp25519_2w_bmi2() turns
 * m.buffer into c.
 * NOTE(review): presumably the same two-pairs-at-once layout as the _adx
 * variant; the _bmi2 helpers are not visible in this chunk.
 * Requires a variable `m` with a `buffer` member in the caller's scope.
 */
51 #define mul_eltfp25519_2w_bmi2(c, a, b) do { \
52 	mul2_256x256_integer_bmi2(m.buffer, a, b); \
53 	red_eltfp25519_2w_bmi2(c, m.buffer); \
54 } while (0)
55 
/*
 * In-place "2w" square: sqr2_256x256_integer_adx() writes into m.buffer from
 * a, then red_eltfp25519_2w_adx() writes the result back over a.
 * Requires a variable `m` with a `buffer` member in the caller's scope.
 */
56 #define sqr_eltfp25519_2w_adx(a) do { \
57 	sqr2_256x256_integer_adx(m.buffer, a); \
58 	red_eltfp25519_2w_adx(a, m.buffer); \
59 } while (0)
60 
/*
 * In-place "2w" square using the _bmi2 helpers: sqr2_256x256_integer_bmi2()
 * writes into m.buffer from a, then red_eltfp25519_2w_bmi2() writes the
 * result back over a.
 * Requires a variable `m` with a `buffer` member in the caller's scope.
 */
61 #define sqr_eltfp25519_2w_bmi2(a) do { \
62 	sqr2_256x256_integer_bmi2(m.buffer, a); \
63 	red_eltfp25519_2w_bmi2(a, m.buffer); \
64 } while (0)
65 
/*
 * Run sqr_eltfp25519_1w_adx() on a `times` times in a row; nothing happens
 * when times <= 0.  `times` is evaluated exactly once, `a` once per round.
 * The wrapped macro uses m.buffer, so `m` must be in the caller's scope.
 */
#define sqrn_eltfp25519_1w_adx(a, times) do { \
	int ____rounds_left; \
	for (____rounds_left = (times); ____rounds_left > 0; ____rounds_left--) \
		sqr_eltfp25519_1w_adx(a); \
} while (0)
71 
/*
 * Run sqr_eltfp25519_1w_bmi2() on a `times` times in a row; nothing happens
 * when times <= 0.  `times` is evaluated exactly once, `a` once per round.
 * The wrapped macro uses m.buffer, so `m` must be in the caller's scope.
 */
#define sqrn_eltfp25519_1w_bmi2(a, times) do { \
	int ____rounds_left; \
	for (____rounds_left = (times); ____rounds_left > 0; ____rounds_left--) \
		sqr_eltfp25519_1w_bmi2(a); \
} while (0)
77 
/*
 * Copy the four words A[0..3] into C[0..3], lowest index first.
 * Each argument is evaluated once per word, so neither may have side
 * effects.
 */
#define copy_eltfp25519_1w(C, A) do { \
	unsigned int ____word; \
	for (____word = 0; ____word < 4; ____word++) \
		(C)[____word] = (A)[____word]; \
} while (0)
84 
/*
 * Store 0 into the four words C[0..3], lowest index first.
 * C is evaluated once per word, so it must not have side effects.
 */
#define setzero_eltfp25519_1w(C) do { \
	unsigned int ____word; \
	for (____word = 0; ____word < 4; ____word++) \
		(C)[____word] = 0; \
} while (0)
91 
/*
 * table_ladder_8k: read-only table of 252 * NUM_WORDS_ELTFP25519 u64 words,
 * declared __aligned(32).  The initializer is laid out as 252 consecutive
 * entries of four words each; the numbered comments mark entries 1..252.
 * Total size is 252 * 4 * 8 = 8064 bytes.
 * NOTE(review): judging by the name these are precomputed constants for the
 * ladder computation; the code that reads the table is outside this chunk,
 * so the meaning of the individual values cannot be confirmed from here.
 */
92 __aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = {
93 	/*   1 */ 0xfffffffffffffff3UL, 0xffffffffffffffffUL,
94 		  0xffffffffffffffffUL, 0x5fffffffffffffffUL,
95 	/*   2 */ 0x6b8220f416aafe96UL, 0x82ebeb2b4f566a34UL,
96 		  0xd5a9a5b075a5950fUL, 0x5142b2cf4b2488f4UL,
97 	/*   3 */ 0x6aaebc750069680cUL, 0x89cf7820a0f99c41UL,
98 		  0x2a58d9183b56d0f4UL, 0x4b5aca80e36011a4UL,
99 	/*   4 */ 0x329132348c29745dUL, 0xf4a2e616e1642fd7UL,
100 		  0x1e45bb03ff67bc34UL, 0x306912d0f42a9b4aUL,
101 	/*   5 */ 0xff886507e6af7154UL, 0x04f50e13dfeec82fUL,
102 		  0xaa512fe82abab5ceUL, 0x174e251a68d5f222UL,
103 	/*   6 */ 0xcf96700d82028898UL, 0x1743e3370a2c02c5UL,
104 		  0x379eec98b4e86eaaUL, 0x0c59888a51e0482eUL,
105 	/*   7 */ 0xfbcbf1d699b5d189UL, 0xacaef0d58e9fdc84UL,
106 		  0xc1c20d06231f7614UL, 0x2938218da274f972UL,
107 	/*   8 */ 0xf6af49beff1d7f18UL, 0xcc541c22387ac9c2UL,
108 		  0x96fcc9ef4015c56bUL, 0x69c1627c690913a9UL,
109 	/*   9 */ 0x7a86fd2f4733db0eUL, 0xfdb8c4f29e087de9UL,
110 		  0x095e4b1a8ea2a229UL, 0x1ad7a7c829b37a79UL,
111 	/*  10 */ 0x342d89cad17ea0c0UL, 0x67bedda6cced2051UL,
112 		  0x19ca31bf2bb42f74UL, 0x3df7b4c84980acbbUL,
113 	/*  11 */ 0xa8c6444dc80ad883UL, 0xb91e440366e3ab85UL,
114 		  0xc215cda00164f6d8UL, 0x3d867c6ef247e668UL,
115 	/*  12 */ 0xc7dd582bcc3e658cUL, 0xfd2c4748ee0e5528UL,
116 		  0xa0fd9b95cc9f4f71UL, 0x7529d871b0675ddfUL,
117 	/*  13 */ 0xb8f568b42d3cbd78UL, 0x1233011b91f3da82UL,
118 		  0x2dce6ccd4a7c3b62UL, 0x75e7fc8e9e498603UL,
119 	/*  14 */ 0x2f4f13f1fcd0b6ecUL, 0xf1a8ca1f29ff7a45UL,
120 		  0xc249c1a72981e29bUL, 0x6ebe0dbb8c83b56aUL,
121 	/*  15 */ 0x7114fa8d170bb222UL, 0x65a2dcd5bf93935fUL,
122 		  0xbdc41f68b59c979aUL, 0x2f0eef79a2ce9289UL,
123 	/*  16 */ 0x42ecbf0c083c37ceUL, 0x2930bc09ec496322UL,
124 		  0xf294b0c19cfeac0dUL, 0x3780aa4bedfabb80UL,
125 	/*  17 */ 0x56c17d3e7cead929UL, 0xe7cb4beb2e5722c5UL,
126 		  0x0ce931732dbfe15aUL, 0x41b883c7621052f8UL,
127 	/*  18 */ 0xdbf75ca0c3d25350UL, 0x2936be086eb1e351UL,
128 		  0xc936e03cb4a9b212UL, 0x1d45bf82322225aaUL,
129 	/*  19 */ 0xe81ab1036a024cc5UL, 0xe212201c304c9a72UL,
130 		  0xc5d73fba6832b1fcUL, 0x20ffdb5a4d839581UL,
131 	/*  20 */ 0xa283d367be5d0fadUL, 0x6c2b25ca8b164475UL,
132 		  0x9d4935467caaf22eUL, 0x5166408eee85ff49UL,
133 	/*  21 */ 0x3c67baa2fab4e361UL, 0xb3e433c67ef35cefUL,
134 		  0x5259729241159b1cUL, 0x6a621892d5b0ab33UL,
135 	/*  22 */ 0x20b74a387555cdcbUL, 0x532aa10e1208923fUL,
136 		  0xeaa17b7762281dd1UL, 0x61ab3443f05c44bfUL,
137 	/*  23 */ 0x257a6c422324def8UL, 0x131c6c1017e3cf7fUL,
138 		  0x23758739f630a257UL, 0x295a407a01a78580UL,
139 	/*  24 */ 0xf8c443246d5da8d9UL, 0x19d775450c52fa5dUL,
140 		  0x2afcfc92731bf83dUL, 0x7d10c8e81b2b4700UL,
141 	/*  25 */ 0xc8e0271f70baa20bUL, 0x993748867ca63957UL,
142 		  0x5412efb3cb7ed4bbUL, 0x3196d36173e62975UL,
143 	/*  26 */ 0xde5bcad141c7dffcUL, 0x47cc8cd2b395c848UL,
144 		  0xa34cd942e11af3cbUL, 0x0256dbf2d04ecec2UL,
145 	/*  27 */ 0x875ab7e94b0e667fUL, 0xcad4dd83c0850d10UL,
146 		  0x47f12e8f4e72c79fUL, 0x5f1a87bb8c85b19bUL,
147 	/*  28 */ 0x7ae9d0b6437f51b8UL, 0x12c7ce5518879065UL,
148 		  0x2ade09fe5cf77aeeUL, 0x23a05a2f7d2c5627UL,
149 	/*  29 */ 0x5908e128f17c169aUL, 0xf77498dd8ad0852dUL,
150 		  0x74b4c4ceab102f64UL, 0x183abadd10139845UL,
151 	/*  30 */ 0xb165ba8daa92aaacUL, 0xd5c5ef9599386705UL,
152 		  0xbe2f8f0cf8fc40d1UL, 0x2701e635ee204514UL,
153 	/*  31 */ 0x629fa80020156514UL, 0xf223868764a8c1ceUL,
154 		  0x5b894fff0b3f060eUL, 0x60d9944cf708a3faUL,
155 	/*  32 */ 0xaeea001a1c7a201fUL, 0xebf16a633ee2ce63UL,
156 		  0x6f7709594c7a07e1UL, 0x79b958150d0208cbUL,
157 	/*  33 */ 0x24b55e5301d410e7UL, 0xe3a34edff3fdc84dUL,
158 		  0xd88768e4904032d8UL, 0x131384427b3aaeecUL,
159 	/*  34 */ 0x8405e51286234f14UL, 0x14dc4739adb4c529UL,
160 		  0xb8a2b5b250634ffdUL, 0x2fe2a94ad8a7ff93UL,
161 	/*  35 */ 0xec5c57efe843faddUL, 0x2843ce40f0bb9918UL,
162 		  0xa4b561d6cf3d6305UL, 0x743629bde8fb777eUL,
163 	/*  36 */ 0x343edd46bbaf738fUL, 0xed981828b101a651UL,
164 		  0xa401760b882c797aUL, 0x1fc223e28dc88730UL,
165 	/*  37 */ 0x48604e91fc0fba0eUL, 0xb637f78f052c6fa4UL,
166 		  0x91ccac3d09e9239cUL, 0x23f7eed4437a687cUL,
167 	/*  38 */ 0x5173b1118d9bd800UL, 0x29d641b63189d4a7UL,
168 		  0xfdbf177988bbc586UL, 0x2959894fcad81df5UL,
169 	/*  39 */ 0xaebc8ef3b4bbc899UL, 0x4148995ab26992b9UL,
170 		  0x24e20b0134f92cfbUL, 0x40d158894a05dee8UL,
171 	/*  40 */ 0x46b00b1185af76f6UL, 0x26bac77873187a79UL,
172 		  0x3dc0bf95ab8fff5fUL, 0x2a608bd8945524d7UL,
173 	/*  41 */ 0x26449588bd446302UL, 0x7c4bc21c0388439cUL,
174 		  0x8e98a4f383bd11b2UL, 0x26218d7bc9d876b9UL,
175 	/*  42 */ 0xe3081542997c178aUL, 0x3c2d29a86fb6606fUL,
176 		  0x5c217736fa279374UL, 0x7dde05734afeb1faUL,
177 	/*  43 */ 0x3bf10e3906d42babUL, 0xe4f7803e1980649cUL,
178 		  0xe6053bf89595bf7aUL, 0x394faf38da245530UL,
179 	/*  44 */ 0x7a8efb58896928f4UL, 0xfbc778e9cc6a113cUL,
180 		  0x72670ce330af596fUL, 0x48f222a81d3d6cf7UL,
181 	/*  45 */ 0xf01fce410d72caa7UL, 0x5a20ecc7213b5595UL,
182 		  0x7bc21165c1fa1483UL, 0x07f89ae31da8a741UL,
183 	/*  46 */ 0x05d2c2b4c6830ff9UL, 0xd43e330fc6316293UL,
184 		  0xa5a5590a96d3a904UL, 0x705edb91a65333b6UL,
185 	/*  47 */ 0x048ee15e0bb9a5f7UL, 0x3240cfca9e0aaf5dUL,
186 		  0x8f4b71ceedc4a40bUL, 0x621c0da3de544a6dUL,
187 	/*  48 */ 0x92872836a08c4091UL, 0xce8375b010c91445UL,
188 		  0x8a72eb524f276394UL, 0x2667fcfa7ec83635UL,
189 	/*  49 */ 0x7f4c173345e8752aUL, 0x061b47feee7079a5UL,
190 		  0x25dd9afa9f86ff34UL, 0x3780cef5425dc89cUL,
191 	/*  50 */ 0x1a46035a513bb4e9UL, 0x3e1ef379ac575adaUL,
192 		  0xc78c5f1c5fa24b50UL, 0x321a967634fd9f22UL,
193 	/*  51 */ 0x946707b8826e27faUL, 0x3dca84d64c506fd0UL,
194 		  0xc189218075e91436UL, 0x6d9284169b3b8484UL,
195 	/*  52 */ 0x3a67e840383f2ddfUL, 0x33eec9a30c4f9b75UL,
196 		  0x3ec7c86fa783ef47UL, 0x26ec449fbac9fbc4UL,
197 	/*  53 */ 0x5c0f38cba09b9e7dUL, 0x81168cc762a3478cUL,
198 		  0x3e23b0d306fc121cUL, 0x5a238aa0a5efdcddUL,
199 	/*  54 */ 0x1ba26121c4ea43ffUL, 0x36f8c77f7c8832b5UL,
200 		  0x88fbea0b0adcf99aUL, 0x5ca9938ec25bebf9UL,
201 	/*  55 */ 0xd5436a5e51fccda0UL, 0x1dbc4797c2cd893bUL,
202 		  0x19346a65d3224a08UL, 0x0f5034e49b9af466UL,
203 	/*  56 */ 0xf23c3967a1e0b96eUL, 0xe58b08fa867a4d88UL,
204 		  0xfb2fabc6a7341679UL, 0x2a75381eb6026946UL,
205 	/*  57 */ 0xc80a3be4c19420acUL, 0x66b1f6c681f2b6dcUL,
206 		  0x7cf7036761e93388UL, 0x25abbbd8a660a4c4UL,
207 	/*  58 */ 0x91ea12ba14fd5198UL, 0x684950fc4a3cffa9UL,
208 		  0xf826842130f5ad28UL, 0x3ea988f75301a441UL,
209 	/*  59 */ 0xc978109a695f8c6fUL, 0x1746eb4a0530c3f3UL,
210 		  0x444d6d77b4459995UL, 0x75952b8c054e5cc7UL,
211 	/*  60 */ 0xa3703f7915f4d6aaUL, 0x66c346202f2647d8UL,
212 		  0xd01469df811d644bUL, 0x77fea47d81a5d71fUL,
213 	/*  61 */ 0xc5e9529ef57ca381UL, 0x6eeeb4b9ce2f881aUL,
214 		  0xb6e91a28e8009bd6UL, 0x4b80be3e9afc3fecUL,
215 	/*  62 */ 0x7e3773c526aed2c5UL, 0x1b4afcb453c9a49dUL,
216 		  0xa920bdd7baffb24dUL, 0x7c54699f122d400eUL,
217 	/*  63 */ 0xef46c8e14fa94bc8UL, 0xe0b074ce2952ed5eUL,
218 		  0xbea450e1dbd885d5UL, 0x61b68649320f712cUL,
219 	/*  64 */ 0x8a485f7309ccbdd1UL, 0xbd06320d7d4d1a2dUL,
220 		  0x25232973322dbef4UL, 0x445dc4758c17f770UL,
221 	/*  65 */ 0xdb0434177cc8933cUL, 0xed6fe82175ea059fUL,
222 		  0x1efebefdc053db34UL, 0x4adbe867c65daf99UL,
223 	/*  66 */ 0x3acd71a2a90609dfUL, 0xe5e991856dd04050UL,
224 		  0x1ec69b688157c23cUL, 0x697427f6885cfe4dUL,
225 	/*  67 */ 0xd7be7b9b65e1a851UL, 0xa03d28d522c536ddUL,
226 		  0x28399d658fd2b645UL, 0x49e5b7e17c2641e1UL,
227 	/*  68 */ 0x6f8c3a98700457a4UL, 0x5078f0a25ebb6778UL,
228 		  0xd13c3ccbc382960fUL, 0x2e003258a7df84b1UL,
229 	/*  69 */ 0x8ad1f39be6296a1cUL, 0xc1eeaa652a5fbfb2UL,
230 		  0x33ee0673fd26f3cbUL, 0x59256173a69d2cccUL,
231 	/*  70 */ 0x41ea07aa4e18fc41UL, 0xd9fc19527c87a51eUL,
232 		  0xbdaacb805831ca6fUL, 0x445b652dc916694fUL,
233 	/*  71 */ 0xce92a3a7f2172315UL, 0x1edc282de11b9964UL,
234 		  0xa1823aafe04c314aUL, 0x790a2d94437cf586UL,
235 	/*  72 */ 0x71c447fb93f6e009UL, 0x8922a56722845276UL,
236 		  0xbf70903b204f5169UL, 0x2f7a89891ba319feUL,
237 	/*  73 */ 0x02a08eb577e2140cUL, 0xed9a4ed4427bdcf4UL,
238 		  0x5253ec44e4323cd1UL, 0x3e88363c14e9355bUL,
239 	/*  74 */ 0xaa66c14277110b8cUL, 0x1ae0391610a23390UL,
240 		  0x2030bd12c93fc2a2UL, 0x3ee141579555c7abUL,
241 	/*  75 */ 0x9214de3a6d6e7d41UL, 0x3ccdd88607f17efeUL,
242 		  0x674f1288f8e11217UL, 0x5682250f329f93d0UL,
243 	/*  76 */ 0x6cf00b136d2e396eUL, 0x6e4cf86f1014debfUL,
244 		  0x5930b1b5bfcc4e83UL, 0x047069b48aba16b6UL,
245 	/*  77 */ 0x0d4ce4ab69b20793UL, 0xb24db91a97d0fb9eUL,
246 		  0xcdfa50f54e00d01dUL, 0x221b1085368bddb5UL,
247 	/*  78 */ 0xe7e59468b1e3d8d2UL, 0x53c56563bd122f93UL,
248 		  0xeee8a903e0663f09UL, 0x61efa662cbbe3d42UL,
249 	/*  79 */ 0x2cf8ddddde6eab2aUL, 0x9bf80ad51435f231UL,
250 		  0x5deadacec9f04973UL, 0x29275b5d41d29b27UL,
251 	/*  80 */ 0xcfde0f0895ebf14fUL, 0xb9aab96b054905a7UL,
252 		  0xcae80dd9a1c420fdUL, 0x0a63bf2f1673bbc7UL,
253 	/*  81 */ 0x092f6e11958fbc8cUL, 0x672a81e804822fadUL,
254 		  0xcac8351560d52517UL, 0x6f3f7722c8f192f8UL,
255 	/*  82 */ 0xf8ba90ccc2e894b7UL, 0x2c7557a438ff9f0dUL,
256 		  0x894d1d855ae52359UL, 0x68e122157b743d69UL,
257 	/*  83 */ 0xd87e5570cfb919f3UL, 0x3f2cdecd95798db9UL,
258 		  0x2121154710c0a2ceUL, 0x3c66a115246dc5b2UL,
259 	/*  84 */ 0xcbedc562294ecb72UL, 0xba7143c36a280b16UL,
260 		  0x9610c2efd4078b67UL, 0x6144735d946a4b1eUL,
261 	/*  85 */ 0x536f111ed75b3350UL, 0x0211db8c2041d81bUL,
262 		  0xf93cb1000e10413cUL, 0x149dfd3c039e8876UL,
263 	/*  86 */ 0xd479dde46b63155bUL, 0xb66e15e93c837976UL,
264 		  0xdafde43b1f13e038UL, 0x5fafda1a2e4b0b35UL,
265 	/*  87 */ 0x3600bbdf17197581UL, 0x3972050bbe3cd2c2UL,
266 		  0x5938906dbdd5be86UL, 0x34fce5e43f9b860fUL,
267 	/*  88 */ 0x75a8a4cd42d14d02UL, 0x828dabc53441df65UL,
268 		  0x33dcabedd2e131d3UL, 0x3ebad76fb814d25fUL,
269 	/*  89 */ 0xd4906f566f70e10fUL, 0x5d12f7aa51690f5aUL,
270 		  0x45adb16e76cefcf2UL, 0x01f768aead232999UL,
271 	/*  90 */ 0x2b6cc77b6248febdUL, 0x3cd30628ec3aaffdUL,
272 		  0xce1c0b80d4ef486aUL, 0x4c3bff2ea6f66c23UL,
273 	/*  91 */ 0x3f2ec4094aeaeb5fUL, 0x61b19b286e372ca7UL,
274 		  0x5eefa966de2a701dUL, 0x23b20565de55e3efUL,
275 	/*  92 */ 0xe301ca5279d58557UL, 0x07b2d4ce27c2874fUL,
276 		  0xa532cd8a9dcf1d67UL, 0x2a52fee23f2bff56UL,
277 	/*  93 */ 0x8624efb37cd8663dUL, 0xbbc7ac20ffbd7594UL,
278 		  0x57b85e9c82d37445UL, 0x7b3052cb86a6ec66UL,
279 	/*  94 */ 0x3482f0ad2525e91eUL, 0x2cb68043d28edca0UL,
280 		  0xaf4f6d052e1b003aUL, 0x185f8c2529781b0aUL,
281 	/*  95 */ 0xaa41de5bd80ce0d6UL, 0x9407b2416853e9d6UL,
282 		  0x563ec36e357f4c3aUL, 0x4cc4b8dd0e297bceUL,
283 	/*  96 */ 0xa2fc1a52ffb8730eUL, 0x1811f16e67058e37UL,
284 		  0x10f9a366cddf4ee1UL, 0x72f4a0c4a0b9f099UL,
285 	/*  97 */ 0x8c16c06f663f4ea7UL, 0x693b3af74e970fbaUL,
286 		  0x2102e7f1d69ec345UL, 0x0ba53cbc968a8089UL,
287 	/*  98 */ 0xca3d9dc7fea15537UL, 0x4c6824bb51536493UL,
288 		  0xb9886314844006b1UL, 0x40d2a72ab454cc60UL,
289 	/*  99 */ 0x5936a1b712570975UL, 0x91b9d648debda657UL,
290 		  0x3344094bb64330eaUL, 0x006ba10d12ee51d0UL,
291 	/* 100 */ 0x19228468f5de5d58UL, 0x0eb12f4c38cc05b0UL,
292 		  0xa1039f9dd5601990UL, 0x4502d4ce4fff0e0bUL,
293 	/* 101 */ 0xeb2054106837c189UL, 0xd0f6544c6dd3b93cUL,
294 		  0x40727064c416d74fUL, 0x6e15c6114b502ef0UL,
295 	/* 102 */ 0x4df2a398cfb1a76bUL, 0x11256c7419f2f6b1UL,
296 		  0x4a497962066e6043UL, 0x705b3aab41355b44UL,
297 	/* 103 */ 0x365ef536d797b1d8UL, 0x00076bd622ddf0dbUL,
298 		  0x3bbf33b0e0575a88UL, 0x3777aa05c8e4ca4dUL,
299 	/* 104 */ 0x392745c85578db5fUL, 0x6fda4149dbae5ae2UL,
300 		  0xb1f0b00b8adc9867UL, 0x09963437d36f1da3UL,
301 	/* 105 */ 0x7e824e90a5dc3853UL, 0xccb5f6641f135cbdUL,
302 		  0x6736d86c87ce8fccUL, 0x625f3ce26604249fUL,
303 	/* 106 */ 0xaf8ac8059502f63fUL, 0x0c05e70a2e351469UL,
304 		  0x35292e9c764b6305UL, 0x1a394360c7e23ac3UL,
305 	/* 107 */ 0xd5c6d53251183264UL, 0x62065abd43c2b74fUL,
306 		  0xb5fbf5d03b973f9bUL, 0x13a3da3661206e5eUL,
307 	/* 108 */ 0xc6bd5837725d94e5UL, 0x18e30912205016c5UL,
308 		  0x2088ce1570033c68UL, 0x7fba1f495c837987UL,
309 	/* 109 */ 0x5a8c7423f2f9079dUL, 0x1735157b34023fc5UL,
310 		  0xe4f9b49ad2fab351UL, 0x6691ff72c878e33cUL,
311 	/* 110 */ 0x122c2adedc5eff3eUL, 0xf8dd4bf1d8956cf4UL,
312 		  0xeb86205d9e9e5bdaUL, 0x049b92b9d975c743UL,
313 	/* 111 */ 0xa5379730b0f6c05aUL, 0x72a0ffacc6f3a553UL,
314 		  0xb0032c34b20dcd6dUL, 0x470e9dbc88d5164aUL,
315 	/* 112 */ 0xb19cf10ca237c047UL, 0xb65466711f6c81a2UL,
316 		  0xb3321bd16dd80b43UL, 0x48c14f600c5fbe8eUL,
317 	/* 113 */ 0x66451c264aa6c803UL, 0xb66e3904a4fa7da6UL,
318 		  0xd45f19b0b3128395UL, 0x31602627c3c9bc10UL,
319 	/* 114 */ 0x3120dc4832e4e10dUL, 0xeb20c46756c717f7UL,
320 		  0x00f52e3f67280294UL, 0x566d4fc14730c509UL,
321 	/* 115 */ 0x7e3a5d40fd837206UL, 0xc1e926dc7159547aUL,
322 		  0x216730fba68d6095UL, 0x22e8c3843f69cea7UL,
323 	/* 116 */ 0x33d074e8930e4b2bUL, 0xb6e4350e84d15816UL,
324 		  0x5534c26ad6ba2365UL, 0x7773c12f89f1f3f3UL,
325 	/* 117 */ 0x8cba404da57962aaUL, 0x5b9897a81999ce56UL,
326 		  0x508e862f121692fcUL, 0x3a81907fa093c291UL,
327 	/* 118 */ 0x0dded0ff4725a510UL, 0x10d8cc10673fc503UL,
328 		  0x5b9d151c9f1f4e89UL, 0x32a5c1d5cb09a44cUL,
329 	/* 119 */ 0x1e0aa442b90541fbUL, 0x5f85eb7cc1b485dbUL,
330 		  0xbee595ce8a9df2e5UL, 0x25e496c722422236UL,
331 	/* 120 */ 0x5edf3c46cd0fe5b9UL, 0x34e75a7ed2a43388UL,
332 		  0xe488de11d761e352UL, 0x0e878a01a085545cUL,
333 	/* 121 */ 0xba493c77e021bb04UL, 0x2b4d1843c7df899aUL,
334 		  0x9ea37a487ae80d67UL, 0x67a9958011e41794UL,
335 	/* 122 */ 0x4b58051a6697b065UL, 0x47e33f7d8d6ba6d4UL,
336 		  0xbb4da8d483ca46c1UL, 0x68becaa181c2db0dUL,
337 	/* 123 */ 0x8d8980e90b989aa5UL, 0xf95eb14a2c93c99bUL,
338 		  0x51c6c7c4796e73a2UL, 0x6e228363b5efb569UL,
339 	/* 124 */ 0xc6bbc0b02dd624c8UL, 0x777eb47dec8170eeUL,
340 		  0x3cde15a004cfafa9UL, 0x1dc6bc087160bf9bUL,
341 	/* 125 */ 0x2e07e043eec34002UL, 0x18e9fc677a68dc7fUL,
342 		  0xd8da03188bd15b9aUL, 0x48fbc3bb00568253UL,
343 	/* 126 */ 0x57547d4cfb654ce1UL, 0xd3565b82a058e2adUL,
344 		  0xf63eaf0bbf154478UL, 0x47531ef114dfbb18UL,
345 	/* 127 */ 0xe1ec630a4278c587UL, 0x5507d546ca8e83f3UL,
346 		  0x85e135c63adc0c2bUL, 0x0aa7efa85682844eUL,
347 	/* 128 */ 0x72691ba8b3e1f615UL, 0x32b4e9701fbe3ffaUL,
348 		  0x97b6d92e39bb7868UL, 0x2cfe53dea02e39e8UL,
349 	/* 129 */ 0x687392cd85cd52b0UL, 0x27ff66c910e29831UL,
350 		  0x97134556a9832d06UL, 0x269bb0360a84f8a0UL,
351 	/* 130 */ 0x706e55457643f85cUL, 0x3734a48c9b597d1bUL,
352 		  0x7aee91e8c6efa472UL, 0x5cd6abc198a9d9e0UL,
353 	/* 131 */ 0x0e04de06cb3ce41aUL, 0xd8c6eb893402e138UL,
354 		  0x904659bb686e3772UL, 0x7215c371746ba8c8UL,
355 	/* 132 */ 0xfd12a97eeae4a2d9UL, 0x9514b7516394f2c5UL,
356 		  0x266fd5809208f294UL, 0x5c847085619a26b9UL,
357 	/* 133 */ 0x52985410fed694eaUL, 0x3c905b934a2ed254UL,
358 		  0x10bb47692d3be467UL, 0x063b3d2d69e5e9e1UL,
359 	/* 134 */ 0x472726eedda57debUL, 0xefb6c4ae10f41891UL,
360 		  0x2b1641917b307614UL, 0x117c554fc4f45b7cUL,
361 	/* 135 */ 0xc07cf3118f9d8812UL, 0x01dbd82050017939UL,
362 		  0xd7e803f4171b2827UL, 0x1015e87487d225eaUL,
363 	/* 136 */ 0xc58de3fed23acc4dUL, 0x50db91c294a7be2dUL,
364 		  0x0b94d43d1c9cf457UL, 0x6b1640fa6e37524aUL,
365 	/* 137 */ 0x692f346c5fda0d09UL, 0x200b1c59fa4d3151UL,
366 		  0xb8c46f760777a296UL, 0x4b38395f3ffdfbcfUL,
367 	/* 138 */ 0x18d25e00be54d671UL, 0x60d50582bec8aba6UL,
368 		  0x87ad8f263b78b982UL, 0x50fdf64e9cda0432UL,
369 	/* 139 */ 0x90f567aac578dcf0UL, 0xef1e9b0ef2a3133bUL,
370 		  0x0eebba9242d9de71UL, 0x15473c9bf03101c7UL,
371 	/* 140 */ 0x7c77e8ae56b78095UL, 0xb678e7666e6f078eUL,
372 		  0x2da0b9615348ba1fUL, 0x7cf931c1ff733f0bUL,
373 	/* 141 */ 0x26b357f50a0a366cUL, 0xe9708cf42b87d732UL,
374 		  0xc13aeea5f91cb2c0UL, 0x35d90c991143bb4cUL,
375 	/* 142 */ 0x47c1c404a9a0d9dcUL, 0x659e58451972d251UL,
376 		  0x3875a8c473b38c31UL, 0x1fbd9ed379561f24UL,
377 	/* 143 */ 0x11fabc6fd41ec28dUL, 0x7ef8dfe3cd2a2dcaUL,
378 		  0x72e73b5d8c404595UL, 0x6135fa4954b72f27UL,
379 	/* 144 */ 0xccfc32a2de24b69cUL, 0x3f55698c1f095d88UL,
380 		  0xbe3350ed5ac3f929UL, 0x5e9bf806ca477eebUL,
381 	/* 145 */ 0xe9ce8fb63c309f68UL, 0x5376f63565e1f9f4UL,
382 		  0xd1afcfb35a6393f1UL, 0x6632a1ede5623506UL,
383 	/* 146 */ 0x0b7d6c390c2ded4cUL, 0x56cb3281df04cb1fUL,
384 		  0x66305a1249ecc3c7UL, 0x5d588b60a38ca72aUL,
385 	/* 147 */ 0xa6ecbf78e8e5f42dUL, 0x86eeb44b3c8a3eecUL,
386 		  0xec219c48fbd21604UL, 0x1aaf1af517c36731UL,
387 	/* 148 */ 0xc306a2836769bde7UL, 0x208280622b1e2adbUL,
388 		  0x8027f51ffbff94a6UL, 0x76cfa1ce1124f26bUL,
389 	/* 149 */ 0x18eb00562422abb6UL, 0xf377c4d58f8c29c3UL,
390 		  0x4dbbc207f531561aUL, 0x0253b7f082128a27UL,
391 	/* 150 */ 0x3d1f091cb62c17e0UL, 0x4860e1abd64628a9UL,
392 		  0x52d17436309d4253UL, 0x356f97e13efae576UL,
393 	/* 151 */ 0xd351e11aa150535bUL, 0x3e6b45bb1dd878ccUL,
394 		  0x0c776128bed92c98UL, 0x1d34ae93032885b8UL,
395 	/* 152 */ 0x4ba0488ca85ba4c3UL, 0x985348c33c9ce6ceUL,
396 		  0x66124c6f97bda770UL, 0x0f81a0290654124aUL,
397 	/* 153 */ 0x9ed09ca6569b86fdUL, 0x811009fd18af9a2dUL,
398 		  0xff08d03f93d8c20aUL, 0x52a148199faef26bUL,
399 	/* 154 */ 0x3e03f9dc2d8d1b73UL, 0x4205801873961a70UL,
400 		  0xc0d987f041a35970UL, 0x07aa1f15a1c0d549UL,
401 	/* 155 */ 0xdfd46ce08cd27224UL, 0x6d0a024f934e4239UL,
402 		  0x808a7a6399897b59UL, 0x0a4556e9e13d95a2UL,
403 	/* 156 */ 0xd21a991fe9c13045UL, 0x9b0e8548fe7751b8UL,
404 		  0x5da643cb4bf30035UL, 0x77db28d63940f721UL,
405 	/* 157 */ 0xfc5eeb614adc9011UL, 0x5229419ae8c411ebUL,
406 		  0x9ec3e7787d1dcf74UL, 0x340d053e216e4cb5UL,
407 	/* 158 */ 0xcac7af39b48df2b4UL, 0xc0faec2871a10a94UL,
408 		  0x140a69245ca575edUL, 0x0cf1c37134273a4cUL,
409 	/* 159 */ 0xc8ee306ac224b8a5UL, 0x57eaee7ccb4930b0UL,
410 		  0xa1e806bdaacbe74fUL, 0x7d9a62742eeb657dUL,
411 	/* 160 */ 0x9eb6b6ef546c4830UL, 0x885cca1fddb36e2eUL,
412 		  0xe6b9f383ef0d7105UL, 0x58654fef9d2e0412UL,
413 	/* 161 */ 0xa905c4ffbe0e8e26UL, 0x942de5df9b31816eUL,
414 		  0x497d723f802e88e1UL, 0x30684dea602f408dUL,
415 	/* 162 */ 0x21e5a278a3e6cb34UL, 0xaefb6e6f5b151dc4UL,
416 		  0xb30b8e049d77ca15UL, 0x28c3c9cf53b98981UL,
417 	/* 163 */ 0x287fb721556cdd2aUL, 0x0d317ca897022274UL,
418 		  0x7468c7423a543258UL, 0x4a7f11464eb5642fUL,
419 	/* 164 */ 0xa237a4774d193aa6UL, 0xd865986ea92129a1UL,
420 		  0x24c515ecf87c1a88UL, 0x604003575f39f5ebUL,
421 	/* 165 */ 0x47b9f189570a9b27UL, 0x2b98cede465e4b78UL,
422 		  0x026df551dbb85c20UL, 0x74fcd91047e21901UL,
423 	/* 166 */ 0x13e2a90a23c1bfa3UL, 0x0cb0074e478519f6UL,
424 		  0x5ff1cbbe3af6cf44UL, 0x67fe5438be812dbeUL,
425 	/* 167 */ 0xd13cf64fa40f05b0UL, 0x054dfb2f32283787UL,
426 		  0x4173915b7f0d2aeaUL, 0x482f144f1f610d4eUL,
427 	/* 168 */ 0xf6210201b47f8234UL, 0x5d0ae1929e70b990UL,
428 		  0xdcd7f455b049567cUL, 0x7e93d0f1f0916f01UL,
429 	/* 169 */ 0xdd79cbf18a7db4faUL, 0xbe8391bf6f74c62fUL,
430 		  0x027145d14b8291bdUL, 0x585a73ea2cbf1705UL,
431 	/* 170 */ 0x485ca03e928a0db2UL, 0x10fc01a5742857e7UL,
432 		  0x2f482edbd6d551a7UL, 0x0f0433b5048fdb8aUL,
433 	/* 171 */ 0x60da2e8dd7dc6247UL, 0x88b4c9d38cd4819aUL,
434 		  0x13033ac001f66697UL, 0x273b24fe3b367d75UL,
435 	/* 172 */ 0xc6e8f66a31b3b9d4UL, 0x281514a494df49d5UL,
436 		  0xd1726fdfc8b23da7UL, 0x4b3ae7d103dee548UL,
437 	/* 173 */ 0xc6256e19ce4b9d7eUL, 0xff5c5cf186e3c61cUL,
438 		  0xacc63ca34b8ec145UL, 0x74621888fee66574UL,
439 	/* 174 */ 0x956f409645290a1eUL, 0xef0bf8e3263a962eUL,
440 		  0xed6a50eb5ec2647bUL, 0x0694283a9dca7502UL,
441 	/* 175 */ 0x769b963643a2dcd1UL, 0x42b7c8ea09fc5353UL,
442 		  0x4f002aee13397eabUL, 0x63005e2c19b7d63aUL,
443 	/* 176 */ 0xca6736da63023beaUL, 0x966c7f6db12a99b7UL,
444 		  0xace09390c537c5e1UL, 0x0b696063a1aa89eeUL,
445 	/* 177 */ 0xebb03e97288c56e5UL, 0x432a9f9f938c8be8UL,
446 		  0xa6a5a93d5b717f71UL, 0x1a5fb4c3e18f9d97UL,
447 	/* 178 */ 0x1c94e7ad1c60cdceUL, 0xee202a43fc02c4a0UL,
448 		  0x8dafe4d867c46a20UL, 0x0a10263c8ac27b58UL,
449 	/* 179 */ 0xd0dea9dfe4432a4aUL, 0x856af87bbe9277c5UL,
450 		  0xce8472acc212c71aUL, 0x6f151b6d9bbb1e91UL,
451 	/* 180 */ 0x26776c527ceed56aUL, 0x7d211cb7fbf8faecUL,
452 		  0x37ae66a6fd4609ccUL, 0x1f81b702d2770c42UL,
453 	/* 181 */ 0x2fb0b057eac58392UL, 0xe1dd89fe29744e9dUL,
454 		  0xc964f8eb17beb4f8UL, 0x29571073c9a2d41eUL,
455 	/* 182 */ 0xa948a18981c0e254UL, 0x2df6369b65b22830UL,
456 		  0xa33eb2d75fcfd3c6UL, 0x078cd6ec4199a01fUL,
457 	/* 183 */ 0x4a584a41ad900d2fUL, 0x32142b78e2c74c52UL,
458 		  0x68c4e8338431c978UL, 0x7f69ea9008689fc2UL,
459 	/* 184 */ 0x52f2c81e46a38265UL, 0xfd78072d04a832fdUL,
460 		  0x8cd7d5fa25359e94UL, 0x4de71b7454cc29d2UL,
461 	/* 185 */ 0x42eb60ad1eda6ac9UL, 0x0aad37dfdbc09c3aUL,
462 		  0x81004b71e33cc191UL, 0x44e6be345122803cUL,
463 	/* 186 */ 0x03fe8388ba1920dbUL, 0xf5d57c32150db008UL,
464 		  0x49c8c4281af60c29UL, 0x21edb518de701aeeUL,
465 	/* 187 */ 0x7fb63e418f06dc99UL, 0xa4460d99c166d7b8UL,
466 		  0x24dd5248ce520a83UL, 0x5ec3ad712b928358UL,
467 	/* 188 */ 0x15022a5fbd17930fUL, 0xa4f64a77d82570e3UL,
468 		  0x12bc8d6915783712UL, 0x498194c0fc620abbUL,
469 	/* 189 */ 0x38a2d9d255686c82UL, 0x785c6bd9193e21f0UL,
470 		  0xe4d5c81ab24a5484UL, 0x56307860b2e20989UL,
471 	/* 190 */ 0x429d55f78b4d74c4UL, 0x22f1834643350131UL,
472 		  0x1e60c24598c71fffUL, 0x59f2f014979983efUL,
473 	/* 191 */ 0x46a47d56eb494a44UL, 0x3e22a854d636a18eUL,
474 		  0xb346e15274491c3bUL, 0x2ceafd4e5390cde7UL,
475 	/* 192 */ 0xba8a8538be0d6675UL, 0x4b9074bb50818e23UL,
476 		  0xcbdab89085d304c3UL, 0x61a24fe0e56192c4UL,
477 	/* 193 */ 0xcb7615e6db525bcbUL, 0xdd7d8c35a567e4caUL,
478 		  0xe6b4153acafcdd69UL, 0x2d668e097f3c9766UL,
479 	/* 194 */ 0xa57e7e265ce55ef0UL, 0x5d9f4e527cd4b967UL,
480 		  0xfbc83606492fd1e5UL, 0x090d52beb7c3f7aeUL,
481 	/* 195 */ 0x09b9515a1e7b4d7cUL, 0x1f266a2599da44c0UL,
482 		  0xa1c49548e2c55504UL, 0x7ef04287126f15ccUL,
483 	/* 196 */ 0xfed1659dbd30ef15UL, 0x8b4ab9eec4e0277bUL,
484 		  0x884d6236a5df3291UL, 0x1fd96ea6bf5cf788UL,
485 	/* 197 */ 0x42a161981f190d9aUL, 0x61d849507e6052c1UL,
486 		  0x9fe113bf285a2cd5UL, 0x7c22d676dbad85d8UL,
487 	/* 198 */ 0x82e770ed2bfbd27dUL, 0x4c05b2ece996f5a5UL,
488 		  0xcd40a9c2b0900150UL, 0x5895319213d9bf64UL,
489 	/* 199 */ 0xe7cc5d703fea2e08UL, 0xb50c491258e2188cUL,
490 		  0xcce30baa48205bf0UL, 0x537c659ccfa32d62UL,
491 	/* 200 */ 0x37b6623a98cfc088UL, 0xfe9bed1fa4d6aca4UL,
492 		  0x04d29b8e56a8d1b0UL, 0x725f71c40b519575UL,
493 	/* 201 */ 0x28c7f89cd0339ce6UL, 0x8367b14469ddc18bUL,
494 		  0x883ada83a6a1652cUL, 0x585f1974034d6c17UL,
495 	/* 202 */ 0x89cfb266f1b19188UL, 0xe63b4863e7c35217UL,
496 		  0xd88c9da6b4c0526aUL, 0x3e035c9df0954635UL,
497 	/* 203 */ 0xdd9d5412fb45de9dUL, 0xdd684532e4cff40dUL,
498 		  0x4b5c999b151d671cUL, 0x2d8c2cc811e7f690UL,
499 	/* 204 */ 0x7f54be1d90055d40UL, 0xa464c5df464aaf40UL,
500 		  0x33979624f0e917beUL, 0x2c018dc527356b30UL,
501 	/* 205 */ 0xa5415024e330b3d4UL, 0x73ff3d96691652d3UL,
502 		  0x94ec42c4ef9b59f1UL, 0x0747201618d08e5aUL,
503 	/* 206 */ 0x4d6ca48aca411c53UL, 0x66415f2fcfa66119UL,
504 		  0x9c4dd40051e227ffUL, 0x59810bc09a02f7ebUL,
505 	/* 207 */ 0x2a7eb171b3dc101dUL, 0x441c5ab99ffef68eUL,
506 		  0x32025c9b93b359eaUL, 0x5e8ce0a71e9d112fUL,
507 	/* 208 */ 0xbfcccb92429503fdUL, 0xd271ba752f095d55UL,
508 		  0x345ead5e972d091eUL, 0x18c8df11a83103baUL,
509 	/* 209 */ 0x90cd949a9aed0f4cUL, 0xc5d1f4cb6660e37eUL,
510 		  0xb8cac52d56c52e0bUL, 0x6e42e400c5808e0dUL,
511 	/* 210 */ 0xa3b46966eeaefd23UL, 0x0c4f1f0be39ecdcaUL,
512 		  0x189dc8c9d683a51dUL, 0x51f27f054c09351bUL,
513 	/* 211 */ 0x4c487ccd2a320682UL, 0x587ea95bb3df1c96UL,
514 		  0xc8ccf79e555cb8e8UL, 0x547dc829a206d73dUL,
515 	/* 212 */ 0xb822a6cd80c39b06UL, 0xe96d54732000d4c6UL,
516 		  0x28535b6f91463b4dUL, 0x228f4660e2486e1dUL,
517 	/* 213 */ 0x98799538de8d3abfUL, 0x8cd8330045ebca6eUL,
518 		  0x79952a008221e738UL, 0x4322e1a7535cd2bbUL,
519 	/* 214 */ 0xb114c11819d1801cUL, 0x2016e4d84f3f5ec7UL,
520 		  0xdd0e2df409260f4cUL, 0x5ec362c0ae5f7266UL,
521 	/* 215 */ 0xc0462b18b8b2b4eeUL, 0x7cc8d950274d1afbUL,
522 		  0xf25f7105436b02d2UL, 0x43bbf8dcbff9ccd3UL,
523 	/* 216 */ 0xb6ad1767a039e9dfUL, 0xb0714da8f69d3583UL,
524 		  0x5e55fa18b42931f5UL, 0x4ed5558f33c60961UL,
525 	/* 217 */ 0x1fe37901c647a5ddUL, 0x593ddf1f8081d357UL,
526 		  0x0249a4fd813fd7a6UL, 0x69acca274e9caf61UL,
527 	/* 218 */ 0x047ba3ea330721c9UL, 0x83423fc20e7e1ea0UL,
528 		  0x1df4c0af01314a60UL, 0x09a62dab89289527UL,
529 	/* 219 */ 0xa5b325a49cc6cb00UL, 0xe94b5dc654b56cb6UL,
530 		  0x3be28779adc994a0UL, 0x4296e8f8ba3a4aadUL,
531 	/* 220 */ 0x328689761e451eabUL, 0x2e4d598bff59594aUL,
532 		  0x49b96853d7a7084aUL, 0x4980a319601420a8UL,
533 	/* 221 */ 0x9565b9e12f552c42UL, 0x8a5318db7100fe96UL,
534 		  0x05c90b4d43add0d7UL, 0x538b4cd66a5d4edaUL,
535 	/* 222 */ 0xf4e94fc3e89f039fUL, 0x592c9af26f618045UL,
536 		  0x08a36eb5fd4b9550UL, 0x25fffaf6c2ed1419UL,
537 	/* 223 */ 0x34434459cc79d354UL, 0xeeecbfb4b1d5476bUL,
538 		  0xddeb34a061615d99UL, 0x5129cecceb64b773UL,
539 	/* 224 */ 0xee43215894993520UL, 0x772f9c7cf14c0b3bUL,
540 		  0xd2e2fce306bedad5UL, 0x715f42b546f06a97UL,
541 	/* 225 */ 0x434ecdceda5b5f1aUL, 0x0da17115a49741a9UL,
542 		  0x680bd77c73edad2eUL, 0x487c02354edd9041UL,
543 	/* 226 */ 0xb8efeff3a70ed9c4UL, 0x56a32aa3e857e302UL,
544 		  0xdf3a68bd48a2a5a0UL, 0x07f650b73176c444UL,
545 	/* 227 */ 0xe38b9b1626e0ccb1UL, 0x79e053c18b09fb36UL,
546 		  0x56d90319c9f94964UL, 0x1ca941e7ac9ff5c4UL,
547 	/* 228 */ 0x49c4df29162fa0bbUL, 0x8488cf3282b33305UL,
548 		  0x95dfda14cabb437dUL, 0x3391f78264d5ad86UL,
549 	/* 229 */ 0x729ae06ae2b5095dUL, 0xd58a58d73259a946UL,
550 		  0xe9834262d13921edUL, 0x27fedafaa54bb592UL,
551 	/* 230 */ 0xa99dc5b829ad48bbUL, 0x5f025742499ee260UL,
552 		  0x802c8ecd5d7513fdUL, 0x78ceb3ef3f6dd938UL,
553 	/* 231 */ 0xc342f44f8a135d94UL, 0x7b9edb44828cdda3UL,
554 		  0x9436d11a0537cfe7UL, 0x5064b164ec1ab4c8UL,
555 	/* 232 */ 0x7020eccfd37eb2fcUL, 0x1f31ea3ed90d25fcUL,
556 		  0x1b930d7bdfa1bb34UL, 0x5344467a48113044UL,
557 	/* 233 */ 0x70073170f25e6dfbUL, 0xe385dc1a50114cc8UL,
558 		  0x2348698ac8fc4f00UL, 0x2a77a55284dd40d8UL,
559 	/* 234 */ 0xfe06afe0c98c6ce4UL, 0xc235df96dddfd6e4UL,
560 		  0x1428d01e33bf1ed3UL, 0x785768ec9300bdafUL,
561 	/* 235 */ 0x9702e57a91deb63bUL, 0x61bdb8bfe5ce8b80UL,
562 		  0x645b426f3d1d58acUL, 0x4804a82227a557bcUL,
563 	/* 236 */ 0x8e57048ab44d2601UL, 0x68d6501a4b3a6935UL,
564 		  0xc39c9ec3f9e1c293UL, 0x4172f257d4de63e2UL,
565 	/* 237 */ 0xd368b450330c6401UL, 0x040d3017418f2391UL,
566 		  0x2c34bb6090b7d90dUL, 0x16f649228fdfd51fUL,
567 	/* 238 */ 0xbea6818e2b928ef5UL, 0xe28ccf91cdc11e72UL,
568 		  0x594aaa68e77a36cdUL, 0x313034806c7ffd0fUL,
569 	/* 239 */ 0x8a9d27ac2249bd65UL, 0x19a3b464018e9512UL,
570 		  0xc26ccff352b37ec7UL, 0x056f68341d797b21UL,
571 	/* 240 */ 0x5e79d6757efd2327UL, 0xfabdbcb6553afe15UL,
572 		  0xd3e7222c6eaf5a60UL, 0x7046c76d4dae743bUL,
573 	/* 241 */ 0x660be872b18d4a55UL, 0x19992518574e1496UL,
574 		  0xc103053a302bdcbbUL, 0x3ed8e9800b218e8eUL,
575 	/* 242 */ 0x7b0b9239fa75e03eUL, 0xefe9fb684633c083UL,
576 		  0x98a35fbe391a7793UL, 0x6065510fe2d0fe34UL,
577 	/* 243 */ 0x55cb668548abad0cUL, 0xb4584548da87e527UL,
578 		  0x2c43ecea0107c1ddUL, 0x526028809372de35UL,
579 	/* 244 */ 0x3415c56af9213b1fUL, 0x5bee1a4d017e98dbUL,
580 		  0x13f6b105b5cf709bUL, 0x5ff20e3482b29ab6UL,
581 	/* 245 */ 0x0aa29c75cc2e6c90UL, 0xfc7d73ca3a70e206UL,
582 		  0x899fc38fc4b5c515UL, 0x250386b124ffc207UL,
583 	/* 246 */ 0x54ea28d5ae3d2b56UL, 0x9913149dd6de60ceUL,
584 		  0x16694fc58f06d6c1UL, 0x46b23975eb018fc7UL,
585 	/* 247 */ 0x470a6a0fb4b7b4e2UL, 0x5d92475a8f7253deUL,
586 		  0xabeee5b52fbd3adbUL, 0x7fa20801a0806968UL,
587 	/* 248 */ 0x76f3faf19f7714d2UL, 0xb3e840c12f4660c3UL,
588 		  0x0fb4cd8df212744eUL, 0x4b065a251d3a2dd2UL,
589 	/* 249 */ 0x5cebde383d77cd4aUL, 0x6adf39df882c9cb1UL,
590 		  0xa2dd242eb09af759UL, 0x3147c0e50e5f6422UL,
591 	/* 250 */ 0x164ca5101d1350dbUL, 0xf8d13479c33fc962UL,
592 		  0xe640ce4d13e5da08UL, 0x4bdee0c45061f8baUL,
593 	/* 251 */ 0xd7c46dc1a4edb1c9UL, 0x5514d7b6437fd98aUL,
594 		  0x58942f6bb2a1c00bUL, 0x2dffb2ab1d70710eUL,
595 	/* 252 */ 0xccdfcf2fc18b6d68UL, 0xa8ebcba8b7806167UL,
596 		  0x980697f95e2937e3UL, 0x02fbba1cd0126e8cUL
597 };
598 
599 /* c is two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7]
600  * a is two 256-bit integers: a0[0:3] and a1[4:7]
601  * b is two 256-bit integers: b0[0:3] and b1[4:7]
602  */
/*
 * Implementation notes (MULX + ADCX/ADOX variant):
 *
 * c: destination, sixteen 64-bit words (byte offsets 0..120 are stored).
 * a: eight 64-bit words, read at byte offsets 0..56.
 * b: eight 64-bit words, read at byte offsets 0..56.
 *
 * Each half is a row-by-row schoolbook multiplication: one word of a is
 * loaded into %rdx and multiplied by the four words of b with MULX.  ADOX
 * chains the partial products of the current row together, ADCX adds the
 * row into the running accumulator (rax, rbx, rcx, r15 rotate through that
 * role).  Result words are stored to c while later words of a and b are
 * still being loaded, so c must not overlap a or b.
 *
 * %r14 is zeroed once and afterwards only read, as a zero addend that folds
 * the pending CF/OF into the top word of a row.
 */
603 static void mul2_256x256_integer_adx(u64 *const c, const u64 *const a,
604 				     const u64 *const b)
605 {
606 	asm volatile(
607 		"xorl %%r14d, %%r14d ;" /* r14 = 0 (never written again) */
608 		"movq   (%1), %%rdx; "	/* A[0] */
609 		"mulx   (%2),  %%r8, %%r15; " /* A[0]*B[0] */
610 		"xorl %%r10d, %%r10d ;" /* clears CF and OF; r10 is overwritten by the next mulx */
611 		"movq %%r8, (%0) ;"
612 		"mulx  8(%2), %%r10, %%rax; " /* A[0]*B[1] */
613 		"adox %%r10, %%r15 ;"
614 		"mulx 16(%2),  %%r8, %%rbx; " /* A[0]*B[2] */
615 		"adox  %%r8, %%rax ;"
616 		"mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
617 		"adox %%r10, %%rbx ;"
618 		/******************************************/
619 		"adox %%r14, %%rcx ;"
620 
		/*
		 * No flag reset for this row: nothing since the xorl above
		 * writes CF (only mov/mulx/adox were executed), and the adox
		 * into the top word just above cannot carry out because the
		 * high half of a 64x64 product is at most 2^64 - 2.
		 */
621 		"movq  8(%1), %%rdx; "	/* A[1] */
622 		"mulx   (%2),  %%r8,  %%r9; " /* A[1]*B[0] */
623 		"adox %%r15,  %%r8 ;"
624 		"movq  %%r8, 8(%0) ;"
625 		"mulx  8(%2), %%r10, %%r11; " /* A[1]*B[1] */
626 		"adox %%r10,  %%r9 ;"
627 		"adcx  %%r9, %%rax ;"
628 		"mulx 16(%2),  %%r8, %%r13; " /* A[1]*B[2] */
629 		"adox  %%r8, %%r11 ;"
630 		"adcx %%r11, %%rbx ;"
631 		"mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
632 		"adox %%r10, %%r13 ;"
633 		"adcx %%r13, %%rcx ;"
634 		/******************************************/
635 		"adox %%r14, %%r15 ;"
636 		"adcx %%r14, %%r15 ;"
637 
638 		"movq 16(%1), %%rdx; " /* A[2] */
639 		"xorl %%r10d, %%r10d ;"
640 		"mulx   (%2),  %%r8,  %%r9; " /* A[2]*B[0] */
641 		"adox %%rax,  %%r8 ;"
642 		"movq %%r8, 16(%0) ;"
643 		"mulx  8(%2), %%r10, %%r11; " /* A[2]*B[1] */
644 		"adox %%r10,  %%r9 ;"
645 		"adcx  %%r9, %%rbx ;"
646 		"mulx 16(%2),  %%r8, %%r13; " /* A[2]*B[2] */
647 		"adox  %%r8, %%r11 ;"
648 		"adcx %%r11, %%rcx ;"
649 		"mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
650 		"adox %%r10, %%r13 ;"
651 		"adcx %%r13, %%r15 ;"
652 		/******************************************/
653 		"adox %%r14, %%rax ;"
654 		"adcx %%r14, %%rax ;"
655 
656 		"movq 24(%1), %%rdx; " /* A[3] */
657 		"xorl %%r10d, %%r10d ;"
658 		"mulx   (%2),  %%r8,  %%r9; " /* A[3]*B[0] */
659 		"adox %%rbx,  %%r8 ;"
660 		"movq %%r8, 24(%0) ;"
661 		"mulx  8(%2), %%r10, %%r11; " /* A[3]*B[1] */
662 		"adox %%r10,  %%r9 ;"
663 		"adcx  %%r9, %%rcx ;"
664 		"movq %%rcx, 32(%0) ;"
665 		"mulx 16(%2),  %%r8, %%r13; " /* A[3]*B[2] */
666 		"adox  %%r8, %%r11 ;"
667 		"adcx %%r11, %%r15 ;"
668 		"movq %%r15, 40(%0) ;"
669 		"mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
670 		"adox %%r10, %%r13 ;"
671 		"adcx %%r13, %%rax ;"
672 		"movq %%rax, 48(%0) ;"
673 		/******************************************/
674 		"adox %%r14, %%rbx ;"
675 		"adcx %%r14, %%rbx ;"
676 		"movq %%rbx, 56(%0) ;"
677 
		/*
		 * Second product: words 4..7 of a (labelled C) times words
		 * 4..7 of b (labelled D), stored to words 8..15 of c.
		 */
678 		"movq 32(%1), %%rdx; "	/* C[0] */
679 		"mulx 32(%2),  %%r8, %%r15; " /* C[0]*D[0] */
680 		"xorl %%r10d, %%r10d ;"
681 		"movq %%r8, 64(%0);"
682 		"mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
683 		"adox %%r10, %%r15 ;"
684 		"mulx 48(%2),  %%r8, %%rbx; " /* C[0]*D[2] */
685 		"adox  %%r8, %%rax ;"
686 		"mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
687 		"adox %%r10, %%rbx ;"
688 		/******************************************/
689 		"adox %%r14, %%rcx ;"
690 
691 		"movq 40(%1), %%rdx; " /* C[1] */
692 		"xorl %%r10d, %%r10d ;"
693 		"mulx 32(%2),  %%r8,  %%r9; " /* C[1]*D[0] */
694 		"adox %%r15,  %%r8 ;"
695 		"movq  %%r8, 72(%0);"
696 		"mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
697 		"adox %%r10,  %%r9 ;"
698 		"adcx  %%r9, %%rax ;"
699 		"mulx 48(%2),  %%r8, %%r13; " /* C[1]*D[2] */
700 		"adox  %%r8, %%r11 ;"
701 		"adcx %%r11, %%rbx ;"
702 		"mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
703 		"adox %%r10, %%r13 ;"
704 		"adcx %%r13, %%rcx ;"
705 		/******************************************/
706 		"adox %%r14, %%r15 ;"
707 		"adcx %%r14, %%r15 ;"
708 
709 		"movq 48(%1), %%rdx; " /* C[2] */
710 		"xorl %%r10d, %%r10d ;"
711 		"mulx 32(%2),  %%r8,  %%r9; " /* C[2]*D[0] */
712 		"adox %%rax,  %%r8 ;"
713 		"movq  %%r8, 80(%0);"
714 		"mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
715 		"adox %%r10,  %%r9 ;"
716 		"adcx  %%r9, %%rbx ;"
717 		"mulx 48(%2),  %%r8, %%r13; " /* C[2]*D[2] */
718 		"adox  %%r8, %%r11 ;"
719 		"adcx %%r11, %%rcx ;"
720 		"mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
721 		"adox %%r10, %%r13 ;"
722 		"adcx %%r13, %%r15 ;"
723 		/******************************************/
724 		"adox %%r14, %%rax ;"
725 		"adcx %%r14, %%rax ;"
726 
727 		"movq 56(%1), %%rdx; " /* C[3] */
728 		"xorl %%r10d, %%r10d ;"
729 		"mulx 32(%2),  %%r8,  %%r9; " /* C[3]*D[0] */
730 		"adox %%rbx,  %%r8 ;"
731 		"movq  %%r8, 88(%0);"
732 		"mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
733 		"adox %%r10,  %%r9 ;"
734 		"adcx  %%r9, %%rcx ;"
735 		"movq %%rcx,  96(%0) ;"
736 		"mulx 48(%2),  %%r8, %%r13; " /* C[3]*D[2] */
737 		"adox  %%r8, %%r11 ;"
738 		"adcx %%r11, %%r15 ;"
739 		"movq %%r15, 104(%0) ;"
740 		"mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
741 		"adox %%r10, %%r13 ;"
742 		"adcx %%r13, %%rax ;"
743 		"movq %%rax, 112(%0) ;"
744 		/******************************************/
745 		"adox %%r14, %%rbx ;"
746 		"adcx %%r14, %%rbx ;"
747 		"movq %%rbx, 120(%0) ;"
		/*
		 * No output operands: the result is written through the
		 * pointer in %0, which is what the "memory" clobber covers.
		 */
748 		:
749 		: "r"(c), "r"(a), "r"(b)
750 		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
751 		  "%r10", "%r11", "%r13", "%r14", "%r15");
752 }
753 
/*
 * Two independent 256x256 -> 512-bit schoolbook multiplications using MULX
 * and a single ADD/ADC carry chain (no ADCX/ADOX).
 *
 * c: destination, sixteen 64-bit words; words 0..7 receive the product of
 *    words 0..3 of a and b, words 8..15 the product of words 4..7.
 * a: eight 64-bit words, read at byte offsets 0..56.
 * b: eight 64-bit words, read at byte offsets 0..56.
 *
 * For every word of a, the four partial products are first chained into a
 * row (r8/r9/r11/r13 plus a top word), then that row is added into the
 * running accumulator with a second ADD/ADC pass.  Result words are stored
 * to c while later words of a and b are still being loaded, so c must not
 * overlap a or b.
 */
754 static void mul2_256x256_integer_bmi2(u64 *const c, const u64 *const a,
755 				      const u64 *const b)
756 {
757 	asm volatile(
758 		"movq   (%1), %%rdx; "	/* A[0] */
759 		"mulx   (%2),  %%r8, %%r15; " /* A[0]*B[0] */
760 		"movq %%r8,  (%0) ;"
761 		"mulx  8(%2), %%r10, %%rax; " /* A[0]*B[1] */
762 		"addq %%r10, %%r15 ;"
763 		"mulx 16(%2),  %%r8, %%rbx; " /* A[0]*B[2] */
764 		"adcq  %%r8, %%rax ;"
765 		"mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
766 		"adcq %%r10, %%rbx ;"
767 		/******************************************/
768 		"adcq    $0, %%rcx ;"
769 
770 		"movq  8(%1), %%rdx; "	/* A[1] */
771 		"mulx   (%2),  %%r8,  %%r9; " /* A[1]*B[0] */
772 		"addq %%r15,  %%r8 ;"
773 		"movq %%r8, 8(%0) ;"
774 		"mulx  8(%2), %%r10, %%r11; " /* A[1]*B[1] */
775 		"adcq %%r10,  %%r9 ;"
776 		"mulx 16(%2),  %%r8, %%r13; " /* A[1]*B[2] */
777 		"adcq  %%r8, %%r11 ;"
778 		"mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
779 		"adcq %%r10, %%r13 ;"
780 		/******************************************/
781 		"adcq    $0, %%r15 ;"
782 
		/* add the A[1] row into the accumulator */
783 		"addq  %%r9, %%rax ;"
784 		"adcq %%r11, %%rbx ;"
785 		"adcq %%r13, %%rcx ;"
786 		"adcq    $0, %%r15 ;"
787 
788 		"movq 16(%1), %%rdx; "	/* A[2] */
789 		"mulx   (%2),  %%r8,  %%r9; " /* A[2]*B[0] */
790 		"addq %%rax,  %%r8 ;"
791 		"movq %%r8, 16(%0) ;"
792 		"mulx  8(%2), %%r10, %%r11; " /* A[2]*B[1] */
793 		"adcq %%r10,  %%r9 ;"
794 		"mulx 16(%2),  %%r8, %%r13; " /* A[2]*B[2] */
795 		"adcq  %%r8, %%r11 ;"
796 		"mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
797 		"adcq %%r10, %%r13 ;"
798 		/******************************************/
799 		"adcq    $0, %%rax ;"
800 
		/* add the A[2] row into the accumulator */
801 		"addq  %%r9, %%rbx ;"
802 		"adcq %%r11, %%rcx ;"
803 		"adcq %%r13, %%r15 ;"
804 		"adcq    $0, %%rax ;"
805 
806 		"movq 24(%1), %%rdx; "	/* A[3] */
807 		"mulx   (%2),  %%r8,  %%r9; " /* A[3]*B[0] */
808 		"addq %%rbx,  %%r8 ;"
809 		"movq %%r8, 24(%0) ;"
810 		"mulx  8(%2), %%r10, %%r11; " /* A[3]*B[1] */
811 		"adcq %%r10,  %%r9 ;"
812 		"mulx 16(%2),  %%r8, %%r13; " /* A[3]*B[2] */
813 		"adcq  %%r8, %%r11 ;"
814 		"mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
815 		"adcq %%r10, %%r13 ;"
816 		/******************************************/
817 		"adcq    $0, %%rbx ;"
818 
		/* add the A[3] row and store the upper four words (mov keeps CF) */
819 		"addq  %%r9, %%rcx ;"
820 		"movq %%rcx, 32(%0) ;"
821 		"adcq %%r11, %%r15 ;"
822 		"movq %%r15, 40(%0) ;"
823 		"adcq %%r13, %%rax ;"
824 		"movq %%rax, 48(%0) ;"
825 		"adcq    $0, %%rbx ;"
826 		"movq %%rbx, 56(%0) ;"
827 
		/*
		 * Second product: words 4..7 of a (labelled C) times words
		 * 4..7 of b (labelled D), stored to words 8..15 of c.
		 */
828 		"movq 32(%1), %%rdx; "	/* C[0] */
829 		"mulx 32(%2),  %%r8, %%r15; " /* C[0]*D[0] */
830 		"movq %%r8, 64(%0) ;"
831 		"mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
832 		"addq %%r10, %%r15 ;"
833 		"mulx 48(%2),  %%r8, %%rbx; " /* C[0]*D[2] */
834 		"adcq  %%r8, %%rax ;"
835 		"mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
836 		"adcq %%r10, %%rbx ;"
837 		/******************************************/
838 		"adcq    $0, %%rcx ;"
839 
840 		"movq 40(%1), %%rdx; "	/* C[1] */
841 		"mulx 32(%2),  %%r8,  %%r9; " /* C[1]*D[0] */
842 		"addq %%r15,  %%r8 ;"
843 		"movq %%r8, 72(%0) ;"
844 		"mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
845 		"adcq %%r10,  %%r9 ;"
846 		"mulx 48(%2),  %%r8, %%r13; " /* C[1]*D[2] */
847 		"adcq  %%r8, %%r11 ;"
848 		"mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
849 		"adcq %%r10, %%r13 ;"
850 		/******************************************/
851 		"adcq    $0, %%r15 ;"
852 
853 		"addq  %%r9, %%rax ;"
854 		"adcq %%r11, %%rbx ;"
855 		"adcq %%r13, %%rcx ;"
856 		"adcq    $0, %%r15 ;"
857 
858 		"movq 48(%1), %%rdx; "	/* C[2] */
859 		"mulx 32(%2),  %%r8,  %%r9; " /* C[2]*D[0] */
860 		"addq %%rax,  %%r8 ;"
861 		"movq %%r8, 80(%0) ;"
862 		"mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
863 		"adcq %%r10,  %%r9 ;"
864 		"mulx 48(%2),  %%r8, %%r13; " /* C[2]*D[2] */
865 		"adcq  %%r8, %%r11 ;"
866 		"mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
867 		"adcq %%r10, %%r13 ;"
868 		/******************************************/
869 		"adcq    $0, %%rax ;"
870 
871 		"addq  %%r9, %%rbx ;"
872 		"adcq %%r11, %%rcx ;"
873 		"adcq %%r13, %%r15 ;"
874 		"adcq    $0, %%rax ;"
875 
876 		"movq 56(%1), %%rdx; "	/* C[3] */
877 		"mulx 32(%2),  %%r8,  %%r9; " /* C[3]*D[0] */
878 		"addq %%rbx,  %%r8 ;"
879 		"movq %%r8, 88(%0) ;"
880 		"mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
881 		"adcq %%r10,  %%r9 ;"
882 		"mulx 48(%2),  %%r8, %%r13; " /* C[3]*D[2] */
883 		"adcq  %%r8, %%r11 ;"
884 		"mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
885 		"adcq %%r10, %%r13 ;"
886 		/******************************************/
887 		"adcq    $0, %%rbx ;"
888 
889 		"addq  %%r9, %%rcx ;"
890 		"movq %%rcx,  96(%0) ;"
891 		"adcq %%r11, %%r15 ;"
892 		"movq %%r15, 104(%0) ;"
893 		"adcq %%r13, %%rax ;"
894 		"movq %%rax, 112(%0) ;"
895 		"adcq    $0, %%rbx ;"
896 		"movq %%rbx, 120(%0) ;"
		/*
		 * No output operands: the result is written through the
		 * pointer in %0, which is what the "memory" clobber covers.
		 */
897 		:
898 		: "r"(c), "r"(a), "r"(b)
899 		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
900 		  "%r10", "%r11", "%r13", "%r15");
901 }
902 
/*
 * Squares two independent 256-bit integers using MULX and ADCX/ADOX.
 *
 * c: destination, sixteen 64-bit words; words 0..7 receive the square of
 *    words 0..3 of a, words 8..15 the square of words 4..7 of a.
 * a: eight 64-bit words, read at byte offsets 0..56.
 *
 * Each square is built in three steps:
 *   1. sum the off-diagonal products A[i]*A[j] (i < j) into
 *      r8, r9, r10, r11, rbx, r13, r14;
 *   2. double that sum (adcx reg,reg), while the adox chain is still adding
 *      the last product A[2]*A[1] one step ahead of the doubling;
 *   3. add the diagonal squares A[i]^2 with an ADD/ADC chain and store.
 *
 * Words of c are stored while words of a are still being loaded, so c must
 * not overlap a.
 */
903 static void sqr2_256x256_integer_adx(u64 *const c, const u64 *const a)
904 {
905 	asm volatile(
906 		"movq   (%1), %%rdx        ;" /* A[0]      */
907 		"mulx  8(%1),  %%r8, %%r14 ;" /* A[1]*A[0] */
908 		"xorl %%r15d, %%r15d;" /* r15 = 0; clears CF and OF */
909 		"mulx 16(%1),  %%r9, %%r10 ;" /* A[2]*A[0] */
910 		"adcx %%r14,  %%r9 ;"
911 		"mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
912 		"adcx %%rax, %%r10 ;"
913 		"movq 24(%1), %%rdx        ;" /* A[3]      */
914 		"mulx  8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
915 		"adcx %%rcx, %%r11 ;"
916 		"mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
917 		"adcx %%rax, %%rbx ;"
918 		"movq  8(%1), %%rdx        ;" /* A[1]      */
919 		"adcx %%r15, %%r13 ;"
920 		"mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
921 		"movq    $0, %%r14 ;" /* mov, not xor: CF must survive for the adcx below */
922 		/******************************************/
923 		"adcx %%r15, %%r14 ;"
924 
		/*
		 * adox chain: add A[2]*A[1] (rax:rcx) starting at r10.
		 * adcx chain: double r8..r14.  Every register is touched by
		 * adox before it is doubled by adcx.
		 */
925 		"xorl %%r15d, %%r15d;"
926 		"adox %%rax, %%r10 ;"
927 		"adcx  %%r8,  %%r8 ;"
928 		"adox %%rcx, %%r11 ;"
929 		"adcx  %%r9,  %%r9 ;"
930 		"adox %%r15, %%rbx ;"
931 		"adcx %%r10, %%r10 ;"
932 		"adox %%r15, %%r13 ;"
933 		"adcx %%r11, %%r11 ;"
934 		"adox %%r15, %%r14 ;"
935 		"adcx %%rbx, %%rbx ;"
936 		"adcx %%r13, %%r13 ;"
937 		"adcx %%r14, %%r14 ;"
938 
		/*
		 * Add the diagonal squares.  The mov and mulx instructions
		 * interleaved with addq/adcq do not modify CF, so this is
		 * one continuous carry chain.
		 */
939 		"movq   (%1), %%rdx ;"
940 		"mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
941 		/*******************/
942 		"movq %%rax,  0(%0) ;"
943 		"addq %%rcx,  %%r8 ;"
944 		"movq  %%r8,  8(%0) ;"
945 		"movq  8(%1), %%rdx ;"
946 		"mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
947 		"adcq %%rax,  %%r9 ;"
948 		"movq  %%r9, 16(%0) ;"
949 		"adcq %%rcx, %%r10 ;"
950 		"movq %%r10, 24(%0) ;"
951 		"movq 16(%1), %%rdx ;"
952 		"mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
953 		"adcq %%rax, %%r11 ;"
954 		"movq %%r11, 32(%0) ;"
955 		"adcq %%rcx, %%rbx ;"
956 		"movq %%rbx, 40(%0) ;"
957 		"movq 24(%1), %%rdx ;"
958 		"mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
959 		"adcq %%rax, %%r13 ;"
960 		"movq %%r13, 48(%0) ;"
961 		"adcq %%rcx, %%r14 ;"
962 		"movq %%r14, 56(%0) ;"
963 
964 
		/*
		 * Second square: words 4..7 of a (labelled B), stored to
		 * words 8..15 of c.  Same three steps as above.
		 */
965 		"movq 32(%1), %%rdx        ;" /* B[0]      */
966 		"mulx 40(%1),  %%r8, %%r14 ;" /* B[1]*B[0] */
967 		"xorl %%r15d, %%r15d;"
968 		"mulx 48(%1),  %%r9, %%r10 ;" /* B[2]*B[0] */
969 		"adcx %%r14,  %%r9 ;"
970 		"mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */
971 		"adcx %%rax, %%r10 ;"
972 		"movq 56(%1), %%rdx        ;" /* B[3]      */
973 		"mulx 40(%1), %%r11, %%rbx ;" /* B[1]*B[3] */
974 		"adcx %%rcx, %%r11 ;"
975 		"mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */
976 		"adcx %%rax, %%rbx ;"
977 		"movq 40(%1), %%rdx        ;" /* B[1]      */
978 		"adcx %%r15, %%r13 ;"
979 		"mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */
980 		"movq    $0, %%r14 ;"
981 		/******************************************/
982 		"adcx %%r15, %%r14 ;"
983 
984 		"xorl %%r15d, %%r15d;"
985 		"adox %%rax, %%r10 ;"
986 		"adcx  %%r8,  %%r8 ;"
987 		"adox %%rcx, %%r11 ;"
988 		"adcx  %%r9,  %%r9 ;"
989 		"adox %%r15, %%rbx ;"
990 		"adcx %%r10, %%r10 ;"
991 		"adox %%r15, %%r13 ;"
992 		"adcx %%r11, %%r11 ;"
993 		"adox %%r15, %%r14 ;"
994 		"adcx %%rbx, %%rbx ;"
995 		"adcx %%r13, %%r13 ;"
996 		"adcx %%r14, %%r14 ;"
997 
998 		"movq 32(%1), %%rdx ;"
999 		"mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
1000 		/*******************/
1001 		"movq %%rax,  64(%0) ;"
1002 		"addq %%rcx,  %%r8 ;"
1003 		"movq  %%r8,  72(%0) ;"
1004 		"movq 40(%1), %%rdx ;"
1005 		"mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
1006 		"adcq %%rax,  %%r9 ;"
1007 		"movq  %%r9,  80(%0) ;"
1008 		"adcq %%rcx, %%r10 ;"
1009 		"movq %%r10,  88(%0) ;"
1010 		"movq 48(%1), %%rdx ;"
1011 		"mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
1012 		"adcq %%rax, %%r11 ;"
1013 		"movq %%r11,  96(%0) ;"
1014 		"adcq %%rcx, %%rbx ;"
1015 		"movq %%rbx, 104(%0) ;"
1016 		"movq 56(%1), %%rdx ;"
1017 		"mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
1018 		"adcq %%rax, %%r13 ;"
1019 		"movq %%r13, 112(%0) ;"
1020 		"adcq %%rcx, %%r14 ;"
1021 		"movq %%r14, 120(%0) ;"
		/*
		 * No output operands: the result is written through the
		 * pointer in %0, which is what the "memory" clobber covers.
		 */
1022 		:
1023 		: "r"(c), "r"(a)
1024 		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1025 		  "%r10", "%r11", "%r13", "%r14", "%r15");
1026 }
1027 
/*
 * Squares two independent 256-bit integers using MULX, ADD/ADC and SHLD
 * (no ADCX/ADOX).
 *
 * c: destination, sixteen 64-bit words; words 0..7 receive the square of
 *    words 0..3 of a, words 8..15 the square of words 4..7 of a.
 * a: eight 64-bit words, read at byte offsets 0..56.
 *
 * Each square is built in three steps:
 *   1. sum the off-diagonal products A[i]*A[j] (i < j) into the seven-word
 *      value r14:r13:r15:r11:r10:r9:r8;
 *   2. double it with a chain of shldq/shlq by one bit;
 *   3. add the diagonal squares A[i]^2 with an ADD/ADC chain and store.
 *
 * Words of c are stored while words of a are still being loaded, so c must
 * not overlap a.
 */
1028 static void sqr2_256x256_integer_bmi2(u64 *const c, const u64 *const a)
1029 {
1030 	asm volatile(
1031 		"movq  8(%1), %%rdx        ;" /* A[1]      */
1032 		"mulx   (%1),  %%r8,  %%r9 ;" /* A[0]*A[1] */
1033 		"mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
1034 		"mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
1035 
1036 		"movq 16(%1), %%rdx        ;" /* A[2]      */
1037 		"mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
		/* the high half lands in rdx, replacing the multiplier A[2] */
1038 		"mulx   (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
1039 
1040 		"addq %%rax,  %%r9 ;"
1041 		"adcq %%rdx, %%r10 ;"
1042 		"adcq %%rcx, %%r11 ;"
1043 		"adcq %%r14, %%r15 ;"
1044 		"adcq    $0, %%r13 ;"
1045 		"movq    $0, %%r14 ;" /* mov, not xor: CF must survive for the adcq below */
1046 		"adcq    $0, %%r14 ;"
1047 
1048 		"movq   (%1), %%rdx        ;" /* A[0]      */
1049 		"mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
1050 
1051 		"addq %%rax, %%r10 ;"
1052 		"adcq %%rcx, %%r11 ;"
1053 		"adcq    $0, %%r15 ;"
1054 		"adcq    $0, %%r13 ;"
1055 		"adcq    $0, %%r14 ;"
1056 
		/* double the off-diagonal sum, most significant word first */
1057 		"shldq $1, %%r13, %%r14 ;"
1058 		"shldq $1, %%r15, %%r13 ;"
1059 		"shldq $1, %%r11, %%r15 ;"
1060 		"shldq $1, %%r10, %%r11 ;"
1061 		"shldq $1,  %%r9, %%r10 ;"
1062 		"shldq $1,  %%r8,  %%r9 ;"
1063 		"shlq  $1,  %%r8        ;"
1064 
		/*
		 * rdx still holds A[0] from the load before A[0]*A[3].
		 * The mov and mulx instructions interleaved with addq/adcq
		 * below do not modify CF, so this is one carry chain.
		 */
1065 		/*******************/
1066 		"mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */
1067 		/*******************/
1068 		"movq %%rax,  0(%0) ;"
1069 		"addq %%rcx,  %%r8 ;"
1070 		"movq  %%r8,  8(%0) ;"
1071 		"movq  8(%1), %%rdx ;"
1072 		"mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */
1073 		"adcq %%rax,  %%r9 ;"
1074 		"movq  %%r9, 16(%0) ;"
1075 		"adcq %%rcx, %%r10 ;"
1076 		"movq %%r10, 24(%0) ;"
1077 		"movq 16(%1), %%rdx ;"
1078 		"mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */
1079 		"adcq %%rax, %%r11 ;"
1080 		"movq %%r11, 32(%0) ;"
1081 		"adcq %%rcx, %%r15 ;"
1082 		"movq %%r15, 40(%0) ;"
1083 		"movq 24(%1), %%rdx ;"
1084 		"mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */
1085 		"adcq %%rax, %%r13 ;"
1086 		"movq %%r13, 48(%0) ;"
1087 		"adcq %%rcx, %%r14 ;"
1088 		"movq %%r14, 56(%0) ;"
1089 
		/*
		 * Second square: words 4..7 of a (labelled B), stored to
		 * words 8..15 of c.  Same three steps as above.
		 */
1090 		"movq 40(%1), %%rdx        ;" /* B[1]      */
1091 		"mulx 32(%1),  %%r8,  %%r9 ;" /* B[0]*B[1] */
1092 		"mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */
1093 		"mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */
1094 
1095 		"movq 48(%1), %%rdx        ;" /* B[2]      */
1096 		"mulx 56(%1), %%r15, %%r13 ;" /* B[3]*B[2] */
1097 		"mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */
1098 
1099 		"addq %%rax,  %%r9 ;"
1100 		"adcq %%rdx, %%r10 ;"
1101 		"adcq %%rcx, %%r11 ;"
1102 		"adcq %%r14, %%r15 ;"
1103 		"adcq    $0, %%r13 ;"
1104 		"movq    $0, %%r14 ;"
1105 		"adcq    $0, %%r14 ;"
1106 
1107 		"movq 32(%1), %%rdx        ;" /* B[0]      */
1108 		"mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */
1109 
1110 		"addq %%rax, %%r10 ;"
1111 		"adcq %%rcx, %%r11 ;"
1112 		"adcq    $0, %%r15 ;"
1113 		"adcq    $0, %%r13 ;"
1114 		"adcq    $0, %%r14 ;"
1115 
1116 		"shldq $1, %%r13, %%r14 ;"
1117 		"shldq $1, %%r15, %%r13 ;"
1118 		"shldq $1, %%r11, %%r15 ;"
1119 		"shldq $1, %%r10, %%r11 ;"
1120 		"shldq $1,  %%r9, %%r10 ;"
1121 		"shldq $1,  %%r8,  %%r9 ;"
1122 		"shlq  $1,  %%r8        ;"
1123 
1124 		/*******************/
1125 		"mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */
1126 		/*******************/
1127 		"movq %%rax,  64(%0) ;"
1128 		"addq %%rcx,  %%r8 ;"
1129 		"movq  %%r8,  72(%0) ;"
1130 		"movq 40(%1), %%rdx ;"
1131 		"mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */
1132 		"adcq %%rax,  %%r9 ;"
1133 		"movq  %%r9,  80(%0) ;"
1134 		"adcq %%rcx, %%r10 ;"
1135 		"movq %%r10,  88(%0) ;"
1136 		"movq 48(%1), %%rdx ;"
1137 		"mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */
1138 		"adcq %%rax, %%r11 ;"
1139 		"movq %%r11,  96(%0) ;"
1140 		"adcq %%rcx, %%r15 ;"
1141 		"movq %%r15, 104(%0) ;"
1142 		"movq 56(%1), %%rdx ;"
1143 		"mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */
1144 		"adcq %%rax, %%r13 ;"
1145 		"movq %%r13, 112(%0) ;"
1146 		"adcq %%rcx, %%r14 ;"
1147 		"movq %%r14, 120(%0) ;"
		/*
		 * No output operands: the result is written through the
		 * pointer in %0, which is what the "memory" clobber covers.
		 */
1148 		:
1149 		: "r"(c), "r"(a)
1150 		: "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1151 		  "%r11", "%r13", "%r14", "%r15");
1152 }
1153 
/*
 * Folds two 512-bit integers down to two 256-bit integers using MULX and
 * ADCX/ADOX.
 *
 * c: destination, eight 64-bit words; words 0..3 receive the folded first
 *    input, words 4..7 the folded second input.
 * a: sixteen 64-bit words; words 0..7 are the first 512-bit input and
 *    words 8..15 the second.
 *
 * For each input the code computes low + 38 * high, where low is the lower
 * four words and high the upper four.  The word that overflows past bit 255
 * is multiplied by 38 and added back in, and if that addition carries out
 * again, 38 is added to the lowest word once more.  The existing comment
 * "38 = 2^256" presumably means 2^256 = 38 modulo p = 2^255 - 19, the
 * Curve25519 prime suggested by the eltfp25519 naming.  No comparison
 * against p is made here, so the result is only known to fit in 256 bits.
 */
1154 static void red_eltfp25519_2w_adx(u64 *const c, const u64 *const a)
1155 {
1156 	asm volatile(
1157 		"movl    $38, %%edx; "	/* 2*c = 38 = 2^256 */
1158 		"mulx 32(%1),  %%r8, %%r10; " /* c*C[4] */
1159 		"xorl %%ebx, %%ebx ;" /* rbx = 0; clears CF and OF */
		/* adcx chains the 38*C[i] products, adox adds C[0..3] */
1160 		"adox   (%1),  %%r8 ;"
1161 		"mulx 40(%1),  %%r9, %%r11; " /* c*C[5] */
1162 		"adcx %%r10,  %%r9 ;"
1163 		"adox  8(%1),  %%r9 ;"
1164 		"mulx 48(%1), %%r10, %%rax; " /* c*C[6] */
1165 		"adcx %%r11, %%r10 ;"
1166 		"adox 16(%1), %%r10 ;"
1167 		"mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */
1168 		"adcx %%rax, %%r11 ;"
1169 		"adox 24(%1), %%r11 ;"
1170 		/***************************************/
		/* rcx = word above bit 255: high half of 38*C[7] plus both carries */
1171 		"adcx %%rbx, %%rcx ;"
1172 		"adox  %%rbx, %%rcx ;"
1173 		"imul %%rdx, %%rcx ;" /* 38 * overflow word; cf=0, of=0 */
1174 		"adcx %%rcx,  %%r8 ;"
1175 		"adcx %%rbx,  %%r9 ;"
1176 		"movq  %%r9,  8(%0) ;"
1177 		"adcx %%rbx, %%r10 ;"
1178 		"movq %%r10, 16(%0) ;"
1179 		"adcx %%rbx, %%r11 ;"
1180 		"movq %%r11, 24(%0) ;"
		/* mov keeps CF; ecx = CF ? 38 : 0, added to the lowest word */
1181 		"mov     $0, %%ecx ;"
1182 		"cmovc %%edx, %%ecx ;"
1183 		"addq %%rcx,  %%r8 ;"
1184 		"movq  %%r8,   (%0) ;"
1185 
		/*
		 * Second input: words 8..15 of a, result to words 4..7 of c.
		 * rdx still holds 38.  The C[4]..C[7] labels below refer to
		 * words 12..15 of a.
		 */
1186 		"mulx  96(%1),  %%r8, %%r10; " /* c*C[4] */
1187 		"xorl %%ebx, %%ebx ;"
1188 		"adox 64(%1),  %%r8 ;"
1189 		"mulx 104(%1),  %%r9, %%r11; " /* c*C[5] */
1190 		"adcx %%r10,  %%r9 ;"
1191 		"adox 72(%1),  %%r9 ;"
1192 		"mulx 112(%1), %%r10, %%rax; " /* c*C[6] */
1193 		"adcx %%r11, %%r10 ;"
1194 		"adox 80(%1), %%r10 ;"
1195 		"mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */
1196 		"adcx %%rax, %%r11 ;"
1197 		"adox 88(%1), %%r11 ;"
1198 		/****************************************/
1199 		"adcx %%rbx, %%rcx ;"
1200 		"adox  %%rbx, %%rcx ;"
1201 		"imul %%rdx, %%rcx ;" /* 38 * overflow word; cf=0, of=0 */
1202 		"adcx %%rcx,  %%r8 ;"
1203 		"adcx %%rbx,  %%r9 ;"
1204 		"movq  %%r9, 40(%0) ;"
1205 		"adcx %%rbx, %%r10 ;"
1206 		"movq %%r10, 48(%0) ;"
1207 		"adcx %%rbx, %%r11 ;"
1208 		"movq %%r11, 56(%0) ;"
1209 		"mov     $0, %%ecx ;"
1210 		"cmovc %%edx, %%ecx ;"
1211 		"addq %%rcx,  %%r8 ;"
1212 		"movq  %%r8, 32(%0) ;"
		/*
		 * No output operands: the result is written through the
		 * pointer in %0, which is what the "memory" clobber covers.
		 */
1213 		:
1214 		: "r"(c), "r"(a)
1215 		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1216 		  "%r10", "%r11");
1217 }
1218 
/*
 * Folds two 512-bit integers down to two 256-bit integers using MULX and an
 * ordinary ADD/ADC carry chain (no ADCX/ADOX).
 *
 * c: destination, eight 64-bit words; words 0..3 receive the folded first
 *    input, words 4..7 the folded second input.
 * a: sixteen 64-bit words; words 0..7 are the first 512-bit input and
 *    words 8..15 the second.
 *
 * For each input the code computes low + 38 * high, where low is the lower
 * four words and high the upper four.  The word that overflows past bit 255
 * is multiplied by 38 and added back in, and if that addition carries out
 * again, 38 is added to the lowest word once more.  The existing comment
 * "38 = 2^256" presumably means 2^256 = 38 modulo p = 2^255 - 19, the
 * Curve25519 prime suggested by the eltfp25519 naming.  No comparison
 * against p is made here, so the result is only known to fit in 256 bits.
 */
1219 static void red_eltfp25519_2w_bmi2(u64 *const c, const u64 *const a)
1220 {
1221 	asm volatile(
1222 		"movl    $38, %%edx ; "       /* 2*c = 38 = 2^256 */
1223 		"mulx 32(%1),  %%r8, %%r10 ;" /* c*C[4] */
1224 		"mulx 40(%1),  %%r9, %%r11 ;" /* c*C[5] */
1225 		"addq %%r10,  %%r9 ;"
1226 		"mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
1227 		"adcq %%r11, %%r10 ;"
1228 		"mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
1229 		"adcq %%rax, %%r11 ;"
1230 		/***************************************/
1231 		"adcq    $0, %%rcx ;"
		/* add the low four words; rcx collects the word above bit 255 */
1232 		"addq   (%1),  %%r8 ;"
1233 		"adcq  8(%1),  %%r9 ;"
1234 		"adcq 16(%1), %%r10 ;"
1235 		"adcq 24(%1), %%r11 ;"
1236 		"adcq     $0, %%rcx ;"
1237 		"imul %%rdx, %%rcx ;" /* 38 * overflow word; cf=0 */
1238 		"addq %%rcx,  %%r8 ;"
1239 		"adcq    $0,  %%r9 ;"
1240 		"movq  %%r9,  8(%0) ;"
1241 		"adcq    $0, %%r10 ;"
1242 		"movq %%r10, 16(%0) ;"
1243 		"adcq    $0, %%r11 ;"
1244 		"movq %%r11, 24(%0) ;"
		/* mov keeps CF; ecx = CF ? 38 : 0, added to the lowest word */
1245 		"mov     $0, %%ecx ;"
1246 		"cmovc %%edx, %%ecx ;"
1247 		"addq %%rcx,  %%r8 ;"
1248 		"movq  %%r8,   (%0) ;"
1249 
		/*
		 * Second input: words 8..15 of a, result to words 4..7 of c.
		 * rdx still holds 38.  The C[4]..C[7] labels below refer to
		 * words 12..15 of a.
		 */
1250 		"mulx  96(%1),  %%r8, %%r10 ;" /* c*C[4] */
1251 		"mulx 104(%1),  %%r9, %%r11 ;" /* c*C[5] */
1252 		"addq %%r10,  %%r9 ;"
1253 		"mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */
1254 		"adcq %%r11, %%r10 ;"
1255 		"mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */
1256 		"adcq %%rax, %%r11 ;"
1257 		/****************************************/
1258 		"adcq    $0, %%rcx ;"
1259 		"addq 64(%1),  %%r8 ;"
1260 		"adcq 72(%1),  %%r9 ;"
1261 		"adcq 80(%1), %%r10 ;"
1262 		"adcq 88(%1), %%r11 ;"
1263 		"adcq     $0, %%rcx ;"
1264 		"imul %%rdx, %%rcx ;" /* 38 * overflow word; cf=0 */
1265 		"addq %%rcx,  %%r8 ;"
1266 		"adcq    $0,  %%r9 ;"
1267 		"movq  %%r9, 40(%0) ;"
1268 		"adcq    $0, %%r10 ;"
1269 		"movq %%r10, 48(%0) ;"
1270 		"adcq    $0, %%r11 ;"
1271 		"movq %%r11, 56(%0) ;"
1272 		"mov     $0, %%ecx ;"
1273 		"cmovc %%edx, %%ecx ;"
1274 		"addq %%rcx,  %%r8 ;"
1275 		"movq  %%r8, 32(%0) ;"
		/*
		 * No output operands: the result is written through the
		 * pointer in %0, which is what the "memory" clobber covers.
		 */
1276 		:
1277 		: "r"(c), "r"(a)
1278 		: "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1279 		  "%r11");
1280 }
1281 
/*
 * Single 256x256 -> 512-bit schoolbook multiplication using MULX and the
 * ADCX/ADOX dual carry chains.
 *
 * c: destination, eight 64-bit words (byte offsets 0..56 are stored).
 * a: four 64-bit words, read at byte offsets 0..24.
 * b: four 64-bit words, read at byte offsets 0..24.
 *
 * One word of a at a time is loaded into %rdx and multiplied by the four
 * words of b.  ADOX chains the partial products of the current row, ADCX
 * adds in the previous rows.  The lowest pending word of the accumulator is
 * kept in c itself: it is stored by one row and read back by the next
 * (adcx N(%0)), so c is used as scratch during the computation and must not
 * overlap a or b.
 */
1282 static void mul_256x256_integer_adx(u64 *const c, const u64 *const a,
1283 				    const u64 *const b)
1284 {
1285 	asm volatile(
1286 		"movq   (%1), %%rdx; "	/* A[0] */
1287 		"mulx   (%2),  %%r8,  %%r9; " /* A[0]*B[0] */
1288 		"xorl %%r10d, %%r10d ;" /* clears CF and OF; r10 is overwritten by the next mulx */
1289 		"movq  %%r8,  (%0) ;"
1290 		"mulx  8(%2), %%r10, %%r11; " /* A[0]*B[1] */
1291 		"adox  %%r9, %%r10 ;"
1292 		"movq %%r10, 8(%0) ;"
1293 		"mulx 16(%2), %%r15, %%r13; " /* A[0]*B[2] */
1294 		"adox %%r11, %%r15 ;"
		/* the high half lands in rdx, replacing the multiplier */
1295 		"mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */
1296 		"adox %%r13, %%r14 ;"
1297 		"movq $0, %%rax ;" /* mov, not xor: OF must survive for the adox below */
1298 		/******************************************/
1299 		"adox %%rdx, %%rax ;"
1300 
1301 		"movq  8(%1), %%rdx; "	/* A[1] */
1302 		"mulx   (%2),  %%r8,  %%r9; " /* A[1]*B[0] */
1303 		"xorl %%r10d, %%r10d ;"
1304 		"adcx 8(%0),  %%r8 ;" /* read back the word stored by the previous row */
1305 		"movq  %%r8,  8(%0) ;"
1306 		"mulx  8(%2), %%r10, %%r11; " /* A[1]*B[1] */
1307 		"adox  %%r9, %%r10 ;"
1308 		"adcx %%r15, %%r10 ;"
1309 		"movq %%r10, 16(%0) ;"
1310 		"mulx 16(%2), %%r15, %%r13; " /* A[1]*B[2] */
1311 		"adox %%r11, %%r15 ;"
1312 		"adcx %%r14, %%r15 ;"
1313 		"movq $0, %%r8  ;" /* zero addend for the final adcx; mov keeps CF/OF */
1314 		"mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */
1315 		"adox %%r13, %%r14 ;"
1316 		"adcx %%rax, %%r14 ;"
1317 		"movq $0, %%rax ;"
1318 		/******************************************/
1319 		"adox %%rdx, %%rax ;"
1320 		"adcx  %%r8, %%rax ;"
1321 
1322 		"movq 16(%1), %%rdx; "	/* A[2] */
1323 		"mulx   (%2),  %%r8,  %%r9; " /* A[2]*B[0] */
1324 		"xorl %%r10d, %%r10d ;"
1325 		"adcx 16(%0), %%r8 ;"
1326 		"movq  %%r8, 16(%0) ;"
1327 		"mulx  8(%2), %%r10, %%r11; " /* A[2]*B[1] */
1328 		"adox  %%r9, %%r10 ;"
1329 		"adcx %%r15, %%r10 ;"
1330 		"movq %%r10, 24(%0) ;"
1331 		"mulx 16(%2), %%r15, %%r13; " /* A[2]*B[2] */
1332 		"adox %%r11, %%r15 ;"
1333 		"adcx %%r14, %%r15 ;"
1334 		"movq $0, %%r8  ;"
1335 		"mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */
1336 		"adox %%r13, %%r14 ;"
1337 		"adcx %%rax, %%r14 ;"
1338 		"movq $0, %%rax ;"
1339 		/******************************************/
1340 		"adox %%rdx, %%rax ;"
1341 		"adcx  %%r8, %%rax ;"
1342 
1343 		"movq 24(%1), %%rdx; "	/* A[3] */
1344 		"mulx   (%2),  %%r8,  %%r9; " /* A[3]*B[0] */
1345 		"xorl %%r10d, %%r10d ;"
1346 		"adcx 24(%0), %%r8 ;"
1347 		"movq  %%r8, 24(%0) ;"
1348 		"mulx  8(%2), %%r10, %%r11; " /* A[3]*B[1] */
1349 		"adox  %%r9, %%r10 ;"
1350 		"adcx %%r15, %%r10 ;"
1351 		"movq %%r10, 32(%0) ;"
1352 		"mulx 16(%2), %%r15, %%r13; " /* A[3]*B[2] */
1353 		"adox %%r11, %%r15 ;"
1354 		"adcx %%r14, %%r15 ;"
1355 		"movq %%r15, 40(%0) ;"
1356 		"movq $0, %%r8  ;"
1357 		"mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */
1358 		"adox %%r13, %%r14 ;"
1359 		"adcx %%rax, %%r14 ;"
1360 		"movq %%r14, 48(%0) ;"
1361 		"movq $0, %%rax ;"
1362 		/******************************************/
1363 		"adox %%rdx, %%rax ;"
1364 		"adcx  %%r8, %%rax ;"
1365 		"movq %%rax, 56(%0) ;"
		/*
		 * No output operands: the result is written through the
		 * pointer in %0, which is what the "memory" clobber covers.
		 */
1366 		:
1367 		: "r"(c), "r"(a), "r"(b)
1368 		: "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11",
1369 		  "%r13", "%r14", "%r15");
1370 }
1371 
/*
 * Single 256x256 -> 512-bit schoolbook multiplication using MULX and an
 * ordinary ADD/ADC carry chain (no ADCX/ADOX).
 *
 * c: destination, eight 64-bit words (byte offsets 0..56 are stored).
 * a: four 64-bit words, read at byte offsets 0..24.
 * b: four 64-bit words, read at byte offsets 0..24.
 *
 * For every word of a, the four partial products are first chained into a
 * row (r8/r9/r11/r13 plus a top word), then that row is added into the
 * running accumulator with a second ADD/ADC pass.  Result words are stored
 * to c while later words of a and b are still being loaded, so c must not
 * overlap a or b.
 */
1372 static void mul_256x256_integer_bmi2(u64 *const c, const u64 *const a,
1373 				     const u64 *const b)
1374 {
1375 	asm volatile(
1376 		"movq   (%1), %%rdx; "	/* A[0] */
1377 		"mulx   (%2),  %%r8, %%r15; " /* A[0]*B[0] */
1378 		"movq %%r8,  (%0) ;"
1379 		"mulx  8(%2), %%r10, %%rax; " /* A[0]*B[1] */
1380 		"addq %%r10, %%r15 ;"
1381 		"mulx 16(%2),  %%r8, %%rbx; " /* A[0]*B[2] */
1382 		"adcq  %%r8, %%rax ;"
1383 		"mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
1384 		"adcq %%r10, %%rbx ;"
1385 		/******************************************/
1386 		"adcq    $0, %%rcx ;"
1387 
1388 		"movq  8(%1), %%rdx; "	/* A[1] */
1389 		"mulx   (%2),  %%r8,  %%r9; " /* A[1]*B[0] */
1390 		"addq %%r15,  %%r8 ;"
1391 		"movq %%r8, 8(%0) ;"
1392 		"mulx  8(%2), %%r10, %%r11; " /* A[1]*B[1] */
1393 		"adcq %%r10,  %%r9 ;"
1394 		"mulx 16(%2),  %%r8, %%r13; " /* A[1]*B[2] */
1395 		"adcq  %%r8, %%r11 ;"
1396 		"mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
1397 		"adcq %%r10, %%r13 ;"
1398 		/******************************************/
1399 		"adcq    $0, %%r15 ;"
1400 
		/* add the A[1] row into the accumulator */
1401 		"addq  %%r9, %%rax ;"
1402 		"adcq %%r11, %%rbx ;"
1403 		"adcq %%r13, %%rcx ;"
1404 		"adcq    $0, %%r15 ;"
1405 
1406 		"movq 16(%1), %%rdx; "	/* A[2] */
1407 		"mulx   (%2),  %%r8,  %%r9; " /* A[2]*B[0] */
1408 		"addq %%rax,  %%r8 ;"
1409 		"movq %%r8, 16(%0) ;"
1410 		"mulx  8(%2), %%r10, %%r11; " /* A[2]*B[1] */
1411 		"adcq %%r10,  %%r9 ;"
1412 		"mulx 16(%2),  %%r8, %%r13; " /* A[2]*B[2] */
1413 		"adcq  %%r8, %%r11 ;"
1414 		"mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
1415 		"adcq %%r10, %%r13 ;"
1416 		/******************************************/
1417 		"adcq    $0, %%rax ;"
1418 
		/* add the A[2] row into the accumulator */
1419 		"addq  %%r9, %%rbx ;"
1420 		"adcq %%r11, %%rcx ;"
1421 		"adcq %%r13, %%r15 ;"
1422 		"adcq    $0, %%rax ;"
1423 
1424 		"movq 24(%1), %%rdx; "	/* A[3] */
1425 		"mulx   (%2),  %%r8,  %%r9; " /* A[3]*B[0] */
1426 		"addq %%rbx,  %%r8 ;"
1427 		"movq %%r8, 24(%0) ;"
1428 		"mulx  8(%2), %%r10, %%r11; " /* A[3]*B[1] */
1429 		"adcq %%r10,  %%r9 ;"
1430 		"mulx 16(%2),  %%r8, %%r13; " /* A[3]*B[2] */
1431 		"adcq  %%r8, %%r11 ;"
1432 		"mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
1433 		"adcq %%r10, %%r13 ;"
1434 		/******************************************/
1435 		"adcq    $0, %%rbx ;"
1436 
		/* add the A[3] row and store the upper four words (mov keeps CF) */
1437 		"addq  %%r9, %%rcx ;"
1438 		"movq %%rcx, 32(%0) ;"
1439 		"adcq %%r11, %%r15 ;"
1440 		"movq %%r15, 40(%0) ;"
1441 		"adcq %%r13, %%rax ;"
1442 		"movq %%rax, 48(%0) ;"
1443 		"adcq    $0, %%rbx ;"
1444 		"movq %%rbx, 56(%0) ;"
		/*
		 * No output operands: the result is written through the
		 * pointer in %0, which is what the "memory" clobber covers.
		 */
1445 		:
1446 		: "r"(c), "r"(a), "r"(b)
1447 		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1448 		  "%r10", "%r11", "%r13", "%r15");
1449 }
1450 
/*
 * Squares one 256-bit integer using MULX and ADCX/ADOX.
 *
 * c: destination, eight 64-bit words (byte offsets 0..56 are stored).
 * a: four 64-bit words, read at byte offsets 0..24.
 *
 * The square is built in three steps:
 *   1. sum the off-diagonal products A[i]*A[j] (i < j) into
 *      r8, r9, r10, r11, rbx, r13, r14;
 *   2. double that sum (adcx reg,reg), while the adox chain is still adding
 *      the last product A[2]*A[1] one step ahead of the doubling;
 *   3. add the diagonal squares A[i]^2 with an ADD/ADC chain and store.
 *
 * Words of c are stored while words of a are still being loaded, so c must
 * not overlap a.
 */
1451 static void sqr_256x256_integer_adx(u64 *const c, const u64 *const a)
1452 {
1453 	asm volatile(
1454 		"movq   (%1), %%rdx        ;" /* A[0]      */
1455 		"mulx  8(%1),  %%r8, %%r14 ;" /* A[1]*A[0] */
1456 		"xorl %%r15d, %%r15d;" /* r15 = 0; clears CF and OF */
1457 		"mulx 16(%1),  %%r9, %%r10 ;" /* A[2]*A[0] */
1458 		"adcx %%r14,  %%r9 ;"
1459 		"mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
1460 		"adcx %%rax, %%r10 ;"
1461 		"movq 24(%1), %%rdx        ;" /* A[3]      */
1462 		"mulx  8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
1463 		"adcx %%rcx, %%r11 ;"
1464 		"mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
1465 		"adcx %%rax, %%rbx ;"
1466 		"movq  8(%1), %%rdx        ;" /* A[1]      */
1467 		"adcx %%r15, %%r13 ;"
1468 		"mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
1469 		"movq    $0, %%r14 ;" /* mov, not xor: CF must survive for the adcx below */
1470 		/******************************************/
1471 		"adcx %%r15, %%r14 ;"
1472 
		/*
		 * adox chain: add A[2]*A[1] (rax:rcx) starting at r10.
		 * adcx chain: double r8..r14.  Every register is touched by
		 * adox before it is doubled by adcx.
		 */
1473 		"xorl %%r15d, %%r15d;"
1474 		"adox %%rax, %%r10 ;"
1475 		"adcx  %%r8,  %%r8 ;"
1476 		"adox %%rcx, %%r11 ;"
1477 		"adcx  %%r9,  %%r9 ;"
1478 		"adox %%r15, %%rbx ;"
1479 		"adcx %%r10, %%r10 ;"
1480 		"adox %%r15, %%r13 ;"
1481 		"adcx %%r11, %%r11 ;"
1482 		"adox %%r15, %%r14 ;"
1483 		"adcx %%rbx, %%rbx ;"
1484 		"adcx %%r13, %%r13 ;"
1485 		"adcx %%r14, %%r14 ;"
1486 
		/*
		 * Add the diagonal squares.  The mov and mulx instructions
		 * interleaved with addq/adcq do not modify CF, so this is
		 * one continuous carry chain.
		 */
1487 		"movq   (%1), %%rdx ;"
1488 		"mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
1489 		/*******************/
1490 		"movq %%rax,  0(%0) ;"
1491 		"addq %%rcx,  %%r8 ;"
1492 		"movq  %%r8,  8(%0) ;"
1493 		"movq  8(%1), %%rdx ;"
1494 		"mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
1495 		"adcq %%rax,  %%r9 ;"
1496 		"movq  %%r9, 16(%0) ;"
1497 		"adcq %%rcx, %%r10 ;"
1498 		"movq %%r10, 24(%0) ;"
1499 		"movq 16(%1), %%rdx ;"
1500 		"mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
1501 		"adcq %%rax, %%r11 ;"
1502 		"movq %%r11, 32(%0) ;"
1503 		"adcq %%rcx, %%rbx ;"
1504 		"movq %%rbx, 40(%0) ;"
1505 		"movq 24(%1), %%rdx ;"
1506 		"mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
1507 		"adcq %%rax, %%r13 ;"
1508 		"movq %%r13, 48(%0) ;"
1509 		"adcq %%rcx, %%r14 ;"
1510 		"movq %%r14, 56(%0) ;"
		/*
		 * No output operands: the result is written through the
		 * pointer in %0, which is what the "memory" clobber covers.
		 */
1511 		:
1512 		: "r"(c), "r"(a)
1513 		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1514 		  "%r10", "%r11", "%r13", "%r14", "%r15");
1515 }
1516 
/*
 * Squares one 256-bit integer using MULX, ADD/ADC and SHLD (no ADCX/ADOX).
 *
 * c: destination, eight 64-bit words (byte offsets 0..56 are stored).
 * a: four 64-bit words, read at byte offsets 0..24.
 *
 * The square is built in three steps:
 *   1. sum the off-diagonal products A[i]*A[j] (i < j) into the seven-word
 *      value r14:r13:r15:r11:r10:r9:r8;
 *   2. double it with a chain of shldq/shlq by one bit;
 *   3. add the diagonal squares A[i]^2 with an ADD/ADC chain and store.
 *
 * Words of c are stored while words of a are still being loaded, so c must
 * not overlap a.
 */
1517 static void sqr_256x256_integer_bmi2(u64 *const c, const u64 *const a)
1518 {
1519 	asm volatile(
1520 		"movq  8(%1), %%rdx        ;" /* A[1]      */
1521 		"mulx   (%1),  %%r8,  %%r9 ;" /* A[0]*A[1] */
1522 		"mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
1523 		"mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
1524 
1525 		"movq 16(%1), %%rdx        ;" /* A[2]      */
1526 		"mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
		/* the high half lands in rdx, replacing the multiplier A[2] */
1527 		"mulx   (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
1528 
1529 		"addq %%rax,  %%r9 ;"
1530 		"adcq %%rdx, %%r10 ;"
1531 		"adcq %%rcx, %%r11 ;"
1532 		"adcq %%r14, %%r15 ;"
1533 		"adcq    $0, %%r13 ;"
1534 		"movq    $0, %%r14 ;" /* mov, not xor: CF must survive for the adcq below */
1535 		"adcq    $0, %%r14 ;"
1536 
1537 		"movq   (%1), %%rdx        ;" /* A[0]      */
1538 		"mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
1539 
1540 		"addq %%rax, %%r10 ;"
1541 		"adcq %%rcx, %%r11 ;"
1542 		"adcq    $0, %%r15 ;"
1543 		"adcq    $0, %%r13 ;"
1544 		"adcq    $0, %%r14 ;"
1545 
		/* double the off-diagonal sum, most significant word first */
1546 		"shldq $1, %%r13, %%r14 ;"
1547 		"shldq $1, %%r15, %%r13 ;"
1548 		"shldq $1, %%r11, %%r15 ;"
1549 		"shldq $1, %%r10, %%r11 ;"
1550 		"shldq $1,  %%r9, %%r10 ;"
1551 		"shldq $1,  %%r8,  %%r9 ;"
1552 		"shlq  $1,  %%r8        ;"
1553 
		/*
		 * rdx still holds A[0] from the load before A[0]*A[3].
		 * The mov and mulx instructions interleaved with addq/adcq
		 * below do not modify CF, so this is one carry chain.
		 */
1554 		/*******************/
1555 		"mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
1556 		/*******************/
1557 		"movq %%rax,  0(%0) ;"
1558 		"addq %%rcx,  %%r8 ;"
1559 		"movq  %%r8,  8(%0) ;"
1560 		"movq  8(%1), %%rdx ;"
1561 		"mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
1562 		"adcq %%rax,  %%r9 ;"
1563 		"movq  %%r9, 16(%0) ;"
1564 		"adcq %%rcx, %%r10 ;"
1565 		"movq %%r10, 24(%0) ;"
1566 		"movq 16(%1), %%rdx ;"
1567 		"mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
1568 		"adcq %%rax, %%r11 ;"
1569 		"movq %%r11, 32(%0) ;"
1570 		"adcq %%rcx, %%r15 ;"
1571 		"movq %%r15, 40(%0) ;"
1572 		"movq 24(%1), %%rdx ;"
1573 		"mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
1574 		"adcq %%rax, %%r13 ;"
1575 		"movq %%r13, 48(%0) ;"
1576 		"adcq %%rcx, %%r14 ;"
1577 		"movq %%r14, 56(%0) ;"
		/*
		 * No output operands: the result is written through the
		 * pointer in %0, which is what the "memory" clobber covers.
		 */
1578 		:
1579 		: "r"(c), "r"(a)
1580 		: "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1581 		  "%r11", "%r13", "%r14", "%r15");
1582 }
1583 
/*
 * Folds one 512-bit integer down to a 256-bit integer using MULX and
 * ADCX/ADOX.
 *
 * c: destination, four 64-bit words (byte offsets 0..24 are stored).
 * a: eight 64-bit words, read at byte offsets 0..56.
 *
 * The code computes low + 38 * high, where low is the lower four words of a
 * and high the upper four.  The word that overflows past bit 255 is
 * multiplied by 38 and added back in, and if that addition carries out
 * again, 38 is added to the lowest word once more.  The existing comment
 * "38 = 2^256" presumably means 2^256 = 38 modulo p = 2^255 - 19, the
 * Curve25519 prime suggested by the eltfp25519 naming.  No comparison
 * against p is made here, so the result is only known to fit in 256 bits.
 */
1584 static void red_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
1585 {
1586 	asm volatile(
1587 		"movl    $38, %%edx ;"	/* 2*c = 38 = 2^256 */
1588 		"mulx 32(%1),  %%r8, %%r10 ;" /* c*C[4] */
1589 		"xorl %%ebx, %%ebx ;" /* rbx = 0; clears CF and OF */
		/* adcx chains the 38*C[i] products, adox adds C[0..3] */
1590 		"adox   (%1),  %%r8 ;"
1591 		"mulx 40(%1),  %%r9, %%r11 ;" /* c*C[5] */
1592 		"adcx %%r10,  %%r9 ;"
1593 		"adox  8(%1),  %%r9 ;"
1594 		"mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
1595 		"adcx %%r11, %%r10 ;"
1596 		"adox 16(%1), %%r10 ;"
1597 		"mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
1598 		"adcx %%rax, %%r11 ;"
1599 		"adox 24(%1), %%r11 ;"
1600 		/***************************************/
		/* rcx = word above bit 255: high half of 38*C[7] plus both carries */
1601 		"adcx %%rbx, %%rcx ;"
1602 		"adox  %%rbx, %%rcx ;"
1603 		"imul %%rdx, %%rcx ;" /* 38 * overflow word; cf=0, of=0 */
1604 		"adcx %%rcx,  %%r8 ;"
1605 		"adcx %%rbx,  %%r9 ;"
1606 		"movq  %%r9,  8(%0) ;"
1607 		"adcx %%rbx, %%r10 ;"
1608 		"movq %%r10, 16(%0) ;"
1609 		"adcx %%rbx, %%r11 ;"
1610 		"movq %%r11, 24(%0) ;"
		/* mov keeps CF; ecx = CF ? 38 : 0, added to the lowest word */
1611 		"mov     $0, %%ecx ;"
1612 		"cmovc %%edx, %%ecx ;"
1613 		"addq %%rcx,  %%r8 ;"
1614 		"movq  %%r8,   (%0) ;"
		/*
		 * No output operands: the result is written through the
		 * pointer in %0, which is what the "memory" clobber covers.
		 */
1615 		:
1616 		: "r"(c), "r"(a)
1617 		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
1618 		  "%r10", "%r11");
1619 }
1620 
/*
 * Fold the 8-word value at a (C[0..7]) into a 4-word value at c that is
 * congruent to it modulo 2^255-19: c = C[0..3] + 38 * C[4..7], using
 * 2^256 = 38 (mod 2^255-19).
 *
 * This variant uses mulx with a single add/adc carry chain: first the
 * 38*C[4+i] partial products are combined, then the low words C[i] are
 * added. The word that spills past 2^256 is multiplied by 38 and folded
 * back in, and a final carry out of that fold adds one more 38 to the
 * lowest word. No comparison against 2^255-19 is made here.
 *
 * All loads from a happen before the first store to c.
 */
1621 static void red_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
1622 {
1623 	asm volatile(
1624 		"movl    $38, %%edx ;"	/* 2*c = 38 = 2^256 mod 2^255-19 */
1625 		"mulx 32(%1),  %%r8, %%r10 ;" /* c*C[4] */
1626 		"mulx 40(%1),  %%r9, %%r11 ;" /* c*C[5] */
1627 		"addq %%r10,  %%r9 ;"
1628 		"mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
1629 		"adcq %%r11, %%r10 ;"
1630 		"mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
1631 		"adcq %%rax, %%r11 ;"
1632 		/***************************************/
1633 		"adcq    $0, %%rcx ;" /* rcx = word above 2^256 of 38*C[4..7] */
1634 		"addq   (%1),  %%r8 ;" /* now add the low half C[0..3] */
1635 		"adcq  8(%1),  %%r9 ;"
1636 		"adcq 16(%1), %%r10 ;"
1637 		"adcq 24(%1), %%r11 ;"
1638 		"adcq     $0, %%rcx ;"
1639 		"imul %%rdx, %%rcx ;" /* 38 * (word above 2^256), cf=0 */
1640 		"addq %%rcx,  %%r8 ;"
1641 		"adcq    $0,  %%r9 ;"
1642 		"movq  %%r9,  8(%0) ;"
1643 		"adcq    $0, %%r10 ;"
1644 		"movq %%r10, 16(%0) ;"
1645 		"adcq    $0, %%r11 ;"
1646 		"movq %%r11, 24(%0) ;"
1647 		"mov     $0, %%ecx ;" /* mov leaves the flags untouched */
1648 		"cmovc %%edx, %%ecx ;" /* ecx = carry out of the fold ? 38 : 0 */
1649 		"addq %%rcx,  %%r8 ;"
1650 		"movq  %%r8,   (%0) ;"
1651 		:
1652 		: "r"(c), "r"(a)
1653 		: "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1654 		  "%r11");
1655 }
1656 
/*
 * c = a + b on 4-word field elements, kept within four words.
 *
 * The words are summed with one adcx carry chain. A carry out of the top
 * word stands for 2^256, which is folded back in as 38; should that fold
 * itself carry out, another 38 is added to the lowest word. The result is
 * congruent to a + b modulo 2^255-19; no comparison against 2^255-19 is
 * made here.
 *
 * All loads from a and b happen before the first store to c.
 */
1657 static __always_inline void
1658 add_eltfp25519_1w_adx(u64 *const c, const u64 *const a, const u64 *const b)
1659 {
1660 	asm volatile(
1661 		"mov     $38, %%eax ;"
1662 		"xorl  %%ecx, %%ecx ;" /* rcx = 0; also clears cf */
1663 		"movq   (%2),  %%r8 ;"
1664 		"adcx   (%1),  %%r8 ;"
1665 		"movq  8(%2),  %%r9 ;"
1666 		"adcx  8(%1),  %%r9 ;"
1667 		"movq 16(%2), %%r10 ;"
1668 		"adcx 16(%1), %%r10 ;"
1669 		"movq 24(%2), %%r11 ;"
1670 		"adcx 24(%1), %%r11 ;"
1671 		"cmovc %%eax, %%ecx ;" /* ecx = carry out ? 38 : 0 */
1672 		"xorl %%eax, %%eax  ;" /* rax = 0; clears cf for the next chain */
1673 		"adcx %%rcx,  %%r8  ;"
1674 		"adcx %%rax,  %%r9  ;"
1675 		"movq  %%r9,  8(%0) ;"
1676 		"adcx %%rax, %%r10  ;"
1677 		"movq %%r10, 16(%0) ;"
1678 		"adcx %%rax, %%r11  ;"
1679 		"movq %%r11, 24(%0) ;"
1680 		"mov     $38, %%ecx ;" /* mov leaves the flags untouched */
1681 		"cmovc %%ecx, %%eax ;" /* eax = second carry out ? 38 : 0 */
1682 		"addq %%rax,  %%r8  ;"
1683 		"movq  %%r8,   (%0) ;"
1684 		:
1685 		: "r"(c), "r"(a), "r"(b)
1686 		: "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
1687 }
1688 
/*
 * c = a + b on 4-word field elements, kept within four words.
 *
 * Same computation as the adcx-based variant but with a plain add/adc
 * chain: a carry out of the top word stands for 2^256 and is folded back in
 * as 38; should that fold itself carry out, another 38 is added to the
 * lowest word. The result is congruent to a + b modulo 2^255-19; no
 * comparison against 2^255-19 is made here.
 *
 * All loads from a and b happen before the first store to c.
 */
1689 static __always_inline void
1690 add_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a, const u64 *const b)
1691 {
1692 	asm volatile(
1693 		"mov     $38, %%eax ;"
1694 		"movq   (%2),  %%r8 ;"
1695 		"addq   (%1),  %%r8 ;"
1696 		"movq  8(%2),  %%r9 ;"
1697 		"adcq  8(%1),  %%r9 ;"
1698 		"movq 16(%2), %%r10 ;"
1699 		"adcq 16(%1), %%r10 ;"
1700 		"movq 24(%2), %%r11 ;"
1701 		"adcq 24(%1), %%r11 ;"
1702 		"mov      $0, %%ecx ;" /* mov leaves the flags untouched */
1703 		"cmovc %%eax, %%ecx ;" /* ecx = carry out ? 38 : 0 */
1704 		"addq %%rcx,  %%r8  ;"
1705 		"adcq    $0,  %%r9  ;"
1706 		"movq  %%r9,  8(%0) ;"
1707 		"adcq    $0, %%r10  ;"
1708 		"movq %%r10, 16(%0) ;"
1709 		"adcq    $0, %%r11  ;"
1710 		"movq %%r11, 24(%0) ;"
1711 		"mov     $0, %%ecx  ;"
1712 		"cmovc %%eax, %%ecx ;" /* ecx = second carry out ? 38 : 0 */
1713 		"addq %%rcx,  %%r8  ;"
1714 		"movq  %%r8,   (%0) ;"
1715 		:
1716 		: "r"(c), "r"(a), "r"(b)
1717 		: "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
1718 }
1719 
/*
 * c = a - b on 4-word field elements, kept within four words.
 *
 * The words are subtracted with one sub/sbb borrow chain. A borrow out of
 * the top word means the 4-word result is 2^256 too large, which is
 * compensated by subtracting 38 (2^256 = 38 mod 2^255-19); should that
 * subtraction borrow again, another 38 is taken off the lowest word. The
 * result is congruent to a - b modulo 2^255-19; no comparison against
 * 2^255-19 is made here.
 *
 * All loads from a and b happen before the first store to c.
 */
1720 static __always_inline void
1721 sub_eltfp25519_1w(u64 *const c, const u64 *const a, const u64 *const b)
1722 {
1723 	asm volatile(
1724 		"mov     $38, %%eax ;"
1725 		"movq   (%1),  %%r8 ;"
1726 		"subq   (%2),  %%r8 ;"
1727 		"movq  8(%1),  %%r9 ;"
1728 		"sbbq  8(%2),  %%r9 ;"
1729 		"movq 16(%1), %%r10 ;"
1730 		"sbbq 16(%2), %%r10 ;"
1731 		"movq 24(%1), %%r11 ;"
1732 		"sbbq 24(%2), %%r11 ;"
1733 		"mov      $0, %%ecx ;" /* mov leaves the flags untouched */
1734 		"cmovc %%eax, %%ecx ;" /* ecx = borrow out ? 38 : 0 */
1735 		"subq %%rcx,  %%r8  ;"
1736 		"sbbq    $0,  %%r9  ;"
1737 		"movq  %%r9,  8(%0) ;"
1738 		"sbbq    $0, %%r10  ;"
1739 		"movq %%r10, 16(%0) ;"
1740 		"sbbq    $0, %%r11  ;"
1741 		"movq %%r11, 24(%0) ;"
1742 		"mov     $0, %%ecx  ;"
1743 		"cmovc %%eax, %%ecx ;" /* ecx = second borrow out ? 38 : 0 */
1744 		"subq %%rcx,  %%r8  ;"
1745 		"movq  %%r8,   (%0) ;"
1746 		:
1747 		: "r"(c), "r"(a), "r"(b)
1748 		: "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
1749 }
1750 
1751 /* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666 */
/*
 * c = a24 * a on a 4-word field element, kept within four words.
 *
 * The four mulx partial products are chained with add/adc. The fifth word
 * of the product (the part above 2^256) is multiplied by 38 and folded
 * back in, and a carry out of that fold adds one more 38 to the lowest
 * word. The result is congruent to 121666 * a modulo 2^255-19; no
 * comparison against 2^255-19 is made here.
 *
 * All loads from a happen before the first store to c.
 */
1752 static __always_inline void
1753 mul_a24_eltfp25519_1w(u64 *const c, const u64 *const a)
1754 {
1755 	const u64 a24 = 121666;
1756 	asm volatile(
1757 		"movq     %2, %%rdx ;" /* rdx = a24, the implicit mulx operand */
1758 		"mulx   (%1),  %%r8, %%r10 ;"
1759 		"mulx  8(%1),  %%r9, %%r11 ;"
1760 		"addq %%r10,  %%r9 ;"
1761 		"mulx 16(%1), %%r10, %%rax ;"
1762 		"adcq %%r11, %%r10 ;"
1763 		"mulx 24(%1), %%r11, %%rcx ;"
1764 		"adcq %%rax, %%r11 ;"
1765 		/**************************/
1766 		"adcq    $0, %%rcx ;" /* rcx = word above 2^256 */
1767 		"movl   $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/
1768 		"imul %%rdx, %%rcx ;"
1769 		"addq %%rcx,  %%r8 ;"
1770 		"adcq    $0,  %%r9 ;"
1771 		"movq  %%r9,  8(%0) ;"
1772 		"adcq    $0, %%r10 ;"
1773 		"movq %%r10, 16(%0) ;"
1774 		"adcq    $0, %%r11 ;"
1775 		"movq %%r11, 24(%0) ;"
1776 		"mov     $0, %%ecx ;" /* mov leaves the flags untouched */
1777 		"cmovc %%edx, %%ecx ;" /* ecx = carry out of the fold ? 38 : 0 */
1778 		"addq %%rcx,  %%r8 ;"
1779 		"movq  %%r8,   (%0) ;"
1780 		:
1781 		: "r"(c), "r"(a), "r"(a24)
1782 		: "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
1783 		  "%r11");
1784 }
1785 
1786 static void inv_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
1787 {
1788 	struct {
1789 		eltfp25519_1w_buffer buffer;
1790 		eltfp25519_1w x0, x1, x2;
1791 	} __aligned(32) m;
1792 	u64 *T[4];
1793 
1794 	T[0] = m.x0;
1795 	T[1] = c; /* x^(-1) */
1796 	T[2] = m.x1;
1797 	T[3] = m.x2;
1798 
1799 	copy_eltfp25519_1w(T[1], a);
1800 	sqrn_eltfp25519_1w_adx(T[1], 1);
1801 	copy_eltfp25519_1w(T[2], T[1]);
1802 	sqrn_eltfp25519_1w_adx(T[2], 2);
1803 	mul_eltfp25519_1w_adx(T[0], a, T[2]);
1804 	mul_eltfp25519_1w_adx(T[1], T[1], T[0]);
1805 	copy_eltfp25519_1w(T[2], T[1]);
1806 	sqrn_eltfp25519_1w_adx(T[2], 1);
1807 	mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
1808 	copy_eltfp25519_1w(T[2], T[0]);
1809 	sqrn_eltfp25519_1w_adx(T[2], 5);
1810 	mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
1811 	copy_eltfp25519_1w(T[2], T[0]);
1812 	sqrn_eltfp25519_1w_adx(T[2], 10);
1813 	mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
1814 	copy_eltfp25519_1w(T[3], T[2]);
1815 	sqrn_eltfp25519_1w_adx(T[3], 20);
1816 	mul_eltfp25519_1w_adx(T[3], T[3], T[2]);
1817 	sqrn_eltfp25519_1w_adx(T[3], 10);
1818 	mul_eltfp25519_1w_adx(T[3], T[3], T[0]);
1819 	copy_eltfp25519_1w(T[0], T[3]);
1820 	sqrn_eltfp25519_1w_adx(T[0], 50);
1821 	mul_eltfp25519_1w_adx(T[0], T[0], T[3]);
1822 	copy_eltfp25519_1w(T[2], T[0]);
1823 	sqrn_eltfp25519_1w_adx(T[2], 100);
1824 	mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
1825 	sqrn_eltfp25519_1w_adx(T[2], 50);
1826 	mul_eltfp25519_1w_adx(T[2], T[2], T[3]);
1827 	sqrn_eltfp25519_1w_adx(T[2], 5);
1828 	mul_eltfp25519_1w_adx(T[1], T[1], T[2]);
1829 
1830 	memzero_explicit(&m, sizeof(m));
1831 }
1832 
1833 static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
1834 {
1835 	struct {
1836 		eltfp25519_1w_buffer buffer;
1837 		eltfp25519_1w x0, x1, x2;
1838 	} __aligned(32) m;
1839 	u64 *T[5];
1840 
1841 	T[0] = m.x0;
1842 	T[1] = c; /* x^(-1) */
1843 	T[2] = m.x1;
1844 	T[3] = m.x2;
1845 
1846 	copy_eltfp25519_1w(T[1], a);
1847 	sqrn_eltfp25519_1w_bmi2(T[1], 1);
1848 	copy_eltfp25519_1w(T[2], T[1]);
1849 	sqrn_eltfp25519_1w_bmi2(T[2], 2);
1850 	mul_eltfp25519_1w_bmi2(T[0], a, T[2]);
1851 	mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]);
1852 	copy_eltfp25519_1w(T[2], T[1]);
1853 	sqrn_eltfp25519_1w_bmi2(T[2], 1);
1854 	mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
1855 	copy_eltfp25519_1w(T[2], T[0]);
1856 	sqrn_eltfp25519_1w_bmi2(T[2], 5);
1857 	mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
1858 	copy_eltfp25519_1w(T[2], T[0]);
1859 	sqrn_eltfp25519_1w_bmi2(T[2], 10);
1860 	mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
1861 	copy_eltfp25519_1w(T[3], T[2]);
1862 	sqrn_eltfp25519_1w_bmi2(T[3], 20);
1863 	mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]);
1864 	sqrn_eltfp25519_1w_bmi2(T[3], 10);
1865 	mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]);
1866 	copy_eltfp25519_1w(T[0], T[3]);
1867 	sqrn_eltfp25519_1w_bmi2(T[0], 50);
1868 	mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]);
1869 	copy_eltfp25519_1w(T[2], T[0]);
1870 	sqrn_eltfp25519_1w_bmi2(T[2], 100);
1871 	mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
1872 	sqrn_eltfp25519_1w_bmi2(T[2], 50);
1873 	mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]);
1874 	sqrn_eltfp25519_1w_bmi2(T[2], 5);
1875 	mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]);
1876 
1877 	memzero_explicit(&m, sizeof(m));
1878 }
1879 
1880 /*
 * Given c, a 256-bit number held in c[0..3], fred_eltfp25519_1w updates c
 * in place with the equivalent number such that 0 <= c < 2**255-19.
 *
 * Bit 255 is stripped and accounted for as 19 (2^255 = 19 mod 2^255-19)
 * together with a trial addition of another 19. If that sum reaches 2^255,
 * the value was at least 2^255-19, and clearing bit 255 again completes the
 * subtraction of the prime; otherwise the trial 19 is subtracted back.
 * Both choices are made with cmov, without branching on the value.
 */
1883 static __always_inline void fred_eltfp25519_1w(u64 *const c)
1884 {
1885 	u64 tmp0 = 38, tmp1 = 19;
1886 	asm volatile(
1887 		"btrq   $63,    %3 ;" /* Put bit 255 in carry flag and clear */
1888 		"cmovncl %k5,   %k4 ;" /* c[255] ? 38 : 19 */
1889 
1890 		/* Add either 19 or 38 to c */
1891 		"addq    %4,   %0 ;"
1892 		"adcq    $0,   %1 ;"
1893 		"adcq    $0,   %2 ;"
1894 		"adcq    $0,   %3 ;"
1895 
1896 		/* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */
1897 		"movl    $0,  %k4 ;" /* mov keeps the sign flag of the last adcq */
1898 		"cmovnsl %k5,  %k4 ;" /* c[255] ? 0 : 19 */
1899 		"btrq   $63,   %3 ;" /* Clear bit 255 */
1900 
1901 		/* Subtract 19 if necessary */
1902 		"subq    %4,   %0 ;"
1903 		"sbbq    $0,   %1 ;"
1904 		"sbbq    $0,   %2 ;"
1905 		"sbbq    $0,   %3 ;"
1906 
1907 		: "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "+r"(tmp0),
1908 		  "+r"(tmp1)
1909 		:
1910 		: "memory", "cc");
1911 }
1912 
/*
 * Conditionally swap two 4-word values: when bit is non-zero the contents
 * of px[0..3] and py[0..3] are exchanged, otherwise both are left as they
 * were. The same instruction sequence runs either way; the decision is
 * applied with cmovnz on the zero flag set once by the initial test.
 *
 * Operands %0-%3 are px[0..3], %4-%7 are py[0..3], %8 is the scratch
 * register temp and %9 is bit.
 */
1913 static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py)
1914 {
1915 	u64 temp;
1916 	asm volatile(
1917 		"test %9, %9 ;" /* zf = (bit == 0); mov/cmov below keep it */
1918 		"movq %0, %8 ;" /* temp = px[0] */
1919 		"cmovnzq %4, %0 ;" /* px[0] = py[0] if bit */
1920 		"cmovnzq %8, %4 ;" /* py[0] = old px[0] if bit */
1921 		"movq %1, %8 ;"
1922 		"cmovnzq %5, %1 ;"
1923 		"cmovnzq %8, %5 ;"
1924 		"movq %2, %8 ;"
1925 		"cmovnzq %6, %2 ;"
1926 		"cmovnzq %8, %6 ;"
1927 		"movq %3, %8 ;"
1928 		"cmovnzq %7, %3 ;"
1929 		"cmovnzq %8, %7 ;"
1930 		: "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]),
1931 		  "+r"(py[0]), "+r"(py[1]), "+r"(py[2]), "+r"(py[3]),
1932 		  "=r"(temp)
1933 		: "r"(bit)
1934 		: "cc"
1935 	);
1936 }
1937 
/*
 * Conditionally overwrite a 4-word value: when bit is non-zero px[0..3]
 * receives py[0..3], otherwise px is left as it was. py is only read. The
 * same instruction sequence runs either way; the decision is applied with
 * cmovnz on the zero flag set once by the initial test.
 *
 * Operands %0-%3 are px[0..3], %4 is bit and %5-%8 are py[0..3] (register
 * or memory).
 */
1938 static __always_inline void cselect(u8 bit, u64 *const px, const u64 *const py)
1939 {
1940 	asm volatile(
1941 		"test %4, %4 ;" /* zf = (bit == 0); cmov below keeps it */
1942 		"cmovnzq %5, %0 ;"
1943 		"cmovnzq %6, %1 ;"
1944 		"cmovnzq %7, %2 ;"
1945 		"cmovnzq %8, %3 ;"
1946 		: "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3])
1947 		: "r"(bit), "rm"(py[0]), "rm"(py[1]), "rm"(py[2]), "rm"(py[3])
1948 		: "cc"
1949 	);
1950 }
1951 
1952 static void curve25519_adx(u8 shared[CURVE25519_KEY_SIZE],
1953 			   const u8 private_key[CURVE25519_KEY_SIZE],
1954 			   const u8 session_key[CURVE25519_KEY_SIZE])
1955 {
1956 	struct {
1957 		u64 buffer[4 * NUM_WORDS_ELTFP25519];
1958 		u64 coordinates[4 * NUM_WORDS_ELTFP25519];
1959 		u64 workspace[6 * NUM_WORDS_ELTFP25519];
1960 		u8 session[CURVE25519_KEY_SIZE];
1961 		u8 private[CURVE25519_KEY_SIZE];
1962 	} __aligned(32) m;
1963 
1964 	int i = 0, j = 0;
1965 	u64 prev = 0;
1966 	u64 *const X1 = (u64 *)m.session;
1967 	u64 *const key = (u64 *)m.private;
1968 	u64 *const Px = m.coordinates + 0;
1969 	u64 *const Pz = m.coordinates + 4;
1970 	u64 *const Qx = m.coordinates + 8;
1971 	u64 *const Qz = m.coordinates + 12;
1972 	u64 *const X2 = Qx;
1973 	u64 *const Z2 = Qz;
1974 	u64 *const X3 = Px;
1975 	u64 *const Z3 = Pz;
1976 	u64 *const X2Z2 = Qx;
1977 	u64 *const X3Z3 = Px;
1978 
1979 	u64 *const A = m.workspace + 0;
1980 	u64 *const B = m.workspace + 4;
1981 	u64 *const D = m.workspace + 8;
1982 	u64 *const C = m.workspace + 12;
1983 	u64 *const DA = m.workspace + 16;
1984 	u64 *const CB = m.workspace + 20;
1985 	u64 *const AB = A;
1986 	u64 *const DC = D;
1987 	u64 *const DACB = DA;
1988 
1989 	memcpy(m.private, private_key, sizeof(m.private));
1990 	memcpy(m.session, session_key, sizeof(m.session));
1991 
1992 	curve25519_clamp_secret(m.private);
1993 
1994 	/* As in the draft:
1995 	 * When receiving such an array, implementations of curve25519
1996 	 * MUST mask the most-significant bit in the final byte. This
1997 	 * is done to preserve compatibility with point formats which
1998 	 * reserve the sign bit for use in other protocols and to
1999 	 * increase resistance to implementation fingerprinting
2000 	 */
2001 	m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
2002 
2003 	copy_eltfp25519_1w(Px, X1);
2004 	setzero_eltfp25519_1w(Pz);
2005 	setzero_eltfp25519_1w(Qx);
2006 	setzero_eltfp25519_1w(Qz);
2007 
2008 	Pz[0] = 1;
2009 	Qx[0] = 1;
2010 
2011 	/* main-loop */
2012 	prev = 0;
2013 	j = 62;
2014 	for (i = 3; i >= 0; --i) {
2015 		while (j >= 0) {
2016 			u64 bit = (key[i] >> j) & 0x1;
2017 			u64 swap = bit ^ prev;
2018 			prev = bit;
2019 
2020 			add_eltfp25519_1w_adx(A, X2, Z2);	/* A = (X2+Z2) */
2021 			sub_eltfp25519_1w(B, X2, Z2);		/* B = (X2-Z2) */
2022 			add_eltfp25519_1w_adx(C, X3, Z3);	/* C = (X3+Z3) */
2023 			sub_eltfp25519_1w(D, X3, Z3);		/* D = (X3-Z3) */
2024 			mul_eltfp25519_2w_adx(DACB, AB, DC);	/* [DA|CB] = [A|B]*[D|C] */
2025 
2026 			cselect(swap, A, C);
2027 			cselect(swap, B, D);
2028 
2029 			sqr_eltfp25519_2w_adx(AB);		/* [AA|BB] = [A^2|B^2] */
2030 			add_eltfp25519_1w_adx(X3, DA, CB);	/* X3 = (DA+CB) */
2031 			sub_eltfp25519_1w(Z3, DA, CB);		/* Z3 = (DA-CB) */
2032 			sqr_eltfp25519_2w_adx(X3Z3);		/* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
2033 
2034 			copy_eltfp25519_1w(X2, B);		/* X2 = B^2 */
2035 			sub_eltfp25519_1w(Z2, A, B);		/* Z2 = E = AA-BB */
2036 
2037 			mul_a24_eltfp25519_1w(B, Z2);		/* B = a24*E */
2038 			add_eltfp25519_1w_adx(B, B, X2);	/* B = a24*E+B */
2039 			mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB);	/* [X2|Z2] = [B|E]*[A|a24*E+B] */
2040 			mul_eltfp25519_1w_adx(Z3, Z3, X1);	/* Z3 = Z3*X1 */
2041 			--j;
2042 		}
2043 		j = 63;
2044 	}
2045 
2046 	inv_eltfp25519_1w_adx(A, Qz);
2047 	mul_eltfp25519_1w_adx((u64 *)shared, Qx, A);
2048 	fred_eltfp25519_1w((u64 *)shared);
2049 
2050 	memzero_explicit(&m, sizeof(m));
2051 }
2052 
2053 static void curve25519_adx_base(u8 session_key[CURVE25519_KEY_SIZE],
2054 				const u8 private_key[CURVE25519_KEY_SIZE])
2055 {
2056 	struct {
2057 		u64 buffer[4 * NUM_WORDS_ELTFP25519];
2058 		u64 coordinates[4 * NUM_WORDS_ELTFP25519];
2059 		u64 workspace[4 * NUM_WORDS_ELTFP25519];
2060 		u8 private[CURVE25519_KEY_SIZE];
2061 	} __aligned(32) m;
2062 
2063 	const int ite[4] = { 64, 64, 64, 63 };
2064 	const int q = 3;
2065 	u64 swap = 1;
2066 
2067 	int i = 0, j = 0, k = 0;
2068 	u64 *const key = (u64 *)m.private;
2069 	u64 *const Ur1 = m.coordinates + 0;
2070 	u64 *const Zr1 = m.coordinates + 4;
2071 	u64 *const Ur2 = m.coordinates + 8;
2072 	u64 *const Zr2 = m.coordinates + 12;
2073 
2074 	u64 *const UZr1 = m.coordinates + 0;
2075 	u64 *const ZUr2 = m.coordinates + 8;
2076 
2077 	u64 *const A = m.workspace + 0;
2078 	u64 *const B = m.workspace + 4;
2079 	u64 *const C = m.workspace + 8;
2080 	u64 *const D = m.workspace + 12;
2081 
2082 	u64 *const AB = m.workspace + 0;
2083 	u64 *const CD = m.workspace + 8;
2084 
2085 	const u64 *const P = table_ladder_8k;
2086 
2087 	memcpy(m.private, private_key, sizeof(m.private));
2088 
2089 	curve25519_clamp_secret(m.private);
2090 
2091 	setzero_eltfp25519_1w(Ur1);
2092 	setzero_eltfp25519_1w(Zr1);
2093 	setzero_eltfp25519_1w(Zr2);
2094 	Ur1[0] = 1;
2095 	Zr1[0] = 1;
2096 	Zr2[0] = 1;
2097 
2098 	/* G-S */
2099 	Ur2[3] = 0x1eaecdeee27cab34UL;
2100 	Ur2[2] = 0xadc7a0b9235d48e2UL;
2101 	Ur2[1] = 0xbbf095ae14b2edf8UL;
2102 	Ur2[0] = 0x7e94e1fec82faabdUL;
2103 
2104 	/* main-loop */
2105 	j = q;
2106 	for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
2107 		while (j < ite[i]) {
2108 			u64 bit = (key[i] >> j) & 0x1;
2109 			k = (64 * i + j - q);
2110 			swap = swap ^ bit;
2111 			cswap(swap, Ur1, Ur2);
2112 			cswap(swap, Zr1, Zr2);
2113 			swap = bit;
2114 			/* Addition */
2115 			sub_eltfp25519_1w(B, Ur1, Zr1);		/* B = Ur1-Zr1 */
2116 			add_eltfp25519_1w_adx(A, Ur1, Zr1);	/* A = Ur1+Zr1 */
2117 			mul_eltfp25519_1w_adx(C, &P[4 * k], B);	/* C = M0-B */
2118 			sub_eltfp25519_1w(B, A, C);		/* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
2119 			add_eltfp25519_1w_adx(A, A, C);		/* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
2120 			sqr_eltfp25519_2w_adx(AB);		/* A = A^2      |  B = B^2 */
2121 			mul_eltfp25519_2w_adx(UZr1, ZUr2, AB);	/* Ur1 = Zr2*A  |  Zr1 = Ur2*B */
2122 			++j;
2123 		}
2124 		j = 0;
2125 	}
2126 
2127 	/* Doubling */
2128 	for (i = 0; i < q; ++i) {
2129 		add_eltfp25519_1w_adx(A, Ur1, Zr1);	/*  A = Ur1+Zr1 */
2130 		sub_eltfp25519_1w(B, Ur1, Zr1);		/*  B = Ur1-Zr1 */
2131 		sqr_eltfp25519_2w_adx(AB);		/*  A = A**2     B = B**2 */
2132 		copy_eltfp25519_1w(C, B);		/*  C = B */
2133 		sub_eltfp25519_1w(B, A, B);		/*  B = A-B */
2134 		mul_a24_eltfp25519_1w(D, B);		/*  D = my_a24*B */
2135 		add_eltfp25519_1w_adx(D, D, C);		/*  D = D+C */
2136 		mul_eltfp25519_2w_adx(UZr1, AB, CD);	/*  Ur1 = A*B   Zr1 = Zr1*A */
2137 	}
2138 
2139 	/* Convert to affine coordinates */
2140 	inv_eltfp25519_1w_adx(A, Zr1);
2141 	mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A);
2142 	fred_eltfp25519_1w((u64 *)session_key);
2143 
2144 	memzero_explicit(&m, sizeof(m));
2145 }
2146 
2147 static void curve25519_bmi2(u8 shared[CURVE25519_KEY_SIZE],
2148 			    const u8 private_key[CURVE25519_KEY_SIZE],
2149 			    const u8 session_key[CURVE25519_KEY_SIZE])
2150 {
2151 	struct {
2152 		u64 buffer[4 * NUM_WORDS_ELTFP25519];
2153 		u64 coordinates[4 * NUM_WORDS_ELTFP25519];
2154 		u64 workspace[6 * NUM_WORDS_ELTFP25519];
2155 		u8 session[CURVE25519_KEY_SIZE];
2156 		u8 private[CURVE25519_KEY_SIZE];
2157 	} __aligned(32) m;
2158 
2159 	int i = 0, j = 0;
2160 	u64 prev = 0;
2161 	u64 *const X1 = (u64 *)m.session;
2162 	u64 *const key = (u64 *)m.private;
2163 	u64 *const Px = m.coordinates + 0;
2164 	u64 *const Pz = m.coordinates + 4;
2165 	u64 *const Qx = m.coordinates + 8;
2166 	u64 *const Qz = m.coordinates + 12;
2167 	u64 *const X2 = Qx;
2168 	u64 *const Z2 = Qz;
2169 	u64 *const X3 = Px;
2170 	u64 *const Z3 = Pz;
2171 	u64 *const X2Z2 = Qx;
2172 	u64 *const X3Z3 = Px;
2173 
2174 	u64 *const A = m.workspace + 0;
2175 	u64 *const B = m.workspace + 4;
2176 	u64 *const D = m.workspace + 8;
2177 	u64 *const C = m.workspace + 12;
2178 	u64 *const DA = m.workspace + 16;
2179 	u64 *const CB = m.workspace + 20;
2180 	u64 *const AB = A;
2181 	u64 *const DC = D;
2182 	u64 *const DACB = DA;
2183 
2184 	memcpy(m.private, private_key, sizeof(m.private));
2185 	memcpy(m.session, session_key, sizeof(m.session));
2186 
2187 	curve25519_clamp_secret(m.private);
2188 
2189 	/* As in the draft:
2190 	 * When receiving such an array, implementations of curve25519
2191 	 * MUST mask the most-significant bit in the final byte. This
2192 	 * is done to preserve compatibility with point formats which
2193 	 * reserve the sign bit for use in other protocols and to
2194 	 * increase resistance to implementation fingerprinting
2195 	 */
2196 	m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
2197 
2198 	copy_eltfp25519_1w(Px, X1);
2199 	setzero_eltfp25519_1w(Pz);
2200 	setzero_eltfp25519_1w(Qx);
2201 	setzero_eltfp25519_1w(Qz);
2202 
2203 	Pz[0] = 1;
2204 	Qx[0] = 1;
2205 
2206 	/* main-loop */
2207 	prev = 0;
2208 	j = 62;
2209 	for (i = 3; i >= 0; --i) {
2210 		while (j >= 0) {
2211 			u64 bit = (key[i] >> j) & 0x1;
2212 			u64 swap = bit ^ prev;
2213 			prev = bit;
2214 
2215 			add_eltfp25519_1w_bmi2(A, X2, Z2);	/* A = (X2+Z2) */
2216 			sub_eltfp25519_1w(B, X2, Z2);		/* B = (X2-Z2) */
2217 			add_eltfp25519_1w_bmi2(C, X3, Z3);	/* C = (X3+Z3) */
2218 			sub_eltfp25519_1w(D, X3, Z3);		/* D = (X3-Z3) */
2219 			mul_eltfp25519_2w_bmi2(DACB, AB, DC);	/* [DA|CB] = [A|B]*[D|C] */
2220 
2221 			cselect(swap, A, C);
2222 			cselect(swap, B, D);
2223 
2224 			sqr_eltfp25519_2w_bmi2(AB);		/* [AA|BB] = [A^2|B^2] */
2225 			add_eltfp25519_1w_bmi2(X3, DA, CB);	/* X3 = (DA+CB) */
2226 			sub_eltfp25519_1w(Z3, DA, CB);		/* Z3 = (DA-CB) */
2227 			sqr_eltfp25519_2w_bmi2(X3Z3);		/* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
2228 
2229 			copy_eltfp25519_1w(X2, B);		/* X2 = B^2 */
2230 			sub_eltfp25519_1w(Z2, A, B);		/* Z2 = E = AA-BB */
2231 
2232 			mul_a24_eltfp25519_1w(B, Z2);		/* B = a24*E */
2233 			add_eltfp25519_1w_bmi2(B, B, X2);	/* B = a24*E+B */
2234 			mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB);	/* [X2|Z2] = [B|E]*[A|a24*E+B] */
2235 			mul_eltfp25519_1w_bmi2(Z3, Z3, X1);	/* Z3 = Z3*X1 */
2236 			--j;
2237 		}
2238 		j = 63;
2239 	}
2240 
2241 	inv_eltfp25519_1w_bmi2(A, Qz);
2242 	mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A);
2243 	fred_eltfp25519_1w((u64 *)shared);
2244 
2245 	memzero_explicit(&m, sizeof(m));
2246 }
2247 
2248 static void curve25519_bmi2_base(u8 session_key[CURVE25519_KEY_SIZE],
2249 				 const u8 private_key[CURVE25519_KEY_SIZE])
2250 {
2251 	struct {
2252 		u64 buffer[4 * NUM_WORDS_ELTFP25519];
2253 		u64 coordinates[4 * NUM_WORDS_ELTFP25519];
2254 		u64 workspace[4 * NUM_WORDS_ELTFP25519];
2255 		u8 private[CURVE25519_KEY_SIZE];
2256 	} __aligned(32) m;
2257 
2258 	const int ite[4] = { 64, 64, 64, 63 };
2259 	const int q = 3;
2260 	u64 swap = 1;
2261 
2262 	int i = 0, j = 0, k = 0;
2263 	u64 *const key = (u64 *)m.private;
2264 	u64 *const Ur1 = m.coordinates + 0;
2265 	u64 *const Zr1 = m.coordinates + 4;
2266 	u64 *const Ur2 = m.coordinates + 8;
2267 	u64 *const Zr2 = m.coordinates + 12;
2268 
2269 	u64 *const UZr1 = m.coordinates + 0;
2270 	u64 *const ZUr2 = m.coordinates + 8;
2271 
2272 	u64 *const A = m.workspace + 0;
2273 	u64 *const B = m.workspace + 4;
2274 	u64 *const C = m.workspace + 8;
2275 	u64 *const D = m.workspace + 12;
2276 
2277 	u64 *const AB = m.workspace + 0;
2278 	u64 *const CD = m.workspace + 8;
2279 
2280 	const u64 *const P = table_ladder_8k;
2281 
2282 	memcpy(m.private, private_key, sizeof(m.private));
2283 
2284 	curve25519_clamp_secret(m.private);
2285 
2286 	setzero_eltfp25519_1w(Ur1);
2287 	setzero_eltfp25519_1w(Zr1);
2288 	setzero_eltfp25519_1w(Zr2);
2289 	Ur1[0] = 1;
2290 	Zr1[0] = 1;
2291 	Zr2[0] = 1;
2292 
2293 	/* G-S */
2294 	Ur2[3] = 0x1eaecdeee27cab34UL;
2295 	Ur2[2] = 0xadc7a0b9235d48e2UL;
2296 	Ur2[1] = 0xbbf095ae14b2edf8UL;
2297 	Ur2[0] = 0x7e94e1fec82faabdUL;
2298 
2299 	/* main-loop */
2300 	j = q;
2301 	for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
2302 		while (j < ite[i]) {
2303 			u64 bit = (key[i] >> j) & 0x1;
2304 			k = (64 * i + j - q);
2305 			swap = swap ^ bit;
2306 			cswap(swap, Ur1, Ur2);
2307 			cswap(swap, Zr1, Zr2);
2308 			swap = bit;
2309 			/* Addition */
2310 			sub_eltfp25519_1w(B, Ur1, Zr1);		/* B = Ur1-Zr1 */
2311 			add_eltfp25519_1w_bmi2(A, Ur1, Zr1);	/* A = Ur1+Zr1 */
2312 			mul_eltfp25519_1w_bmi2(C, &P[4 * k], B);/* C = M0-B */
2313 			sub_eltfp25519_1w(B, A, C);		/* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
2314 			add_eltfp25519_1w_bmi2(A, A, C);	/* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
2315 			sqr_eltfp25519_2w_bmi2(AB);		/* A = A^2      |  B = B^2 */
2316 			mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB);	/* Ur1 = Zr2*A  |  Zr1 = Ur2*B */
2317 			++j;
2318 		}
2319 		j = 0;
2320 	}
2321 
2322 	/* Doubling */
2323 	for (i = 0; i < q; ++i) {
2324 		add_eltfp25519_1w_bmi2(A, Ur1, Zr1);	/*  A = Ur1+Zr1 */
2325 		sub_eltfp25519_1w(B, Ur1, Zr1);		/*  B = Ur1-Zr1 */
2326 		sqr_eltfp25519_2w_bmi2(AB);		/*  A = A**2     B = B**2 */
2327 		copy_eltfp25519_1w(C, B);		/*  C = B */
2328 		sub_eltfp25519_1w(B, A, B);		/*  B = A-B */
2329 		mul_a24_eltfp25519_1w(D, B);		/*  D = my_a24*B */
2330 		add_eltfp25519_1w_bmi2(D, D, C);	/*  D = D+C */
2331 		mul_eltfp25519_2w_bmi2(UZr1, AB, CD);	/*  Ur1 = A*B   Zr1 = Zr1*A */
2332 	}
2333 
2334 	/* Convert to affine coordinates */
2335 	inv_eltfp25519_1w_bmi2(A, Zr1);
2336 	mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A);
2337 	fred_eltfp25519_1w((u64 *)session_key);
2338 
2339 	memzero_explicit(&m, sizeof(m));
2340 }
2341 
2342 void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
2343 		     const u8 secret[CURVE25519_KEY_SIZE],
2344 		     const u8 basepoint[CURVE25519_KEY_SIZE])
2345 {
2346 	if (static_branch_likely(&curve25519_use_adx))
2347 		curve25519_adx(mypublic, secret, basepoint);
2348 	else if (static_branch_likely(&curve25519_use_bmi2))
2349 		curve25519_bmi2(mypublic, secret, basepoint);
2350 	else
2351 		curve25519_generic(mypublic, secret, basepoint);
2352 }
2353 EXPORT_SYMBOL(curve25519_arch);
2354 
2355 void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
2356 			  const u8 secret[CURVE25519_KEY_SIZE])
2357 {
2358 	if (static_branch_likely(&curve25519_use_adx))
2359 		curve25519_adx_base(pub, secret);
2360 	else if (static_branch_likely(&curve25519_use_bmi2))
2361 		curve25519_bmi2_base(pub, secret);
2362 	else
2363 		curve25519_generic(pub, secret, curve25519_base_point);
2364 }
2365 EXPORT_SYMBOL(curve25519_base_arch);
2366 
2367 static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
2368 				 unsigned int len)
2369 {
2370 	u8 *secret = kpp_tfm_ctx(tfm);
2371 
2372 	if (!len)
2373 		curve25519_generate_secret(secret);
2374 	else if (len == CURVE25519_KEY_SIZE &&
2375 		 crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
2376 		memcpy(secret, buf, CURVE25519_KEY_SIZE);
2377 	else
2378 		return -EINVAL;
2379 	return 0;
2380 }
2381 
2382 static int curve25519_generate_public_key(struct kpp_request *req)
2383 {
2384 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
2385 	const u8 *secret = kpp_tfm_ctx(tfm);
2386 	u8 buf[CURVE25519_KEY_SIZE];
2387 	int copied, nbytes;
2388 
2389 	if (req->src)
2390 		return -EINVAL;
2391 
2392 	curve25519_base_arch(buf, secret);
2393 
2394 	/* might want less than we've got */
2395 	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
2396 	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
2397 								nbytes),
2398 				     buf, nbytes);
2399 	if (copied != nbytes)
2400 		return -EINVAL;
2401 	return 0;
2402 }
2403 
2404 static int curve25519_compute_shared_secret(struct kpp_request *req)
2405 {
2406 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
2407 	const u8 *secret = kpp_tfm_ctx(tfm);
2408 	u8 public_key[CURVE25519_KEY_SIZE];
2409 	u8 buf[CURVE25519_KEY_SIZE];
2410 	int copied, nbytes;
2411 
2412 	if (!req->src)
2413 		return -EINVAL;
2414 
2415 	copied = sg_copy_to_buffer(req->src,
2416 				   sg_nents_for_len(req->src,
2417 						    CURVE25519_KEY_SIZE),
2418 				   public_key, CURVE25519_KEY_SIZE);
2419 	if (copied != CURVE25519_KEY_SIZE)
2420 		return -EINVAL;
2421 
2422 	curve25519_arch(buf, secret, public_key);
2423 
2424 	/* might want less than we've got */
2425 	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
2426 	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
2427 								nbytes),
2428 				     buf, nbytes);
2429 	if (copied != nbytes)
2430 		return -EINVAL;
2431 	return 0;
2432 }
2433 
2434 static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
2435 {
2436 	return CURVE25519_KEY_SIZE;
2437 }
2438 
2439 static struct kpp_alg curve25519_alg = {
2440 	.base.cra_name		= "curve25519",
2441 	.base.cra_driver_name	= "curve25519-x86",
2442 	.base.cra_priority	= 200,
2443 	.base.cra_module	= THIS_MODULE,
2444 	.base.cra_ctxsize	= CURVE25519_KEY_SIZE,
2445 
2446 	.set_secret		= curve25519_set_secret,
2447 	.generate_public_key	= curve25519_generate_public_key,
2448 	.compute_shared_secret	= curve25519_compute_shared_secret,
2449 	.max_size		= curve25519_max_size,
2450 };
2451 
2452 static int __init curve25519_mod_init(void)
2453 {
2454 	if (boot_cpu_has(X86_FEATURE_BMI2))
2455 		static_branch_enable(&curve25519_use_bmi2);
2456 	else if (boot_cpu_has(X86_FEATURE_ADX))
2457 		static_branch_enable(&curve25519_use_adx);
2458 	else
2459 		return 0;
2460 	return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
2461 		crypto_register_kpp(&curve25519_alg) : 0;
2462 }
2463 
2464 static void __exit curve25519_mod_exit(void)
2465 {
2466 	if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
2467 	    (boot_cpu_has(X86_FEATURE_BMI2) || boot_cpu_has(X86_FEATURE_ADX)))
2468 		crypto_unregister_kpp(&curve25519_alg);
2469 }
2470 
/* Entry and exit points run when the module is loaded and unloaded. */
2471 module_init(curve25519_mod_init);
2472 module_exit(curve25519_mod_exit);
2473 
/*
 * Module aliases for the algorithm name and the driver name used in
 * curve25519_alg, followed by the module's license tag.
 */
2474 MODULE_ALIAS_CRYPTO("curve25519");
2475 MODULE_ALIAS_CRYPTO("curve25519-x86");
2476 MODULE_LICENSE("GPL v2");
2477