1
2 /***************************************************************************
3 *
4 Copyright 2012 CertiVox IOM Ltd. *
5 *
6 This file is part of CertiVox MIRACL Crypto SDK. *
7 *
8 The CertiVox MIRACL Crypto SDK provides developers with an *
9 extensive and efficient set of cryptographic functions. *
10 For further information about its features and functionalities please *
11 refer to http://www.certivox.com *
12 *
13 * The CertiVox MIRACL Crypto SDK is free software: you can *
14 redistribute it and/or modify it under the terms of the *
15 GNU Affero General Public License as published by the *
16 Free Software Foundation, either version 3 of the License, *
17 or (at your option) any later version. *
18 *
19 * The CertiVox MIRACL Crypto SDK is distributed in the hope *
20 that it will be useful, but WITHOUT ANY WARRANTY; without even the *
21 implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
22 See the GNU Affero General Public License for more details. *
23 *
24 * You should have received a copy of the GNU Affero General Public *
25 License along with CertiVox MIRACL Crypto SDK. *
26 If not, see <http://www.gnu.org/licenses/>. *
27 *
28 You can be released from the requirements of the license by purchasing *
29 a commercial license. Buying such a license is mandatory as soon as you *
30 develop commercial activities involving the CertiVox MIRACL Crypto SDK *
31 without disclosing the source code of your own applications, or shipping *
32 the CertiVox MIRACL Crypto SDK with a closed source product. *
33 *
34 ***************************************************************************/
35 /*
36 * MIRACL routines for arithmetic over GF(2^m),
37 * mrgf2m.c
38 *
39 * For algorithms used, see IEEE P1363 Standard, Appendix A
40 * unless otherwise stated.
41 *
42 * The time-critical routines are the multiplication routine multiply2()
43 * and (for AFFINE co-ordinates), the modular inverse routine inverse2()
44 * and the routines it calls.
45 *
46 * READ COMMENTS CAREFULLY FOR VARIOUS OPTIMIZATION SUGGESTIONS
47 *
48 * No assembly language used.
49 *
50 * Use utility irp.cpp to generate optimal code for function reduce2(.) below
51 *
52 * Space can be saved by removing unneeded functions and
53 * deleting unrequired functionality.
54 * For example in reduce2(.) remove code for those irreducible polynomials
55 * which will not be used by your code.
56 */
57
58 #include <stdlib.h>
59 #include "miracl.h"
60 #ifdef MR_STATIC
61 #include <string.h>
62 #endif
63
64 #ifdef MR_COUNT_OPS
65 extern int fpm2,fpi2;
66 #endif
67
68 /* must use /arch:SSE2 in compilation */
69
70 #ifdef _M_IX86_FP
71 #if _M_IX86_FP >= 2
72 #define MR_SSE2_INTRINSICS
73 #endif
74 #endif
75
76 /* must use -msse2 in compilation */
77
78 #ifdef __SSE2__
79 #define MR_SSE2_INTRINSICS
80 #endif
81
82 #ifdef MR_SSE2_INTRINSICS
83 #ifdef __GNUC__
84 #include <xmmintrin.h>
85 #else
86 #include <emmintrin.h>
87 #endif
88
89 #if MIRACL==64
90 #define MR_SSE2_64
91 /* Can use SSE2 registers for 64-bit manipulations */
92 #endif
93
94 #endif
95
96 #ifndef MR_NOFULLWIDTH
97 /* This does not make sense using floating-point! */
98
99 /* This is extremely time-critical, and expensive */
100
101 /* Some experimental MMX code for x86-32. Seems to be slower than the standard code (on a PIV anyway).. */
102
103 #ifdef MR_MMX_x86_32
104
105 #ifdef __GNUC__
106 #include <xmmintrin.h>
107 #else
108 #include <emmintrin.h>
109 #endif
110
static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{ /* Carry-free (GF(2) polynomial) 32x32->64 bit multiply using MMX.
   * Writes the low word of a*b to *r and returns the high word.
   * b is consumed two bits at a time; tt[] holds 0, a, a<<1 and
   * a^(a<<1), i.e. a times each 2-bit polynomial 0..3.
   * NOTE(review): no _m_empty()/EMMS is issued here - mixing this with
   * x87 floating point in the caller would corrupt the FPU stack; confirm
   * callers avoid that. */
    __m64 rg,tt[4];
    mr_small q;

    tt[0]=_m_from_int(0);
    tt[1]=_m_from_int(a);
    tt[2]=_m_psllqi(tt[1],1);
    tt[3]=_m_pxor(tt[1],tt[2]);

    /* accumulate a * (2-bit slice of b), each slice shifted into place */
    rg=tt[b&3];
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>2)&3],2));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>4)&3],4));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>6)&3],6));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>8)&3],8));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>10)&3],10));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>12)&3],12));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>14)&3],14));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>16)&3],16));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>18)&3],18));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>20)&3],20));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>22)&3],22));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>24)&3],24));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>26)&3],26));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>28)&3],28));
    rg=_m_pxor(rg,_m_psllqi(tt[(b>>30)],30)); /* top slice needs no mask */

    *r=_m_to_int(rg);            /* low 32 bits of the product */
    q=_m_to_int(_m_psrlqi(rg,32)); /* high 32 bits */

    return q;
}
143
144 #else
145
146 /* This might be faster on a 16-bit processor with no variable shift instructions.
147 The line w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15); is just a 1-bit right shift on
148 the hi|lo value - should be really fast in assembly language
149
150 unsigned short mr_mul2(unsigned short x,unsigned short y,unsigned short *r)
151 {
152 unsigned short lo,hi,bit,w;
153 hi=0;
154 lo=x;
155 bit=-(lo&1);
156 lo>>=1;
157
158 hi^=(y&bit); bit=-(lo&1);
159 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
160
161 hi^=(y&bit); bit=-(lo&1);
162 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
163
164 hi^=(y&bit); bit=-(lo&1);
165 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
166
167 hi^=(y&bit); bit=-(lo&1);
168 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
169
170 hi^=(y&bit); bit=-(lo&1);
171 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
172
173 hi^=(y&bit); bit=-(lo&1);
174 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
175
176 hi^=(y&bit); bit=-(lo&1);
177 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
178
179 hi^=(y&bit); bit=-(lo&1);
180 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
181
182 hi^=(y&bit); bit=-(lo&1);
183 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
184
185 hi^=(y&bit); bit=-(lo&1);
186 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
187
188 hi^=(y&bit); bit=-(lo&1);
189 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
190
191 hi^=(y&bit); bit=-(lo&1);
192 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
193
194 hi^=(y&bit); bit=-(lo&1);
195 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
196
197 hi^=(y&bit); bit=-(lo&1);
198 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
199
200 hi^=(y&bit); bit=-(lo&1);
201 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
202
203 hi^=(y&bit);
204 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);
205
206 *r=lo;
207 return hi;
208 }
209
210 */
211
212
213 /* This might be faster on an 8-bit processor with no variable shift instructions.
214 The line w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7); is just a 1-bit right shift on
215 the hi|lo value - should be really fast in assembly language
216
217 unsigned char mr_mul2(unsigned char x,unsigned char y,unsigned char *r)
218 {
219 unsigned char lo,hi,bit,w;
220 hi=0;
221 lo=x;
222 bit=-(lo&1);
223 lo>>=1;
224
225 hi^=(y&bit); bit=-(lo&1);
226 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
227
228 hi^=(y&bit); bit=-(lo&1);
229 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
230
231 hi^=(y&bit); bit=-(lo&1);
232 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
233
234 hi^=(y&bit); bit=-(lo&1);
235 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
236
237 hi^=(y&bit); bit=-(lo&1);
238 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
239
240 hi^=(y&bit); bit=-(lo&1);
241 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
242
243 hi^=(y&bit); bit=-(lo&1);
244 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
245
246 hi^=(y&bit);
247 w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);
248
249 *r=lo;
250 return hi;
251 }
252
253 */
254
255 /* wouldn't it be nice if instruction sets supported a
256 one cycle "carry-free" multiplication instruction ...
257 The SmartMips does - its called maddp */
258
259 #ifndef MR_COMBA2
260
261 #if MIRACL==8
262
263 /* maybe use a small precomputed look-up table? */
264
static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{ /* Carry-free 8x8->16 bit multiply for an 8-bit mr_small.
   * Writes the low byte of a*b to *r and returns the high byte.
   * look[] is the full 4x4-bit carry-free product table: look[(x<<4)|y]
   * is the GF(2) polynomial product of the two nibbles x and y. */
    static const mr_small look[256]=
    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
    0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,
    0,3,6,5,12,15,10,9,24,27,30,29,20,23,18,17,
    0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,
    0,5,10,15,20,17,30,27,40,45,34,39,60,57,54,51,
    0,6,12,10,24,30,20,18,48,54,60,58,40,46,36,34,
    0,7,14,9,28,27,18,21,56,63,54,49,36,35,42,45,
    0,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,
    0,9,18,27,36,45,54,63,72,65,90,83,108,101,126,119,
    0,10,20,30,40,34,60,54,80,90,68,78,120,114,108,102,
    0,11,22,29,44,39,58,49,88,83,78,69,116,127,98,105,
    0,12,24,20,48,60,40,36,96,108,120,116,80,92,72,68,
    0,13,26,23,52,57,46,35,104,101,114,127,92,81,70,75,
    0,14,28,18,56,54,36,42,112,126,108,98,72,70,84,90,
    0,15,30,17,60,51,34,45,120,119,102,105,68,75,90,85
    };

    mr_small x1,y0,m,p,q;
    x1=a&0xf0;          /* high nibble of a, kept in place for indexing */
    y0=b&0x0f;          /* low nibble of b */
    a<<=4;              /* a now holds (low nibble of a)<<4 */
    b>>=4;              /* b now holds high nibble of b */

    p=look[(a|y0)];     /* a_lo * b_lo */
    q=look[(x1|b)];     /* a_hi * b_hi */

    m=look[a^b^x1^y0]^p^q; /* Karatsuba! middle term (a_lo^a_hi)*(b_lo^b_hi)^p^q */

    p^=(m<<4);          /* fold middle term into low/high halves */
    q^=(m>>4);

    *r=p;
    return q;
}
303
304 #else
305
306 #ifdef MR_SSE2_64
307
static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{ /* Carry-free 64x64->128 bit multiply using SSE2 registers.
   * Writes the low 64 bits of the polynomial product a*b to *r and
   * returns the high 64 bits. tt[i] holds the carry-free product of a
   * with the 4-bit polynomial i; b is then consumed a nibble at a time,
   * each partial product shifted into place by whole bytes
   * (_mm_slli_si128) plus a 4-bit within-byte adjustment.
   *
   * Fixes over the original:
   * - the mask constant used 0xf0<<24, which left-shifts into the sign
   *   bit of int (undefined behaviour); use an explicit unsigned value.
   * - the result was extracted by casting &pp to unsigned long long*,
   *   violating strict aliasing; use _mm_cvtsi128_si64 instead
   *   (available here since MR_SSE2_64 implies a 64-bit target). */
    int i,j;
    __m128i pp,tt[16],m;

    /* mask selecting the top 4 bits of the low quadword */
    m=_mm_set_epi32(0,0,(int)0xf0000000u,0);

    tt[0]=_mm_setzero_si128();
    tt[1]=_mm_loadl_epi64((__m128i *)&a);
    /* a<<1, a<<2, a<<3 with the bits shifted out of the low quadword
       carried into the high quadword */
    tt[2]=_mm_xor_si128(_mm_slli_epi64(tt[1],1),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,7));
    tt[4]=_mm_xor_si128(_mm_slli_epi64(tt[1],2),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,6));
    tt[8]=_mm_xor_si128(_mm_slli_epi64(tt[1],3),_mm_srli_epi64( _mm_slli_si128(_mm_and_si128(m ,tt[1]),1) ,5));
    /* remaining table entries are XOR combinations of the powers of two */
    tt[3]=_mm_xor_si128(tt[1],tt[2]);
    tt[5]=_mm_xor_si128(tt[1],tt[4]);
    tt[6]=_mm_xor_si128(tt[2],tt[4]);
    tt[7]=_mm_xor_si128(tt[6],tt[1]);
    tt[9]=_mm_xor_si128(tt[8],tt[1]);
    tt[10]=_mm_xor_si128(tt[8],tt[2]);
    tt[11]=_mm_xor_si128(tt[10],tt[1]);
    tt[12]=_mm_xor_si128(tt[8],tt[4]);
    tt[13]=_mm_xor_si128(tt[12],tt[1]);
    tt[14]=_mm_xor_si128(tt[8],tt[6]);
    tt[15]=_mm_xor_si128(tt[14],tt[1]);

/* Thanks to Darrel Hankerson, who pointed out an optimization for this code ... */

    /* each step combines two nibbles of b (one shifted left 4 bits) and
       byte-shifts the pair into its final position */
    i=(int)(b&0xF); j=(int)((b>>4)&0xF);
    pp=_mm_xor_si128(tt[i], _mm_or_si128(_mm_slli_epi64(tt[j],4),
         _mm_srli_epi64(_mm_slli_si128(tt[j],8), 60)) );
    i=(int)((b>>8)&0xF); j=(int)((b>>12)&0xF);

    pp=_mm_xor_si128(pp, _mm_slli_si128( _mm_xor_si128(tt[i],
        _mm_or_si128(_mm_slli_epi64(tt[j],4),
          _mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
            ) ,1) );
    i=(int)((b>>16)&0xF); j=(int)((b>>20)&0xF);
    pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
        _mm_or_si128(_mm_slli_epi64(tt[j],4),
          _mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
            ) ,2) );
    i=(int)((b>>24)&0xF); j=(int)((b>>28)&0xF);
    pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
        _mm_or_si128(_mm_slli_epi64(tt[j],4),
          _mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
            ) ,3) );
    i=(int)((b>>32)&0xF); j=(int)((b>>36)&0xF);
    pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
        _mm_or_si128(_mm_slli_epi64(tt[j],4),
          _mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
            ) ,4) );
    i=(int)((b>>40)&0xF); j=(int)((b>>44)&0xF);
    pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
        _mm_or_si128(_mm_slli_epi64(tt[j],4),
          _mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
            ) ,5) );
    i=(int)((b>>48)&0xF); j=(int)((b>>52)&0xF);
    pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
        _mm_or_si128(_mm_slli_epi64(tt[j],4),
          _mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
            ) ,6) );
    i=(int)((b>>56)&0xF); j=(int)(b>>60);
    pp=_mm_xor_si128(pp, _mm_slli_si128(_mm_xor_si128(tt[i],
        _mm_or_si128(_mm_slli_epi64(tt[j],4),
          _mm_srli_epi64(_mm_slli_si128(tt[j],8), 60))
            ) ,7) );

    /* extract without aliasing violations */
    *r=(mr_small)_mm_cvtsi128_si64(pp);
    return (mr_small)_mm_cvtsi128_si64(_mm_srli_si128(pp,8));
}
377
378 #else
379
static mr_small mr_mul2(mr_small a,mr_small b,mr_small *r)
{ /* Generic carry-free word multiply: writes the low word of the GF(2)
   * polynomial product a*b to *r and returns the high word.
   * Uses a small shifted-multiples look-up table indexed by 2-bit
   * (MIRACL<=32) or 4-bit (MIRACL>32) slices of b. Building the table
   * by shifting a left loses a's top bit(s); tb0/tb1/tb2 record those
   * bits so their contribution can be XORed back in at the end. */
    int k; /* only used by the generic loop when no UNWOUNDM case applies */
    mr_small kb,t[16];
    mr_small x,q,p;
    mr_utype tb0;
#if MIRACL > 32
    mr_utype tb1,tb2;
#endif

    kb=b; /* keep an unshifted copy of b for the top-bit compensation */

#if MIRACL <= 32
    t[0]=0;     /* small look up table */
    t[3]=t[2]=a<<1;  /* it can overflow.... */
    t[1]=t[2]>>1;
    t[3]^=t[1];

    tb0=(mr_utype)(a&TOPBIT);  /* remember top bit */
    tb0>>=M1;                  /* all ones if top bit is one */
#else
    t[0]=0;     /* larger look-up table */
    t[8]=a<<3;
    t[4]=t[8]>>1;
    t[2]=t[4]>>1;
    t[1]=t[2]>>1;
    t[3]=t[5]=t[7]=t[9]=t[11]=t[13]=t[15]=t[1];
    t[3]^=t[2];
    t[5]^=t[4];
    t[9]^=t[8];
    t[6]=t[3]<<1;
    t[7]^=t[6];
    t[10]=t[5]<<1;
    t[11]^=t[10];
    t[12]=t[6]<<1;
    t[13]^=t[12];
    t[14]=t[7]<<1;
    t[15]^=t[14];

    tb0=(a&TOPBIT);   /* remember top bits */
    tb0>>=M1;         /* all bits one, if this bit is set in a */
    tb1=(a&SECBIT)<<1;
    tb1>>=M1;
    tb2=(a&THDBIT)<<2;
    tb2>>=M1;
#endif

/* Unwound accumulation: p collects the low word, q the high word.
   Each step adds a*(slice of b) shifted into place. */

#if MIRACL == 8
#define UNWOUNDM
    p=q=t[b&3]; q>>=2;
    x=t[(b>>2)&3]; q^=x; p^=(x<<2); q>>=2;
    x=t[(b>>4)&3]; q^=x; p^=(x<<4); q>>=2;
    x=t[(b>>6)];   q^=x; p^=(x<<6); q>>=2;
#endif

#if MIRACL == 16
#define UNWOUNDM
    p=q=t[b&3]; q>>=2;
    x=t[(b>>2)&3]; q^=x; p^=(x<<2); q>>=2;
    x=t[(b>>4)&3]; q^=x; p^=(x<<4); q>>=2;
    x=t[(b>>6)&3]; q^=x; p^=(x<<6); q>>=2;
    x=t[(b>>8)&3]; q^=x; p^=(x<<8); q>>=2;
    x=t[(b>>10)&3]; q^=x; p^=(x<<10); q>>=2;
    x=t[(b>>12)&3]; q^=x; p^=(x<<12); q>>=2;
    x=t[(b>>14)];   q^=x; p^=(x<<14); q>>=2;
#endif

#if MIRACL == 32
#define UNWOUNDM
    p=q=t[b&3]; q>>=2;
    x=t[(b>>2)&3]; q^=x; p^=(x<<2); q>>=2;   /* 8 ASM 80386 instructions */
    x=t[(b>>4)&3]; q^=x; p^=(x<<4); q>>=2;   /* but only 4 ARM instructions! */
    x=t[(b>>6)&3]; q^=x; p^=(x<<6); q>>=2;
    x=t[(b>>8)&3]; q^=x; p^=(x<<8); q>>=2;
    x=t[(b>>10)&3]; q^=x; p^=(x<<10); q>>=2;
    x=t[(b>>12)&3]; q^=x; p^=(x<<12); q>>=2;
    x=t[(b>>14)&3]; q^=x; p^=(x<<14); q>>=2;
    x=t[(b>>16)&3]; q^=x; p^=(x<<16); q>>=2;
    x=t[(b>>18)&3]; q^=x; p^=(x<<18); q>>=2;
    x=t[(b>>20)&3]; q^=x; p^=(x<<20); q>>=2;
    x=t[(b>>22)&3]; q^=x; p^=(x<<22); q>>=2;
    x=t[(b>>24)&3]; q^=x; p^=(x<<24); q>>=2;
    x=t[(b>>26)&3]; q^=x; p^=(x<<26); q>>=2;
    x=t[(b>>28)&3]; q^=x; p^=(x<<28); q>>=2;
    x=t[(b>>30)];   q^=x; p^=(x<<30); q>>=2;
#endif

#if MIRACL == 64
#define UNWOUNDM
    p=q=t[b&0xf]; q>>=4;
    x=t[(b>>4)&0xf]; q^=x; p^=(x<<4); q>>=4;
    x=t[(b>>8)&0xf]; q^=x; p^=(x<<8); q>>=4;
    x=t[(b>>12)&0xf]; q^=x; p^=(x<<12); q>>=4;
    x=t[(b>>16)&0xf]; q^=x; p^=(x<<16); q>>=4;
    x=t[(b>>20)&0xf]; q^=x; p^=(x<<20); q>>=4;
    x=t[(b>>24)&0xf]; q^=x; p^=(x<<24); q>>=4;
    x=t[(b>>28)&0xf]; q^=x; p^=(x<<28); q>>=4;
    x=t[(b>>32)&0xf]; q^=x; p^=(x<<32); q>>=4;
    x=t[(b>>36)&0xf]; q^=x; p^=(x<<36); q>>=4;
    x=t[(b>>40)&0xf]; q^=x; p^=(x<<40); q>>=4;
    x=t[(b>>44)&0xf]; q^=x; p^=(x<<44); q>>=4;
    x=t[(b>>48)&0xf]; q^=x; p^=(x<<48); q>>=4;
    x=t[(b>>52)&0xf]; q^=x; p^=(x<<52); q>>=4;
    x=t[(b>>56)&0xf]; q^=x; p^=(x<<56); q>>=4;
    x=t[(b>>60)];     q^=x; p^=(x<<60); q>>=4;

#endif

#ifndef UNWOUNDM

    /* generic fall-back loop for unusual word lengths */
    q=p=(mr_small)0;
    for (k=0;k<MIRACL;k+=8)
    {
        q^=(t[b&3]);
        b>>=2;
        p>>=2;
        p|=q<<M2;
        q>>=2;

        q^=(t[b&3]);
        b>>=2;
        p>>=2;
        p|=q<<M2;
        q>>=2;

        q^=(t[b&3]);
        b>>=2;
        p>>=2;
        p|=q<<M2;
        q>>=2;

        q^=(t[b&3]);
        b>>=2;
        p>>=2;
        p|=q<<M2;
        q>>=2;
    }
#endif

#if MIRACL <= 32
    p^=(tb0&(kb<<M1));  /* compensate for top bit */
    q^=(tb0&(kb>>1));   /* don't break pipeline.. */
#else
    p^=(tb0&(kb<<M1));
    q^=(tb0&(kb>>1));
    p^=(tb1&(kb<<M2));
    q^=(tb1&(kb>>2));
    p^=(tb2&(kb<<M3));
    q^=(tb2&(kb>>3));
#endif

    *r=p;
    return q;
}
534
535 #endif
536
537 #endif
538
539 #endif
540
541 #endif
542
static int numbits(big x)
{ /* Number of significant bits in x, i.e. degree(x)+1; 0 for x=0.
     Scans down from the top bit of the most significant word. */
    mr_small topword,mask;
    int words,count;

    words=x->len;
    if (words==0) return 0;

    topword=x->w[words-1];
    count=words*MIRACL;
    for (mask=TOPBIT; !(topword&mask); mask>>=1) count--;
    return count;
}
556
int degree2(big x)
{ /* Degree of the polynomial x; -1 when x=0 */
    int bits=numbits(x);
    return bits-1;
}
561
562 /*
563 static int zerobits(big x)
564 {
565 int m,n,k;
566 mr_small *gx,lsb,bit=1;
567 k=x->len;
568 if (k==0) return (-1);
569 gx=x->w;
570 for (m=0;m<k;m++)
571 {
572 if (gx[m]==0) continue;
573 n=0;
574 lsb=gx[m];
575 while (!(bit&lsb))
576 {
577 n++;
578 bit<<=1;
579 }
580 break;
581 }
582 return (MIRACL*m+n);
583 }
584
585 static void shiftrightbits(big x,int m)
586 {
587 int i,k=x->len;
588 int w=m/MIRACL;
589 int b=m%MIRACL;
590 mr_small *gx=x->w;
591 if (k==0 || m==0) return;
592 if (w>0)
593 {
594 for (i=0;i<k-w;i++)
595 gx[i]=gx[i+w];
596 for (i=k-w;i<k;i++) gx[i]=0;
597 x->len-=w;
598 }
599 if (b!=0)
600 {
601 for (i=0;i<k-w-1;i++)
602 gx[i]=(gx[i]>>b)|(gx[i+1]<<(MIRACL-b));
603 gx[k-w-1]>>=b;
604 if (gx[k-w-1]==0) x->len--;
605 }
606 }
607 */
608
static void shiftleftbits(big x,int m)
{ /* In-place left shift of x by m bits (multiply by 2^m in GF(2)[x]).
   * Splits the shift into a whole-word move and a sub-word bit shift.
   * NOTE(review): assumes x->w has room for the extra words/bits -
   * callers must guarantee capacity; no bound check is done here. */
    int i,k=x->len;
    mr_small j;
    int w=m/MIRACL;  /* words */
    int b=m%MIRACL;  /* bits  */
    mr_small *gx=x->w;
    if (k==0 || m==0) return;
    if (w>0)
    { /* move whole words up, zero-fill the vacated low words */
        for (i=k+w-1;i>=w;i--)
            gx[i]=gx[i-w];
        for (i=w-1;i>=0;i--) gx[i]=0;
        x->len+=w;
    }
/* time critical */
    if (b!=0)
    {
        j=gx[k+w-1]>>(MIRACL-b); /* bits that spill into a new top word */
        if (j!=0)
        {
            x->len++;
            gx[k+w]=j;
        }
        for (i=k+w-1;i>w;i--)
        { /* shift each word, pulling in the top bits of the word below */
            gx[i]=(gx[i]<<b)|(gx[i-1]>>(MIRACL-b));
        }
        gx[w]<<=b;
    }
}
640
static void square2(big x,big w)
{ /* w=x*x where x can be NULL so be careful.
   * Squaring in GF(2)[x] just interleaves each bit of x with a zero bit,
   * so each input word expands to exactly two output words - linear time,
   * no multiplications. look[n] is the 4-bit nibble n with its bits
   * spread out to 8 bits, pre-shifted to the top byte (<<M8) so the
   * accumulation below can right-shift results into place. */
    int i,j,n,m;
    mr_small a,t,r,*gw;

    static const mr_small look[16]=
    {0,(mr_small)1<<M8,(mr_small)4<<M8,(mr_small)5<<M8,(mr_small)16<<M8,
    (mr_small)17<<M8,(mr_small)20<<M8,(mr_small)21<<M8,(mr_small)64<<M8,
    (mr_small)65<<M8,(mr_small)68<<M8,(mr_small)69<<M8,(mr_small)80<<M8,
    (mr_small)81<<M8,(mr_small)84<<M8,(mr_small)85<<M8};

    if (x!=w) copy(x,w);
    n=w->len;
    if (n==0) return;
    m=n+n;          /* result is exactly twice as many words */
    w->len=m;
    gw=w->w;

    /* work from the top down so gw[i] is read before gw[2i] is written */
    for (i=n-1;i>=0;i--)
    {
        a=gw[i];

#if MIRACL == 8
#define UNWOUNDS
        gw[i+i]=look[a&0xF];
        gw[i+i+1]=look[(a>>4)];
#endif

#if MIRACL == 16
#define UNWOUNDS
        gw[i+i]=(look[a&0xF]>>8)|look[(a>>4)&0xF];
        gw[i+i+1]=(look[(a>>8)&0xF]>>8)|look[(a>>12)];
#endif

#if MIRACL == 32
#define UNWOUNDS
        gw[i+i]=(look[a&0xF]>>24)|(look[(a>>4)&0xF]>>16)|(look[(a>>8)&0xF]>>8)|look[(a>>12)&0xF];
        gw[i+i+1]=(look[(a>>16)&0xF]>>24)|(look[(a>>20)&0xF]>>16)|(look[(a>>24)&0xF]>>8)|look[(a>>28)];
#endif

#ifndef UNWOUNDS

        /* generic loop: spread each nibble of the low/high half of a */
        r=0;
        for (j=0;j<MIRACL/2;j+=4)
        {
            t=look[a&0xF];
            a>>=4;
            r>>=8;
            r|=t;
        }
        gw[i+i]=r; r=0;

        for (j=0;j<MIRACL/2;j+=4)
        {
            t=look[a&0xF];
            a>>=4;
            r>>=8;
            r|=t;
        }
        gw[i+i+1]=r;

#endif

    }
    /* trim at most one leading zero word cheaply; fall back to the
       general routine if more remain */
    if (gw[m-1]==0)
    {
        w->len--;
        if (gw[m-2]==0)
            mr_lzero(w);
    }
}
712
713 /* Use karatsuba to multiply two polynomials with coefficients in GF(2^m) */
714
715 #ifndef MR_STATIC
716
void karmul2_poly(_MIPD_ int n,big *t,big *x,big *y,big *z)
{ /* Karatsuba multiplication of two degree-(n-1) polynomials whose
   * coefficients are bigs in GF(2^m): z = x*y (2n big slots in z).
   * t is scratch space; n may be odd or even. Base cases n=1,2,3 are
   * in-lined; larger n recurses on half-size sub-polynomials. */
    int m,nd2,nd,md,md2;
    if (n==1)
    { /* finished */
        modmult2(_MIPP_ *x,*y,*z);
        zero(z[1]);
        return;
    }
    if (n==2)
    { /* in-line 2x2 */
        modmult2(_MIPP_ x[0],y[0],z[0]);
        modmult2(_MIPP_ x[1],y[1],z[2]);
        add2(x[0],x[1],t[0]);
        add2(y[0],y[1],t[1]);
        modmult2(_MIPP_ t[0],t[1],z[1]);
        add2(z[1],z[0],z[1]);
        add2(z[1],z[2],z[1]);
        zero(z[3]);
        return;
    }

    if (n==3)
    { /* in-line 3x3 - 6 coefficient multiplications */
        modmult2(_MIPP_ x[0],y[0],z[0]);
        modmult2(_MIPP_ x[1],y[1],z[2]);
        modmult2(_MIPP_ x[2],y[2],z[4]);
        add2(x[0],x[1],t[0]);
        add2(y[0],y[1],t[1]);
        modmult2(_MIPP_ t[0],t[1],z[1]);
        add2(z[1],z[0],z[1]);
        add2(z[1],z[2],z[1]);
        add2(x[1],x[2],t[0]);
        add2(y[1],y[2],t[1]);
        modmult2(_MIPP_ t[0],t[1],z[3]);
        add2(z[3],z[2],z[3]);
        add2(z[3],z[4],z[3]);
        add2(x[0],x[2],t[0]);
        add2(y[0],y[2],t[1]);
        modmult2(_MIPP_ t[0],t[1],t[0]);
        add2(z[2],t[0],z[2]);
        add2(z[2],z[0],z[2]);
        add2(z[2],z[4],z[2]);
        zero(z[5]);
        return;
    }

    /* split n into a low half of nd2 and a high half of md2 coefficients;
       for odd n the halves differ by one */
    if (n%2==0)
    {
        md=nd=n;
        md2=nd2=n/2;
    }
    else
    {
        nd=n+1;
        md=n-1;
        nd2=nd/2; md2=md/2;
    }

    /* z temporarily holds (x_lo+x_hi) and (y_lo+y_hi) */
    for (m=0;m<nd2;m++)
    {
        copy(x[m],z[m]);
        copy(y[m],z[nd2+m]);
    }
    for (m=0;m<md2;m++)
    {
        add2(z[m],x[nd2+m],z[m]);
        add2(z[nd2+m],y[nd2+m],z[nd2+m]);
    }

    /* t = (x_lo+x_hi)*(y_lo+y_hi) */
    karmul2_poly(_MIPP_ nd2,&t[nd],z,&z[nd2],t);

    /* z_lo = x_lo*y_lo */
    karmul2_poly(_MIPP_ nd2,&t[nd],x,y,z);

    for (m=0;m<nd;m++) add2(t[m],z[m],t[m]);

    /* z_hi = x_hi*y_hi */
    karmul2_poly(_MIPP_ md2,&t[nd],&x[nd2],&y[nd2],&z[nd]);

    /* fold the middle term into the result */
    for (m=0;m<md;m++) add2(t[m],z[nd+m],t[m]);
    for (m=0;m<nd;m++) add2(z[nd2+m],t[m],z[nd2+m]);
}
798
void karmul2_poly_upper(_MIPD_ int n,big *t,big *x,big *y,big *z)
{ /* n is large and even, and upper half of z is known already -
   * so only the two lower Karatsuba sub-products are computed,
   * the x_hi*y_hi product being recovered from the known upper half. */
    int m,nd2,nd;
    nd2=n/2; nd=n;

    /* z temporarily holds (x_lo+x_hi) and (y_lo+y_hi) */
    for (m=0;m<nd2;m++)
    {
        add2(x[m],x[nd2+m],z[m]);
        add2(y[m],y[nd2+m],z[nd2+m]);
    }

    /* t = (x_lo+x_hi)*(y_lo+y_hi) */
    karmul2_poly(_MIPP_ nd2,&t[nd],z,&z[nd2],t);

    karmul2_poly(_MIPP_ nd2,&t[nd],x,y,z); /* only 2 karmuls needed! */

    for (m=0;m<nd;m++) add2(t[m],z[m],t[m]);

    /* reconstruct the contribution of the (known) top product */
    for (m=0;m<nd2;m++)
    {
        add2(z[nd+m],z[nd+nd2+m],z[nd+m]);
        add2(z[nd+m],t[nd2+m],z[nd+m]);
    }

    for (m=0;m<nd;m++)
    {
        add2(t[m],z[nd+m],t[m]);
        add2(z[nd2+m],t[m],z[nd2+m]);
    }
}
828
829 #endif
830
831 /* Some in-line karatsuba down at the bottom... */
832
833 #ifndef MR_COMBA2
834
static void mr_bottom2(mr_small *x,mr_small *y,mr_small *z)
{ /* 2x2-word carry-free Karatsuba: z[0..3] = x[0..1] * y[0..1],
     using just three word multiplications */
    mr_small h0,l0,h1,l1,hm,lm,mid;

    h0=mr_mul2(x[0],y[0],&l0);                               /* x0*y0 */
    h1=mr_mul2(x[1],y[1],&l1);                               /* x1*y1 */
    hm=mr_mul2((mr_small)(x[0]^x[1]),(mr_small)(y[0]^y[1]),&lm); /* middle */

    mid=h0^l1;       /* shared term of the two inner words */
    z[0]=l0;
    z[1]=mid^l0^lm;
    z[2]=mid^h1^hm;
    z[3]=h1;
}
848
static void mr_bottom3(mr_small *x,mr_small *y,mr_small *z)
{ /* just 6 mr_muls... 3x3-word carry-free Karatsuba:
   * z[0..5] = x[0..2] * y[0..2]. (qi,ri) are the (hi,lo) halves of the
   * diagonal products, (ai,bi) of the three cross products. */
    mr_small q0,r0,q1,r1,q2,r2;
    mr_small a0,b0,a1,b1,a2,b2;

    q0=mr_mul2(x[0],y[0],&r0);
    q1=mr_mul2(x[1],y[1],&r1);
    q2=mr_mul2(x[2],y[2],&r2);

    a0=mr_mul2((mr_small)(x[0]^x[1]),(mr_small)(y[0]^y[1]),&b0);
    a1=mr_mul2((mr_small)(x[1]^x[2]),(mr_small)(y[1]^y[2]),&b1);
    a2=mr_mul2((mr_small)(x[0]^x[2]),(mr_small)(y[0]^y[2]),&b2);

    /* subtract (XOR) the diagonal terms from each cross product */
    b0^=r0^r1;
    a0^=q0^q1;
    b1^=r1^r2;
    a1^=q1^q2;
    b2^=r0^r2;
    a2^=q0^q2;

    z[0]=r0;
    z[1]=q0^b0;
    z[2]=r1^a0^b2;
    z[3]=q1^b1^a2;
    z[4]=r2^a1;
    z[5]=q2;
}
876
static void mr_bottom4(mr_small *x,mr_small *y,mr_small *z)
{ /* unwound 4x4 karatsuba multiplication - only 9 muls.
   * Two 2x2 Karatsuba products for the low (z0..z3) and high (z4..z7)
   * halves, then a third on (x_lo^x_hi),(y_lo^y_hi) for the middle,
   * folded in at offset 2. The input words are destroyed in local
   * copies only - x[] and y[] themselves are not written. */
    mr_small q0,r0,q1,r1,q2,r2,tx,ty;
    mr_small t0,t1,t2,t3;
    mr_small z0,z1,z2,z3,z4,z5,z6,z7;
    mr_small x0,x1,x2,x3,y0,y1,y2,y3;

    x0=x[0]; x1=x[1]; x2=x[2]; x3=x[3];
    y0=y[0]; y1=y[1]; y2=y[2]; y3=y[3];

    /* low half: (x0,x1)*(y0,y1) -> z0..z3 */
    q0=mr_mul2(x0,y0,&r0);
    q1=mr_mul2(x1,y1,&r1);

    tx=x0^x1;
    ty=y0^y1;
    q2=mr_mul2(tx,ty,&r2);

    z0=r0;
    q0^=r1;
    z1=q0^r0^r2;
    z2=q0^q1^q2;
    z3=q1;

    /* high half: (x2,x3)*(y2,y3) -> z4..z7 */
    q0=mr_mul2(x2,y2,&r0);

    q1=mr_mul2(x3,y3,&r1);

    tx=x2^x3;
    ty=y2^y3;
    q2=mr_mul2(tx,ty,&r2);

    z4=r0;
    q0^=r1;
    z5=q0^r0^r2;
    z6=q0^q1^q2;
    z7=q1;

    /* middle: (x0^x2,x1^x3)*(y0^y2,y1^y3), reusing the local copies */
    x2^=x0;
    y2^=y0;
    q0=mr_mul2(x2,y2,&r0);

    x3^=x1;
    y3^=y1;
    q1=mr_mul2(x3,y3,&r1);

    x2^=x3;
    y2^=y3;
    q2=mr_mul2(x2,y2,&r2);

    q0^=r1;
    t0=z0^z4^r0;
    t1=z1^z5^q0^r0^r2;
    t2=z2^z6^q0^q1^q2;
    t3=z3^z7^q1;

    /* fold the middle term in at word offset 2 */
    z2^=t0;
    z3^=t1;
    z4^=t2;
    z5^=t3;

    z[0]=z0;
    z[1]=z1;
    z[2]=z2;
    z[3]=z3;
    z[4]=z4;
    z[5]=z5;
    z[6]=z6;
    z[7]=z7;
}
946
static void mr_bottom5(mr_small *x,mr_small *y,mr_small *z)
{ /* Montgomery's 5x5 formula. Requires 13 mr_muls....
   * z[0..9] = x[0..4] * y[0..4]. Each mr_mul2 produces (q,r) = (hi,lo);
   * the existing inline comments track which sums of input words each
   * product operand represents. The XOR pattern after each product is
   * exactly Montgomery's published schedule - do not reorder. */
    mr_small u0,v0,u1,v1,u2,v2,s0,t0,q,r,t;
    mr_small z1,z2,z3,z4,z5,z6,z7,z8;
    mr_small x0,x1,x2,x3,x4,y0,y1,y2,y3,y4;

    x0=x[0]; x1=x[1]; x2=x[2]; x3=x[3]; x4=x[4];
    y0=y[0]; y1=y[1]; y2=y[2]; y3=y[3]; y4=y[4];

    q=mr_mul2(x0,y0,&r);
    t=q^r;
    z[0]=r;
    z1=t;
    z2=t;
    z3=q;
    z4=r;
    z5=t;
    z6=t;
    z7=q;
    q=mr_mul2(x1,y1,&r);
    z1^=r;
    z2^=q;
    z4^=r;
    z5^=q;
    q=mr_mul2(x3,y3,&r);
    z4^=r;
    z5^=q;
    z7^=r;
    z8=q;
    q=mr_mul2(x4,y4,&r);
    t=q^r;
    z2^=r;
    z3^=t;
    z4^=t;
    z5^=q;
    z6^=r;
    z7^=t;
    z8^=t;
    z[9]=q;

    u0=x0^x4;   /* u0=a0+a4 */
    v0=y0^y4;   /* v0=b0+b4 */
    q=mr_mul2(u0,v0,&r);
    t=q^r;
    z2^=r;
    z3^=t;
    z4^=q;
    z5^=r;
    z6^=t;
    z7^=q;

    u1=x0^x1;   /* u1=a0+a1 */
    v1=y0^y1;   /* v1=b0+b1 */
    q=mr_mul2(u1,v1,&r);
    t=q^r;
    z1^=r;
    z2^=t;
    z3^=q;
    z4^=r;
    z5^=t;
    z6^=q;

    u2=x3^x4;   /* u2=a3+a4 */
    v2=y3^y4;   /* v2=b3+b4 */
    q=mr_mul2(u2,v2,&r);
    t=q^r;
    z3^=r;
    z4^=t;
    z5^=q;
    z6^=r;
    z7^=t;
    z8^=q;

    /*u=u1^u2;    u=a0+a1+a3+a4 */
    /*v=v1^v2;    v=b0+b1+b3+b4 */
    q=mr_mul2(u1^u2,v1^v2,&r);
    z3^=r;
    z4^=q;
    z5^=r;
    z6^=q;

    s0=u1^x2;   /* s0=a0+a1+a2 */
    t0=v1^y2;   /* t0=b0+b1+b2 */
    u2^=x2;     /* u2=a2+a3+a4 */
    v2^=y2;     /* v2=b2+b3+b4 */
    u1^=u2;     /* u1=a0+a1+a2+a3+a4 */
    v1^=v2;     /* v1=b0+b1+b2+b3+b4 */
    q=mr_mul2(u1,v1,&r);
    t=q^r;
    z3^=r;
    z4^=t;
    z5^=t;
    z6^=q;

    u2^=x0;     /* s1=a0+a2+a3+a4 */
    v2^=y0;     /* t1=b0+b2+b3+b4 */
    q=mr_mul2(u2,v2,&r);
    z3^=r;
    z4^=q;
    z6^=r;
    z7^=q;

    s0^=x4;     /* s0=a0+a1+a2+a4 */
    t0^=y4;     /* t0=b0+b1+b2+b4 */
    q=mr_mul2(s0,t0,&r);
    z2^=r;
    z3^=q;
    z5^=r;
    z6^=q;

    u2^=x4;     /* u2=a0+a2+a3 */
    v2^=y4;     /* v2=b0+b2+b3 */
    q=mr_mul2(u2,v2,&r);
    z4^=r;
    z5^=q;
    z6^=r;
    z7^=q;

    s0^=x0;     /* s0=a1+a2+a4 */
    t0^=y0;     /* t0=b1+b2+b4 */
    q=mr_mul2(s0,t0,&r);
    z2^=r;
    z3^=q;
    z4^=r;
    z5^=q;

    z[1]=z1;
    z[2]=z2;
    z[3]=z3;
    z[4]=z4;
    z[5]=z5;
    z[6]=z6;
    z[7]=z7;
    z[8]=z8;
}
1082
1083
void karmul2(int n,mr_small *t,mr_small *x,mr_small *y,mr_small *z)
{ /* Karatsuba multiplication - note that n can be odd or even.
   * z[0..2n-1] = x[0..n-1] * y[0..n-1] over GF(2)[x] at word level.
   * t is scratch space. Sizes 1..5 use the hand-unwound mr_bottomN
   * kernels; larger sizes recurse, mirroring karmul2_poly above. */
    int m,nd2,nd,md,md2;

    if (n<=5)
    {
        if (n==1)
        {
            z[1]=mr_mul2(x[0],y[0],&z[0]);
            return;
        }
        if (n==2)
        {
            mr_bottom2(x,y,z);
            return;
        }
        if (n==3)
        {
            mr_bottom3(x,y,z);
            return;
        }
        if (n==4)
        {
            mr_bottom4(x,y,z);
            return;
        }
        if (n==5)
        {
            mr_bottom5(x,y,z);
            return;
        }
    }
    /* split n into a low half of nd2 and a high half of md2 words */
    if (n%2==0)
    {
        md=nd=n;
        md2=nd2=n/2;
    }
    else
    {
        nd=n+1;
        md=n-1;
        nd2=nd/2; md2=md/2;
    }

    /* z temporarily holds (x_lo^x_hi) and (y_lo^y_hi) */
    for (m=0;m<nd2;m++)
    {
        z[m]=x[m];
        z[nd2+m]=y[m];
    }
    for (m=0;m<md2;m++)
    {
        z[m]^=x[nd2+m];
        z[nd2+m]^=y[nd2+m];
    }

    /* t = (x_lo^x_hi)*(y_lo^y_hi); z_lo = x_lo*y_lo */
    karmul2(nd2,&t[nd],z,&z[nd2],t);
    karmul2(nd2,&t[nd],x,y,z);

    for (m=0;m<nd;m++) t[m]^=z[m];

    /* z_hi = x_hi*y_hi */
    karmul2(md2,&t[nd],&x[nd2],&y[nd2],&z[nd]);

    /* fold the middle term into the result at offset nd2 */
    for (m=0;m<md;m++) t[m]^=z[nd+m];
    for (m=0;m<nd;m++) z[nd2+m]^=t[m];
}
1149
1150 #endif
1151
1152 /* this is time-critical, so use karatsuba here, since addition is cheap *
1153 * and easy (no carries to worry about...) */
1154
1155 /* #define CLAIRE */
1156
void multiply2(_MIPD_ big x,big y,big w)
{ /* w = x*y as GF(2)[x] polynomials (no modular reduction).
   * Dispatches to comba_mult2 when MR_COMBA2 is configured, to the
   * Comba-style schoolbook when CLAIRE is defined, to word-level
   * Karatsuba for large operands, and otherwise to plain schoolbook.
   * x and/or y may be NULL (treated as zero); w may alias x or y
   * since the product is accumulated in mr_mip->w0 first. */

#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif

#ifdef MR_COMBA2
    comba_mult2(_MIPP_ x,y,w);
    return;
#else
    int i,j,xl,yl,ml;
#ifdef CLAIRE
    int d;
    mr_small hi,lo;
#endif
    mr_small p,q;

    big w0=mr_mip->w0;

    if (x==NULL || y==NULL)
    {
        zero(w);
        return;
    }
    if (x->len==0 || y->len==0)
    {
        zero(w);
        return;
    }

    xl=x->len;
    yl=y->len;
    zero(w0);

#ifdef CLAIRE

/* Comba method - column-at-a-time accumulation over the fixed
   d=1+M/MIRACL words of a field element */

    w0->len=xl+yl;
    d=1+mr_mip->M/MIRACL;
    hi=lo=0;
    for (i=0;i<d;i++)
    { /* lower-half columns */
        for (j=0;j<=i;j++)
        {
            q=mr_mul2(x->w[j],y->w[i-j],&p);
            hi^=q; lo^=p;
        }
        w0->w[i]=lo; lo=hi; hi=0;
    }
    for (i=d;i<2*d-1;i++)
    { /* upper-half columns */
        for (j=i-d+1;j<d;j++)
        {
            q=mr_mul2(x->w[j],y->w[i-j],&p);
            hi^=q; lo^=p;
        }
        w0->w[i]=lo; lo=hi; hi=0;
    }
    w0->w[2*d-1]=lo;
    mr_lzero(w0);
    copy(w0,w);

#else

/* recommended method as mr_mul2 is so slow... */

    if (xl>=MR_KARATSUBA && yl>=MR_KARATSUBA)
    { /* both operands big enough - use word-level Karatsuba,
         with mr_mip->w7 as scratch space */
        if (xl>yl) ml=xl;
        else      ml=yl;

        karmul2(ml,mr_mip->w7->w,x->w,y->w,w0->w);

        mr_mip->w7->len=w0->len=2*ml+1;
        mr_lzero(w0);
        mr_lzero(mr_mip->w7);
        copy(w0,w);
        return;
    }

    /* schoolbook: accumulate each word product into its two result words */
    w0->len=xl+yl;
    for (i=0;i<xl;i++)
    {
        for (j=0;j<yl;j++)
        {
            q=mr_mul2(x->w[i],y->w[j],&p);
            w0->w[i+j]^=p;
            w0->w[i+j+1]^=q;
        }
    }
    mr_lzero(w0);
    copy(w0,w);
#endif
#endif
}
1254
void add2(big x,big y,big z)
{ /* z = x XOR y (addition in GF(2)[x]); x or y may be NULL (zero),
   * and z may alias x. Aliasing z with y alone is not supported by the
   * original algorithm and remains unsupported here.
   *
   * Fix: when both operands are zero-length bigs (len==0, which passes
   * the NULL and x==y guards), lm was 0 and gz[lm-1] read gz[-1] -
   * an out-of-bounds access. Guard the trailing-zero check with lm>0;
   * a zero result already has the canonical len of 0. */
    int i,lx,ly,lz,lm;
    mr_small *gx,*gy,*gz;

    if (x==y)
    { /* x XOR x = 0 */
        zero(z);
        return;
    }
    if (y==NULL)
    {
        copy(x,z);
        return;
    }
    else if (x==NULL)
    {
        copy(y,z);
        return;
    }

    if (x==z)
    { /* in-place: XOR y into z */
        gy=y->w; gz=z->w;
        ly=y->len; lz=z->len;
        lm=lz; if (ly>lz) lm=ly;
        for (i=0;i<lm;i++)
            gz[i]^=gy[i];
        z->len=lm;
        if (lm>0 && gz[lm-1]==0) mr_lzero(z);
    }
    else
    {
        gx=x->w; gy=y->w; gz=z->w;
        lx=x->len; ly=y->len; lz=z->len;
        lm=lx; if (ly>lx) lm=ly;

        for (i=0;i<lm;i++)
            gz[i]=gx[i]^gy[i];
        for (i=lm;i<lz;i++)  /* clear any stale high words of z */
            gz[i]=0;
        z->len=lm;
        if (lm>0 && gz[lm-1]==0) mr_lzero(z);
    }
}
1300
static void remain2(_MIPD_ big y,big x)
{ /* generic "remainder" program. x%=y.
   * Repeatedly aligns y with the top bit of x (shift into mr_mip->w7)
   * and XORs it off, until deg(x) < deg(y). Linear in the degree gap. */
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    int my=numbits(y);
    int mx=numbits(x);
    while (mx>=my)
    {
        copy(y,mr_mip->w7);
        shiftleftbits(mr_mip->w7,mx-my); /* align divisor with x's top bit */
        add2(x,mr_mip->w7,x);            /* XOR it off - degree drops */
        mx=numbits(x);
    }
    return;
}
1317
void gcd2(_MIPD_ big x,big y,big g)
{ /* g = gcd(x,y) of two GF(2)[x] polynomials, by the classic
   * Euclidean algorithm using remain2 for each reduction step.
   * Uses mr_mip->w1..w3 as working registers. */
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    if (size(y)==0)
    { /* gcd(x,0)=x */
        copy(x,g);
        return;
    }
    copy(x,mr_mip->w1);
    copy(y,mr_mip->w2);
    forever
    {
        remain2(_MIPP_ mr_mip->w2,mr_mip->w1); /* w1 %= w2 */
        if (size(mr_mip->w1)==0) break;        /* w2 is the gcd */
        /* swap w1 and w2 via w3 */
        copy(mr_mip->w1,mr_mip->w3);
        copy(mr_mip->w2,mr_mip->w1);
        copy(mr_mip->w3,mr_mip->w2);
    }
    copy(mr_mip->w2,g);
}
1340
1341
1342 /* See "Elliptic Curves in Cryptography", Blake, Seroussi & Smart,
1343 Cambridge University Press, 1999, page 20, for this fast reduction
1344 routine - algorithm II.9 */
1345
/* x = y mod f, where f = x^M + x^A (+ x^B + x^C) is the trinomial
   (B==C==0) or pentanomial field modulus recorded in mr_mip.
   See Blake, Seroussi & Smart, algorithm II.9 (fast reduction).
   Each word of the high part is folded back down by XORing shifted
   copies into the words that correspond to the modulus terms.
   The long #if sections below are hand-unrolled versions of the
   generic loop at the end, specialised for popular (M,A,B,C) and
   word size MIRACL; they must stay in exact agreement with it. */
void reduce2(_MIPD_ big y,big x)
{ /* reduction wrt the trinomial or pentanomial modulus *
   * Note that this is linear O(n), and thus not time critical */
    int k1,k2,k3,k4,ls1,ls2,ls3,ls4,rs1,rs2,rs3,rs4,i;
    int M,A,B,C;
    int xl;
    mr_small top,*gx,w;

#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif

    if (x!=y) copy(y,x);   /* work in place on x */
    xl=x->len;
    gx=x->w;

    M=mr_mip->M;
    A=mr_mip->AA;
    if (A==0)
    { /* no irreducible polynomial has been set up */
        mr_berror(_MIPP_ MR_ERR_NO_BASIS);
        return;
    }
    B=mr_mip->BB;
    C=mr_mip->CC;


/* If optimizing agressively it makes sense to make this code specific to a particular field
   For example code like this can be optimized for the case
   m=163. Note that the general purpose code involves lots of branches - these cause breaks in
   the pipeline and they are slow. Further loop unrolling would be even faster...

   Version 5.10 - optimal code for 32-bit processors and for some NIST curves added
   Version 5.22 - some code for a 16-bit processor..

   Version 5.23 - Use findbase.cpp to find "best" irreducible polynomial
   Version 5.23 - Use utility irp.cpp to automatically generate optimal code for insertion here
*/

#if MIRACL == 8

    if (M==163 && A==7 && B==6 && C==3)
    {
        for (i=xl-1;i>=21;i--)
        { /* fold each whole high word back down */
            w=gx[i]; gx[i]=0;
            gx[i-19]^=(w>>4)^(w>>5);
            gx[i-20]^=(w>>3)^(w<<4)^(w<<3)^w;
            gx[i-21]^=(w<<5);
        } /* XORs= 7 shifts= 6 */
        top=gx[20]>>3;       /* bits above position M in the top word */
        gx[0]^=top;
        top<<=3;
        gx[0]^=(top<<4)^(top<<3)^top;
        gx[1]^=(top>>4)^(top>>5);
        gx[20]^=top;         /* clear those bits from the top word */
        x->len=21;
        if (gx[20]==0) mr_lzero(x);
        return;
    }

    if (M==271 && A==201)
    {
        for (i=xl-1;i>=34;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-8]^=(w>>6);
            gx[i-9]^=(w<<2);
            gx[i-33]^=(w>>7);
            gx[i-34]^=(w<<1);
        } /* XORs= 4 shifts= 4 */
        top=gx[33]>>7;
        gx[0]^=top;
        top<<=7;
        gx[24]^=(top<<2);
        gx[25]^=(top>>6);
        gx[33]^=top;
        x->len=34;
        if (gx[33]==0) mr_lzero(x);
        return;
    }

    if (M==271 && A==207 && B==175 && C==111)
    {
        for (i=xl-1;i>=34;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-8]^=w;
            gx[i-12]^=w;
            gx[i-20]^=w;
            gx[i-33]^=(w>>7);
            gx[i-34]^=(w<<1);
        } /* XORs= 5 shifts= 2 */
        top=gx[33]>>7;
        gx[0]^=top;
        top<<=7;
        gx[13]^=top;
        gx[21]^=top;
        gx[25]^=top;
        gx[33]^=top;
        x->len=34;
        if (gx[33]==0) mr_lzero(x);
        return;
    }

#endif

#if MIRACL == 16

    if (M==163 && A==7 && B==6 && C==3)
    {
        for (i=xl-1;i>=11;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-10]^=(w>>3)^(w<<3)^(w<<4)^w;
            gx[i-11]^=(w<<13);
            gx[i-9]^=(w>>12)^(w>>13);
        }
        top=gx[10]>>3;
        gx[0]^=top;
        top<<=3;

        gx[1]^=(top>>12)^(top>>13);
        gx[0]^=(top<<4)^(top<<3)^top;

        gx[10]^=top;
        x->len=11;
        if (gx[10]==0) mr_lzero(x);

        return;
    }

    if (M==271 && A==201 && B==0)
    {
        for (i=xl-1;i>=17;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-17]^=(w<<1);
            gx[i-16]^=(w>>15);
            gx[i-5]^=(w<<10);
            gx[i-4]^=(w>>6);
        }
        top=gx[16]>>15;
        gx[0]^=top;
        top<<=15;
        gx[12]^=(top>>6);
        gx[11]^=(top<<10);

        gx[16]^=top;
        x->len=17;
        if (gx[16]==0) mr_lzero(x);

        return;
    }

    if (M==271 && A==207 && B==175 && C==111)
    {
        for (i=xl-1;i>=17;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-4]^=w;
            gx[i-6]^=w;
            gx[i-10]^=w;
            gx[i-16]^=(w>>15);
            gx[i-17]^=(w<<1);
        } /* XORs= 5 shifts= 2 */
        top=gx[16]>>15;
        gx[0]^=top;
        top<<=15;
        gx[6]^=top;
        gx[10]^=top;
        gx[12]^=top;
        gx[16]^=top;
        x->len=17;
        if (gx[16]==0) mr_lzero(x);
        return;
    }

#endif

#if MIRACL == 32

    if (M==127 && A==63)
    {
        for (i=xl-1;i>=4;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-2]^=w;
            gx[i-3]^=(w>>31);
            gx[i-4]^=(w<<1);
        } /* XORs= 3 shifts= 2 */
        top=gx[3]>>31; gx[0]^=top; top<<=31;
        gx[1]^=top;
        gx[3]^=top;
        x->len=4;
        if (gx[3]==0) mr_lzero(x);
        return;
    }

    if (M==163 && A==7 && B==6 && C==3)
    { /* NIST B-163 / K-163 pentanomial */
        for (i=xl-1;i>=6;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-5]^=((w>>3)^(w<<4)^(w<<3)^w);
            gx[i-6]^=(w<<29);
            gx[i-4]^=((w>>28)^(w>>29));
        }
        top=gx[5]>>3;

        gx[0]^=top;
        top<<=3;
        gx[1]^=(top>>28)^(top>>29);
        gx[0]^=top^(top<<4)^(top<<3);

        gx[5]^=top;

        x->len=6;
        if (gx[5]==0) mr_lzero(x);

        return;
    }

    if (M==163 && A==99 && B==97 && C==3)
    {
        for (i=xl-1;i>=6;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-2]^=w^(w>>2);
            gx[i-3]^=(w<<30);
            gx[i-5]^=(w>>3)^w;
            gx[i-6]^=(w<<29);
        }
        top=gx[5]>>3;
        gx[0]^=top;
        top<<=3;
        gx[0]^=top;
        gx[2]^=(top<<30);
        gx[3]^=top^(top>>2);
        gx[5]^=top;
        x->len=6;
        if (gx[5]==0) mr_lzero(x);
        return;
    }

    if (M==233 && A==74 && B==0)
    { /* NIST B-233 trinomial; note the final mask rather than XOR */
        for (i=xl-1;i>=8;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-8]^=(w<<23);
            gx[i-7]^=(w>>9);
            gx[i-5]^=(w<<1);
            gx[i-4]^=(w>>31);
        }
        top=gx[7]>>9;
        gx[0]^=top;
        gx[2]^=(top<<10);
        gx[3]^=(top>>22);
        gx[7]&=0x1FF;    /* 233 = 7*32+9, so keep only the low 9 bits */

        x->len=8;
        if (gx[7]==0) mr_lzero(x);
        return;
    }

    if (M==233 && A==159 && B==0)
    {
        for (i=xl-1;i>=8;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-2]^=(w>>10);
            gx[i-3]^=(w<<22);
            gx[i-7]^=(w>>9);
            gx[i-8]^=(w<<23);
        }
        top=gx[7]>>9;
        gx[0]^=top;
        top<<=9;
        gx[4]^=(top<<22);
        gx[5]^=(top>>10);
        gx[7]^=top;
        x->len=8;
        if (gx[7]==0) mr_lzero(x);
        return;
    }

    if (M==233 && A==201 && B==105 && C==9)
    {
        for (i=xl-1;i>=8;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-1]^=w;
            gx[i-4]^=w;
            gx[i-7]^=(w>>9)^w;
            gx[i-8]^=(w<<23);
        }
        top=gx[7]>>9;
        gx[0]^=top;
        top<<=9;
        gx[0]^=top;
        gx[3]^=top;
        gx[6]^=top;
        gx[7]^=top;
        x->len=8;
        if (gx[7]==0) mr_lzero(x);

        return;
    }

    if (M==103 && A==9 && B==0)
    {
        for (i=xl-1;i>=4;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-3]^=((w>>7)^(w<<2));
            gx[i-4]^=(w<<25);
            gx[i-2]^=(w>>30);
        }
        top=gx[3]>>7;
        gx[0]^=top;
        top<<=7;
        gx[1]^=(top>>30);
        gx[0]^=(top<<2);
        gx[3]^=top;
        x->len=4;
        if (gx[3]==0) mr_lzero(x);

        return;
    }

    if (M==283 && A==12 && B==7 && C==5)
    { /* NIST B-283 pentanomial */
        for (i=xl-1;i>=9;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-9]^=(w<<5)^(w<<10)^(w<<12)^(w<<17);
            gx[i-8]^=(w>>27)^(w>>22)^(w>>20)^(w>>15);
        }

        top=gx[8]>>27;
        gx[0]^=top^(top<<5)^(top<<7)^(top<<12);
        gx[8]&=0x7FFFFFF;   /* 283 = 8*32+27, keep low 27 bits */

        x->len=9;
        if (gx[8]==0) mr_lzero(x);
        return;
    }

    if (M==283 && A==249 && B==219 && C==27)
    {
        for (i=xl-1;i>=9;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-1]^=(w>>2);
            gx[i-2]^=(w<<30)^w;
            gx[i-8]^=(w>>27)^w;
            gx[i-9]^=(w<<5);
        } /* XORs= 6 shifts= 4 */
        top=gx[8]>>27;
        gx[0]^=top;
        top<<=27;
        gx[0]^=top;
        gx[6]^=(top<<30)^top;
        gx[7]^=(top>>2);
        gx[8]^=top;
        x->len=9;
        if (gx[8]==0) mr_lzero(x);
        return;
    }

    if (M==313 && A==121 && B==0)
    {
        for (i=xl-1;i>=10;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-6]^=w;
            gx[i-9]^=(w>>25);
            gx[i-10]^=(w<<7);
        }
        top=gx[9]>>25;
        gx[0]^=top;
        top<<=25;
        gx[3]^=top;
        gx[9]^=top;
        x->len=10;
        if (gx[9]==0) mr_lzero(x);
        return;
    }

    if (M==379 && A==253 && B==251 && C==59)
    {
        for (i=xl-1;i>=12;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-3]^=(w>>30);
            gx[i-4]^=(w<<2)^w;
            gx[i-10]^=w;
            gx[i-11]^=(w>>27);
            gx[i-12]^=(w<<5);
        } /* XORs= 6 shifts= 4 */
        top=gx[11]>>27; gx[0]^=top; top<<=27;
        gx[1]^=top;
        gx[7]^=(top<<2)^top;
        gx[8]^=(top>>30);
        gx[11]^=top;
        x->len=12;
        if (gx[11]==0) mr_lzero(x);
        return;
    }

    if (M==571 && A==10 && B==5 && C==2)
    { /* NIST B-571 pentanomial */
        for (i=xl-1;i>=18;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-18]^=(w<<5)^(w<<7)^(w<<10)^(w<<15);
            gx[i-17]^=(w>>27)^(w>>25)^(w>>22)^(w>>17);
        }

        top=gx[17]>>27;
        gx[0]^=top^(top<<2)^(top<<5)^(top<<10);
        gx[17]&=0x7FFFFFF;   /* 571 = 17*32+27, keep low 27 bits */

        x->len=18;
        if (gx[17]==0) mr_lzero(x);

        return;
    }

    if (M==571 && A==507 && B==475 && C==417)
    {
        for (i=xl-1;i>=18;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-2]^=w;
            gx[i-3]^=w;
            gx[i-4]^=(w>>26);
            gx[i-5]^=(w<<6);
            gx[i-17]^=(w>>27);
            gx[i-18]^=(w<<5);
        } /* XORs= 6 shifts= 4 */
        top=gx[17]>>27;
        gx[0]^=top;
        top<<=27;
        gx[12]^=(top<<6);
        gx[13]^=(top>>26);
        gx[14]^=top;
        gx[15]^=top;
        gx[17]^=top;
        x->len=18;
        if (gx[17]==0) mr_lzero(x);
        return;
    }

    if (M==1223 && A==255)
    {
        for (i=xl-1;i>=39;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-30]^=(w>>8);
            gx[i-31]^=(w<<24);
            gx[i-38]^=(w>>7);
            gx[i-39]^=(w<<25);
        } /* XORs= 4 shifts= 4 */
        top=gx[38]>>7; gx[0]^=top; top<<=7;
        gx[7]^=(top<<24);
        gx[8]^=(top>>8);
        gx[38]^=top;
        x->len=39;
        if (gx[38]==0) mr_lzero(x);
        return;
    }

#endif

#if MIRACL == 64
    if (M==1223 && A==255)
    {
        for (i=xl-1;i>=20;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-15]^=(w>>8);
            gx[i-16]^=(w<<56);
            gx[i-19]^=(w>>7);
            gx[i-20]^=(w<<57);
        }
        top=gx[19]>>7; gx[0]^=top; top<<=7;
        gx[3]^=(top<<56);
        gx[4]^=(top>>8);
        gx[19]^=top;
        x->len=20;
        if (gx[19]==0) mr_lzero(x);
        return;
    }

    if (M==379 && A==253 && B==251 && C==59)
    {
        for (i=xl-1;i>=6;i--)
        {
            w=gx[i]; gx[i]=0;
            gx[i-1]^=(w>>62);
            gx[i-2]^=(w<<2)^w;
            gx[i-5]^=(w>>59)^w;
            gx[i-6]^=(w<<5);
        } /* XORs= 6 shifts= 4 */
        top=gx[5]>>59; gx[0]^=top; top<<=59;
        gx[0]^=top;
        gx[3]^=(top<<2)^top;
        gx[4]^=(top>>62);
        gx[5]^=top;
        x->len=6;
        if (gx[5]==0) mr_lzero(x);
        return;
    }
#endif

    /* ---- generic reduction for any supported (M,A,B,C) ---- */

    k3=k4=rs3=ls3=rs4=ls4=0;
    k1=1+M/MIRACL; /* words from MSB to LSB */

    if (xl<=k1)
    { /* already short enough - maybe nothing to do */
        if (numbits(x)<=M) return;
    }

    rs1=M%MIRACL;    /* bit offset of degree M within its word */
    ls1=MIRACL-rs1;

    if (M-A < MIRACL)
    { /* slow way - modulus terms too close together for word folding */
        while (numbits(x)>=M+1)
        {
            copy(mr_mip->modulus,mr_mip->w7);
            shiftleftbits(mr_mip->w7,numbits(x)-M-1);
            add2(x,mr_mip->w7,x);
        }
        return;
    }

    k2=1+(M-A)/MIRACL; /* words from MSB to bit */
    rs2=(M-A)%MIRACL;
    ls2=MIRACL-rs2;

    if (B)
    { /* Pentanomial - two extra terms x^B and x^C */
        k3=1+(M-B)/MIRACL;
        rs3=(M-B)%MIRACL;
        ls3=MIRACL-rs3;

        k4=1+(M-C)/MIRACL;
        rs4=(M-C)%MIRACL;
        ls4=MIRACL-rs4;
    }

    for (i=xl-1;i>=k1;i--)
    { /* fold each whole word above the top field word back down,
         once per modulus term (the x^M term, x^A, and optionally x^B, x^C) */
        w=gx[i]; gx[i]=0;
        if (rs1==0) gx[i-k1+1]^=w;
        else
        {
            gx[i-k1+1]^=(w>>rs1);
            gx[i-k1]^=(w<<ls1);
        }
        if (rs2==0) gx[i-k2+1]^=w;
        else
        {
            gx[i-k2+1]^=(w>>rs2);
            gx[i-k2]^=(w<<ls2);
        }
        if (B)
        {
            if (rs3==0) gx[i-k3+1]^=w;
            else
            {
                gx[i-k3+1]^=(w>>rs3);
                gx[i-k3]^=(w<<ls3);
            }
            if (rs4==0) gx[i-k4+1]^=w;
            else
            {
                gx[i-k4+1]^=(w>>rs4);
                gx[i-k4]^=(w<<ls4);
            }
        }
    }

    top=gx[k1-1]>>rs1;   /* stray bits of degree >= M in the top word */

    if (top!=0)
    { /* fold the stray bits down in the same way */
        gx[0]^=top;
        top<<=rs1;

        if (rs2==0) gx[k1-k2]^=top;
        else
        {
            gx[k1-k2]^=(top>>rs2);
            if (k1>k2) gx[k1-k2-1]^=(top<<ls2);
        }
        if (B)
        {
            if (rs3==0) gx[k1-k3]^=top;
            else
            {
                gx[k1-k3]^=(top>>rs3);
                if (k1>k3) gx[k1-k3-1]^=(top<<ls3);
            }
            if (rs4==0) gx[k1-k4]^=top;
            else
            {
                gx[k1-k4]^=(top>>rs4);
                if (k1>k4) gx[k1-k4-1]^=(top<<ls4);
            }
        }
        gx[k1-1]^=top;   /* clear the stray bits from the top word */
    }
    x->len=k1;
    if (gx[k1-1]==0) mr_lzero(x);
}
1965
/* w = x + n for a small int n. In characteristic 2, addition is XOR,
   so only the lowest word is affected. */
void incr2(big x,int n,big w)
{
    if (w!=x) copy(x,w);
    if (n==0) return;            /* adding zero: nothing to do */

    if (w->len==0)
    { /* w was zero - result is the single word n */
        w->w[0]=n;
        w->len=1;
        return;
    }

    w->w[0]^=(mr_small)n;
    /* the XOR may have cancelled the only non-zero word */
    if (w->len==1 && w->w[0]==0) w->len=0;
}
1981
/* w = x*x mod f. Squares into the scratch register w0 and reduces
   there before copying out, so x and w may safely alias. */
void modsquare2(_MIPD_ big x,big w)
{ /* w=x*x mod f */
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif

    square2(x,mr_mip->w0);                 /* raw double-length square */
    reduce2(_MIPP_ mr_mip->w0,mr_mip->w0); /* reduce mod the field polynomial */
    copy(mr_mip->w0,w);
}
1992
1993 /* Experimental code for GF(2^103) modular multiplication *
1994 * Inspired by Robert Harley's ECDL code */
1995
1996 #ifdef SP103
1997
1998 #ifdef __GNUC__
1999 #include <xmmintrin.h>
2000 #else
2001 #include <emmintrin.h>
2002 #endif
2003
/* w = x*y mod x^103+x^9+1, SSE2/MMX specialisation for GF(2^103).
   Comb multiplication with a 4-bit window: a 16-entry table of the
   nibble multiples of x is built, then y is consumed 4 bits at a time.
   Each stanza below: looks up the two nibble products, shifts one left
   by 4 bits (with the s/m dance carrying bits across the 64-bit lane
   boundary, which _mm_slli_epi64 alone cannot do), XORs both into the
   running product, then byte-shifts the accumulators p (low) and q
   (high) into position. Finally the 206-bit product is reduced mod
   the trinomial using MMX 64-bit shifts. Statement order is critical
   throughout - do not reorder. */
void modmult2(_MIPD_ big x,big y,big w)
{
    int i,j;
    mr_small b;

    __m128i t[16];
    __m128i m,r,s,p,q,xe,xo;
    __m64 a3,a2,a1,a0,top;

    if (x==y)
    { /* squaring is cheaper */
        modsquare2(_MIPP_ x,w);
        return;
    }

    if (x->len==0 || y->len==0)
    { /* multiply by zero */
        zero(w);
        return;
    }

#ifdef MR_COUNT_OPS
    fpm2++;
#endif

    m=_mm_set_epi32(0,0,0xff<<24,0); /* shifting mask */

/* precompute a small table: t[i] = i*x for each 4-bit i */

    t[0]=_mm_set1_epi32(0);
    xe=_mm_set_epi32(0,x->w[2],0,x->w[0]);   /* even words of x */
    xo=_mm_set_epi32(0,x->w[3],0,x->w[1]);   /* odd words of x */
    t[1]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
    xe=_mm_slli_epi64(xe,1);
    xo=_mm_slli_epi64(xo,1);
    t[2]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
    t[3]=_mm_xor_si128(t[2],t[1]);
    xe=_mm_slli_epi64(xe,1);
    xo=_mm_slli_epi64(xo,1);
    t[4]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
    t[5]=_mm_xor_si128(t[4],t[1]);
    t[6]=_mm_xor_si128(t[4],t[2]);
    t[7]=_mm_xor_si128(t[4],t[3]);
    xe=_mm_slli_epi64(xe,1);
    xo=_mm_slli_epi64(xo,1);
    t[8]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
    t[9]=_mm_xor_si128(t[8],t[1]);
    t[10]=_mm_xor_si128(t[8],t[2]);
    t[11]=_mm_xor_si128(t[8],t[3]);
    t[12]=_mm_xor_si128(t[8],t[4]);
    t[13]=_mm_xor_si128(t[8],t[5]);
    t[14]=_mm_xor_si128(t[8],t[6]);
    t[15]=_mm_xor_si128(t[8],t[7]);

    b=y->w[0];

    i=b&0xf; j=(b>>4)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4); /* net shift left 4 */
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    p=q=r; q=_mm_srli_si128(q,1);

    i=(b>>8)&0xf; j=(b>>12)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,1);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    i=(b>>16)&0xf; j=(b>>20)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,2);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    i=(b>>24)&0xf; j=(b>>28); r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,3);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    b=y->w[1];

    i=(b)&0xf; j=(b>>4)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,4);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    i=(b>>8)&0xf; j=(b>>12)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,5);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    i=(b>>16)&0xf; j=(b>>20)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,6);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    i=(b>>24)&0xf; j=(b>>28); r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,7);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    b=y->w[2];

    i=(b)&0xf; j=(b>>4)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,8);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    i=(b>>8)&0xf; j=(b>>12)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,9);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    i=(b>>16)&0xf; j=(b>>20)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,10);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    i=(b>>24)&0xf; j=(b>>28); r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,11);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    b=y->w[3];

    i=(b)&0xf; j=(b>>4)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,12);
    p=_mm_xor_si128(p,r);

    q=_mm_srli_si128(q,4); /* only 103 bits, so we are done */

/* modular reduction - x^103+x^9+1 */

    a0=_mm_movepi64_pi64(p);                   /* product bits   0.. 63 */
    a1=_mm_movepi64_pi64(_mm_srli_si128(p,8)); /* product bits  64..127 */
    a2=_mm_movepi64_pi64(q);                   /* product bits 128..191 */
    a3=_mm_movepi64_pi64(_mm_srli_si128(q,8)); /* product bits 192..    */

    a2=_m_pxor(a2,_m_psrlqi(a3,39));
    a2=_m_pxor(a2,_m_psrlqi(a3,30));
    a1=_m_pxor(a1,_m_psllqi(a3,25));
    a1=_m_pxor(a1,_m_psllqi(a3,34));

    a1=_m_pxor(a1,_m_psrlqi(a2,39));
    a1=_m_pxor(a1,_m_psrlqi(a2,30));
    a0=_m_pxor(a0,_m_psllqi(a2,25));
    a0=_m_pxor(a0,_m_psllqi(a2,34));

    top=_m_psrlqi(a1,39);   /* bits of degree >= 103 left in a1 */
    a0=_m_pxor(a0,top);
    top=_m_psllqi(top,39);
    a0=_m_pxor(a0,_m_psrlqi(top,30));
    a1=_m_pxor(a1,top);     /* and clear them */

    if (w->len>4) zero(w);  /* result fits in 4 words - clear any old excess */

    w->w[0]=_m_to_int(a0);
    a0=_m_psrlqi(a0,32);
    w->w[1]=_m_to_int(a0);
    w->w[2]=_m_to_int(a1);
    a1=_m_psrlqi(a1,32);
    w->w[3]=_m_to_int(a1);

    w->len=4;
    if (w->w[3]==0) mr_lzero(w);
    _m_empty();   /* leave MMX state - required before any FP use */
}
2194
2195 #endif
2196
2197 #ifdef SP79
2198
2199 #ifdef __GNUC__
2200 #include <xmmintrin.h>
2201 #else
2202 #include <emmintrin.h>
2203 #endif
2204
/* w = x*y mod x^79+x^9+1, SSE2/MMX specialisation for GF(2^79).
   Same 4-bit-window comb scheme as the SP103 version above: build the
   16 nibble multiples of x, consume y a nibble at a time, with the
   s/m mask dance carrying the 4 bits that a 64-bit lane shift would
   drop across the lane boundary. p accumulates the low bytes of the
   product, q the high bytes. Statement order is critical - do not
   reorder. */
void modmult2(_MIPD_ big x,big y,big w)
{
    int i,j;
    mr_small b;

    __m128i t[16];
    __m128i m,r,s,p,q,xe,xo;
    __m64 a2,a1,a0,top;

    if (x==y)
    { /* squaring is cheaper */
        modsquare2(_MIPP_ x,w);
        return;
    }
#ifdef MR_COUNT_OPS
    fpm2++;
#endif
    if (x->len==0 || y->len==0)
    { /* multiply by zero */
        zero(w);
        return;
    }

    m=_mm_set_epi32(0,0,0xff<<24,0); /* shifting mask */

/* precompute a small table: t[i] = i*x for each 4-bit i */

    t[0]=_mm_set1_epi32(0);
    xe=_mm_set_epi32(0,x->w[2],0,x->w[0]);   /* even words of x */
    xo=_mm_set_epi32(0,0,0,x->w[1]);         /* odd word of x (only w[1] here) */
    t[1]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
    xe=_mm_slli_epi64(xe,1);
    xo=_mm_slli_epi64(xo,1);
    t[2]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
    t[3]=_mm_xor_si128(t[2],t[1]);
    xe=_mm_slli_epi64(xe,1);
    xo=_mm_slli_epi64(xo,1);
    t[4]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
    t[5]=_mm_xor_si128(t[4],t[1]);
    t[6]=_mm_xor_si128(t[4],t[2]);
    t[7]=_mm_xor_si128(t[4],t[3]);
    xe=_mm_slli_epi64(xe,1);
    xo=_mm_slli_epi64(xo,1);
    t[8]=_mm_xor_si128(xe,_mm_slli_si128(xo,4));
    t[9]=_mm_xor_si128(t[8],t[1]);
    t[10]=_mm_xor_si128(t[8],t[2]);
    t[11]=_mm_xor_si128(t[8],t[3]);
    t[12]=_mm_xor_si128(t[8],t[4]);
    t[13]=_mm_xor_si128(t[8],t[5]);
    t[14]=_mm_xor_si128(t[8],t[6]);
    t[15]=_mm_xor_si128(t[8],t[7]);

    b=y->w[0];

    i=b&0xf; j=(b>>4)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4); /* net shift left 4 */
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    p=q=r; q=_mm_srli_si128(q,1);

    i=(b>>8)&0xf; j=(b>>12)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,1);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    i=(b>>16)&0xf; j=(b>>20)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,2);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    i=(b>>24)&0xf; j=(b>>28); r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,3);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    b=y->w[1];

    i=(b)&0xf; j=(b>>4)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,4);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    i=(b>>8)&0xf; j=(b>>12)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,5);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    i=(b>>16)&0xf; j=(b>>20)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,6);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    i=(b>>24)&0xf; j=(b>>28); r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,7);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    b=y->w[2];

    i=(b)&0xf; j=(b>>4)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,8);
    p=_mm_xor_si128(p,r); q=_mm_srli_si128(q,1);

    i=(b>>8)&0xf; j=(b>>12)&0xf; r=t[j];
    s=_mm_and_si128(r,m); r=_mm_slli_epi64(r,4);
    s=_mm_slli_si128(s,1); s=_mm_srli_epi64(s,4);
    r=_mm_xor_si128(r,s); r=_mm_xor_si128(r,t[i]);
    q=_mm_xor_si128(q,r); r=_mm_slli_si128(r,9);
    p=_mm_xor_si128(p,r);

    q=_mm_srli_si128(q,7); /* only 79 bits, so we are done */

/* modular reduction - x^79+x^9+1 */

    a0=_mm_movepi64_pi64(p);                   /* product bits  0.. 63 */
    a1=_mm_movepi64_pi64(_mm_srli_si128(p,8)); /* product bits 64..127 */
    a2=_mm_movepi64_pi64(q);                   /* product bits 128..   */

    a1=_m_pxor(a1,_m_psrlqi(a2,15));
    a1=_m_pxor(a1,_m_psrlqi(a2,6));
    a0=_m_pxor(a0,_m_psllqi(a2,49));
    a0=_m_pxor(a0,_m_psllqi(a2,58));

    top=_m_psrlqi(a1,15);   /* bits of degree >= 79 left in a1 */
    a0=_m_pxor(a0,top);
    top=_m_psllqi(top,15);
    a0=_m_pxor(a0,_m_psrlqi(top,6));
    a1=_m_pxor(a1,top);     /* and clear them */

    w->w[2]=_m_to_int(a1);

    if (w->len>3)
    { /* Yes I know its crazy, but its needed to fix the broken /O2 optimizer */
        for (i=3;i<w->len;i++) w->w[i]=0;
    }

    w->w[0]=_m_to_int(a0);
    a0=_m_psrlqi(a0,32);
    w->w[1]=_m_to_int(a0);

    w->len=3;
    if (w->w[2]==0) mr_lzero(w);
    _m_empty();   /* leave MMX state - required before any FP use */
}
2366
2367 #endif
2368
2369
2370 #ifndef SP103
2371 #ifndef SP79
2372 /*#ifndef SP271 */
2373
/* w = x*y mod f: general-purpose field multiplication. Special cases
   (zero, one, squaring) are peeled off first; otherwise a full
   double-length product is formed and reduced. */
void modmult2(_MIPD_ big x,big y,big w)
{ /* w=x*y mod f */
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif

    if (x==NULL || y==NULL)
    { /* a missing operand counts as zero */
        zero(w);
        return;
    }

    if (x==y)
    { /* same operand twice - squaring is cheaper */
        modsquare2(_MIPP_ x,w);
        return;
    }

    if (y->len==0)
    { /* multiply by zero */
        zero(w);
        return;
    }

    if (y->len==1 && y->w[0]==1)
    { /* multiply by one */
        copy(x,w);
        return;
    }

#ifdef MR_COUNT_OPS
    fpm2++;
#endif

    multiply2(_MIPP_ x,y,mr_mip->w0);      /* full unreduced product in w0 */
    reduce2(_MIPP_ mr_mip->w0,mr_mip->w0); /* reduce mod the field polynomial */
    copy(mr_mip->w0,w);
}
2415
2416 #endif
2417 #endif
2418 /*#endif*/
2419
2420 /* Will be *much* faster if M,A,(B and C) are all odd */
2421 /* This could/should be optimized for a particular irreducible polynomial and fixed A, B and C */
2422
/* y = sqrt(x) in GF(2^m). When M, A (and B, C) are all odd the fast
   Fong-Hankerson-Lopez-Menezes method applies: split x into its even-
   and odd-indexed bits (compacted via Harley's table trick with the
   evens[]/odds[] lookup tables), then y = even_part + sqrt(t)*odd_part,
   where the sqrt(t) multiply reduces to a few shifted XORs determined
   by the modulus terms. Otherwise fall back to squaring m-1 times
   (since squaring is a field automorphism of order m).
   The #if sections are hand-unrolled versions of the generic loop at
   the bottom for specific fields/word sizes - keep them in agreement. */
void sqroot2(_MIPD_ big x,big y)
{
    int i,M,A,B,C;
    int k,n,h,s,a,aw,ab,bw,bb,cw,cb;
#if MIRACL != 32
    int mm,j;
#endif
    mr_small *wk,w,we,wo;
    BOOL slow;
/* Using Harley's trick */
    static const mr_small evens[16]=
    {0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15};
    static const mr_small odds[16]=
    {0,4,1,5,8,12,9,13,2,6,3,7,10,14,11,15};

#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    M=mr_mip->M;
    A=mr_mip->AA;
    if (A==0)
    { /* no irreducible polynomial has been set up */
        mr_berror(_MIPP_ MR_ERR_NO_BASIS);
        return;
    }
    B=mr_mip->BB;
    C=mr_mip->CC;

    slow=FALSE;
    if (B)
    { /* pentanomial: fast method needs all exponents odd */
        if (M%2!=1 || A%2!=1 || B%2!=1 || C%2!=1) slow=TRUE;
    }
    else
    { /* trinomial */
        if (M%2!=1 || A%2!=1) slow=TRUE;
    }

    if (slow)
    { /* fall back: sqrt(x) = x^(2^(m-1)), i.e. square m-1 times */
        copy(x,y);
        for (i=1;i<mr_mip->M;i++)
            modsquare2(_MIPP_ y,y);
        return;
    }

    bb=cb=cw=bw=0;
/* M, A (B and C) are all odd - so use fast
   Fong, Hankerson, Lopez and Menezes method */

    if (x==y)
    { /* work from a copy so y can be rebuilt in place */
        copy (x,mr_mip->w0);
        wk=mr_mip->w0->w;
    }
    else
    {
        wk=x->w;
    }
    zero(y);

#if MIRACL==8
    if (M==271 && A==207 && B==175 && C==111)
    {
        y->len=34;
        for (i=0;i<34;i++)
        { /* process two source words per pass: n indexes the packed output */
            n=i/2;
            w=wk[i];

            we=evens[((w&0x5)+((w&0x50)>>3))];
            wo=odds[((w&0xA)+((w&0xA0)>>5))];

            i++;
            w=wk[i];

            we|=evens[((w&0x5)+((w&0x50)>>3))]<<4;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<4;

            y->w[n]^=we;
            y->w[n+17]=wo;

            y->w[n+13]^=wo;
            y->w[n+11]^=wo;
            y->w[n+7]^=wo;
        }
        if (y->w[33]==0) mr_lzero(y);
        return;
    }
#endif

#if MIRACL==32
    if (M==1223 && A==255)
    {
        y->len=39;
        for (i=0;i<39;i++)
        { /* compact even/odd bits of wk[i], one byte at a time */
            n=i/2;
            w=wk[i];

            we=evens[((w&0x5)+((w&0x50)>>3))];
            wo=odds[((w&0xA)+((w&0xA0)>>5))];
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<4;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<4;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<8;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<8;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<12;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<12;

            i++;
            if (i<39)
            { /* odd word count - last pass has no partner word */
                w=wk[i];

                we|=evens[((w&0x5)+((w&0x50)>>3))]<<16;
                wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<16;
                w>>=8;
                we|=evens[((w&0x5)+((w&0x50)>>3))]<<20;
                wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<20;
                w>>=8;
                we|=evens[((w&0x5)+((w&0x50)>>3))]<<24;
                wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<24;
                w>>=8;
                we|=evens[((w&0x5)+((w&0x50)>>3))]<<28;
                wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<28;
            }
            y->w[n]^=we;

            y->w[20+n-1]^=wo<<4;
            y->w[20+n]^=wo>>28;

            y->w[n+4]^=wo;
        }
        if (y->w[38]==0) mr_lzero(y);
        return;
    }

#endif

#if MIRACL==64
    if (M==1223 && A==255)
    {
        y->len=20;
        for (i=0;i<20;i++)
        { /* compact even/odd bits, one byte at a time, two words per pass */
            n=i/2;
            w=wk[i];

            we=evens[((w&0x5)+((w&0x50)>>3))];
            wo=odds[((w&0xA)+((w&0xA0)>>5))];
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<4;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<4;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<8;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<8;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<12;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<12;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<16;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<16;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<20;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<20;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<24;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<24;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<28;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<28;

            i++;
            w=wk[i];

            we|=evens[((w&0x5)+((w&0x50)>>3))]<<32;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<32;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<36;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<36;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<40;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<40;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<44;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<44;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<48;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<48;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<52;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<52;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<56;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<56;
            w>>=8;
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<60;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<60;

            y->w[n]^=we;

            y->w[10+n-1]^=wo<<36;
            y->w[10+n]^=wo>>28;

            y->w[n+2]^=wo;
        }
        if (y->w[19]==0) mr_lzero(y);
        return;
    }

#endif

    /* ---- generic path ---- */

    k=1+(M/MIRACL);   /* words in a field element */
    h=(k+1)/2;        /* word offset where the odd-bit half lands */

    a=(A+1)/2;        /* shift (in bits) applied to the odd half for term x^A */
    aw=a/MIRACL;
    ab=a%MIRACL;

    if (B)
    { /* pentanomial: shifts for the x^B and x^C terms too */
        a=(B+1)/2;
        bw=a/MIRACL;
        bb=a%MIRACL;

        a=(C+1)/2;
        cw=a/MIRACL;
        cb=a%MIRACL;
    }
    s=h*MIRACL-1-(M-1)/2;   /* bit misalignment of the odd half */

    y->len=k;
    for (i=0;i<k;i++)
    { /* split wk[i] (and its partner word) into even bits (we) and
         odd bits (wo), each compacted into half-width */
        n=i/2;
        w=wk[i];

#if MIRACL == 32

        we=evens[((w&0x5)+((w&0x50)>>3))];
        wo=odds[((w&0xA)+((w&0xA0)>>5))];
        w>>=8;

        we|=evens[((w&0x5)+((w&0x50)>>3))]<<4;
        wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<4;
        w>>=8;

        we|=evens[((w&0x5)+((w&0x50)>>3))]<<8;
        wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<8;
        w>>=8;

        we|=evens[((w&0x5)+((w&0x50)>>3))]<<12;
        wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<12;

#else
        mm=0;
        we=wo=0;
        for (j=0;j<MIRACL/8;j++)
        { /* one byte of source -> one nibble of each half */
            we|=evens[((w&0x5)+((w&0x50)>>3))]<<mm;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<mm;
            mm+=4; w>>=8;
        }

#endif
        i++;
        if (i<k)
        { /* partner word fills the top half of we/wo */
            w=wk[i];
#if MIRACL == 32

            we|=evens[((w&0x5)+((w&0x50)>>3))]<<16;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<16;
            w>>=8;

            we|=evens[((w&0x5)+((w&0x50)>>3))]<<20;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<20;
            w>>=8;

            we|=evens[((w&0x5)+((w&0x50)>>3))]<<24;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<24;
            w>>=8;

            we|=evens[((w&0x5)+((w&0x50)>>3))]<<28;
            wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<28;


#else
            for (j=0;j<MIRACL/8;j++)
            {
                we|=evens[((w&0x5)+((w&0x50)>>3))]<<mm;
                wo|=odds[((w&0xA)+((w&0xA0)>>5))]<<mm;
                mm+=4; w>>=8;
            }

#endif
        }
        y->w[n]^=we;   /* even bits contribute directly */

        if (s==0) y->w[h+n]=wo;
        else
        {
            y->w[h+n-1]^=wo<<(MIRACL-s);
            y->w[h+n]^=wo>>s; /* abutt odd bits to even */
        }
        if (ab==0) y->w[n+aw]^=wo;
        else
        { /* odd bits shifted for the x^A modulus term */
            y->w[n+aw]^=wo<<ab;
            y->w[n+aw+1]^=wo>>(MIRACL-ab);
        }
        if (B)
        { /* and for the x^B, x^C terms of a pentanomial */
            if (bb==0) y->w[n+bw]^=wo;
            else
            {
                y->w[n+bw]^=wo<<bb;
                y->w[n+bw+1]^=wo>>(MIRACL-bb);
            }
            if (cb==0) y->w[n+cw]^=wo;
            else
            {
                y->w[n+cw]^=wo<<cb;
                y->w[n+cw+1]^=wo>>(MIRACL-cb);
            }
        }
    }

    if (y->w[k-1]==0) mr_lzero(y);
}
2756
2757 #ifndef MR_STATIC
2758
void power2(_MIPD_ big x,int m,big w)
{ /* w = x^m mod f, by right-to-left square-and-multiply.
   * Could be optimised a lot, but not time critical for me. */
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    copy(x,mr_mip->w1); /* running square; taking a copy also makes w==x safe */

    convert(_MIPP_ 1,w);
    while (m!=0)
    {
        if (m%2!=0)
            modmult2(_MIPP_ w,mr_mip->w1,w);    /* fold in current bit */
        m/=2;
        if (m!=0)
            modsquare2(_MIPP_ mr_mip->w1,mr_mip->w1); /* skip final wasted squaring */
    }
}
2776
2777 #endif
2778
2779 /* Euclidean Algorithm */
2780
/* Euclidean inversion in GF(2^m): computes w = 1/x mod f, where f is the
 * field polynomial in mr_mip->modulus. Returns FALSE if x is zero (no
 * inverse exists), TRUE otherwise. Maintains the invariants
 *   w1*x == w3 (mod f) and w2*x == w4 (mod f)
 * while reducing the degree of w3/w4 until w3 == 1, at which point w1
 * is the inverse. n3/n4 track numbits() of w3/w4 so that degrees are
 * never recomputed from scratch inside the loop. */
BOOL inverse2(_MIPD_ big x,big w)
{
    mr_small bit;
    int i,j,n,n3,k,n4,mb,mw;
    big t;
    BOOL newword;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    if (size(x)==0) return FALSE;   /* zero has no inverse */

    convert(_MIPP_ 1,mr_mip->w1);
    zero(mr_mip->w2);
    copy(x,mr_mip->w3);
    copy(mr_mip->modulus,mr_mip->w4);

    n3=numbits(mr_mip->w3);
    n4=mr_mip->M+1;                 /* bit length of f is always M+1 */

#ifdef MR_COUNT_OPS
    fpi2++;
#endif

    while (n3!=1)                   /* until w3 is the constant polynomial 1 */
    {
        j=n3-n4;                    /* degree difference */

        if (j<0)
        {   /* keep w3 as the higher-degree polynomial: swap pairs (w3,w1)<->(w4,w2) */
            t=mr_mip->w3; mr_mip->w3=mr_mip->w4; mr_mip->w4=t;
            t=mr_mip->w1; mr_mip->w1=mr_mip->w2; mr_mip->w2=t;
            j=-j; n=n3; n3=n4; n4=n;
        }

        mw=j/MIRACL; mb=j%MIRACL;   /* split shift j into whole words + bits */

        if (n3<MIRACL)
        {   /* single-word fast path: w3 ^= w4<<j, then rescan for top bit */
            mr_mip->w3->w[0]^=mr_mip->w4->w[0]<<mb;
            n3--;                   /* top bits cancel by construction of j */

            bit=((mr_small)1<<((n3-1)%MIRACL));

            while (!(mr_mip->w3->w[0]&bit))
            {   /* scan down for the new highest set bit */
                n3--;
                bit>>=1;
            }
        }
        else
        {   /* multi-word: w3 ^= w4<<j, word by word */
            k=mr_mip->w3->len;
            if (mb==0)
            {   /* word-aligned shift */
                for (i=mw;i<k;i++)
                    mr_mip->w3->w[i]^=mr_mip->w4->w[i-mw];
            }
            else
            {   /* unaligned: combine adjacent source words */
                mr_mip->w3->w[mw]^=mr_mip->w4->w[0]<<mb;
                for (i=mw+1;i<k;i++)
                    mr_mip->w3->w[i]^=((mr_mip->w4->w[i-mw]<<mb) | (mr_mip->w4->w[i-mw-1]>>(MIRACL-mb)));
            }

            newword=FALSE;
            while (mr_mip->w3->w[k-1]==0) {k--; newword=TRUE;} /* drop zeroed top words */

/*
            bit=mr_mip->w3->w[k-1];
            ASM mov eax,bit
            ASM bsr ecx,eax
            ASM mov shift,ecx
            n3=(k-1)*MIRACL+shift+1;

*/
            if (newword)
            {   /* whole top word(s) vanished: restart scan from word's top bit */
                bit=TOPBIT;
                n3=k*MIRACL;
            }
            else
            {   /* only the leading bit cancelled: start scan one bit down */
                n3--;
                bit=((mr_small)1<<((n3-1)%MIRACL));
            }
            while (!(mr_mip->w3->w[k-1]&bit))
            {   /* scan down to the new top bit of w3 */
                n3--;
                bit>>=1;
            }

            mr_mip->w3->len=k;
        }

        /* mirror the update on the cofactor: w1 ^= w2<<j.
           k bounds the highest word the shifted w2 can touch */
        k=mr_mip->w2->len+mw+1;
        if ((int)mr_mip->w1->len>k) k=mr_mip->w1->len;

        if (mb==0)
        {
            for (i=mw;i<k;i++)
                mr_mip->w1->w[i]^=mr_mip->w2->w[i-mw];
        }
        else
        {
            mr_mip->w1->w[mw]^=mr_mip->w2->w[0]<<mb;
            for (i=mw+1;i<k;i++)
                mr_mip->w1->w[i]^=((mr_mip->w2->w[i-mw]<<mb) | (mr_mip->w2->w[i-mw-1]>>(MIRACL-mb)));
        }
        while (mr_mip->w1->w[k-1]==0) k--;  /* normalise length */
        mr_mip->w1->len=k;
    }

    copy(mr_mip->w1,w);  /* w3==1, so w1 == 1/x mod f */
    return TRUE;
}
2896
2897 /* Schroeppel, Orman, O'Malley, Spatscheck *
2898 * "Almost Inverse" algorithm, Crypto '95 *
2899 * More optimization here and in-lining would *
2900 * speed up AFFINE mode. I observe that *
2901 * pentanomials would be more efficient if C *
2902 * were greater */
2903
2904 /*
2905 BOOL inverse2(_MIPD_ big x,big w)
2906 {
2907 mr_small lsw,*gw;
2908 int i,n,bits,step,n3,n4,k;
2909 int k1,k2,k3,k4,ls1,ls2,ls3,ls4,rs1,rs2,rs3,rs4;
2910 int M,A,B,C;
2911 big t;
2912 #ifdef MR_OS_THREADS
2913 miracl *mr_mip=get_mip();
2914 #endif
2915 if (size(x)==0) return FALSE;
2916
2917 M=mr_mip->M;
2918 A=mr_mip->AA;
2919 if (A==0)
2920 {
2921 mr_berror(_MIPP_ MR_ERR_NO_BASIS);
2922 return FALSE;
2923 }
2924
2925 B=mr_mip->BB;
2926 C=mr_mip->CC;
2927 convert(_MIPP_ 1,mr_mip->w1);
2928 zero(mr_mip->w2);
2929 copy(x,mr_mip->w3);
2930 copy(mr_mip->modulus,mr_mip->w4);
2931
2932 bits=zerobits(mr_mip->w3);
2933 shiftrightbits(mr_mip->w3,bits);
2934 k=bits;
2935 n3=numbits(mr_mip->w3);
2936 n4=M+1;
2937
2938 if (n3>1) forever
2939 {
2940 if (n3<n4)
2941 {
2942 t=mr_mip->w3; mr_mip->w3=mr_mip->w4; mr_mip->w4=t;
2943 t=mr_mip->w1; mr_mip->w1=mr_mip->w2; mr_mip->w2=t;
2944 n=n3; n3=n4; n4=n;
2945 }
2946
2947 add2(mr_mip->w3,mr_mip->w4,mr_mip->w3);
2948
2949 add2(mr_mip->w1,mr_mip->w2,mr_mip->w1);
2950
2951 if (n3==n4) n3=numbits(mr_mip->w3);
2952 bits=zerobits(mr_mip->w3);
2953 k+=bits;
2954 n3-=bits;
2955 if (n3==1) break;
2956 shiftrightbits(mr_mip->w3,bits);
2957 shiftleftbits(mr_mip->w2,bits);
2958 }
2959
2960 copy(mr_mip->w1,w);
2961
2962 if (k==0)
2963 {
2964 mr_lzero(w);
2965 return TRUE;
2966 }
2967 step=MIRACL;
2968
2969 if (A<MIRACL) step=A;
2970
2971 k1=1+M/MIRACL;
2972 rs1=M%MIRACL;
2973 ls1=MIRACL-rs1;
2974
2975 k2=1+A/MIRACL;
2976 rs2=A%MIRACL;
2977 ls2=MIRACL-rs2;
2978
2979 if (B)
2980 {
2981 if (C<MIRACL) step=C;
2982
2983 k3=1+B/MIRACL;
2984 rs3=B%MIRACL;
2985 ls3=MIRACL-rs3;
2986
2987 k4=1+C/MIRACL;
2988 rs4=C%MIRACL;
2989 ls4=MIRACL-rs4;
2990 }
2991
2992 gw=w->w;
2993 while (k>0)
2994 {
2995 if (k>step) n=step;
2996 else n=k;
2997
2998 if (n==MIRACL) lsw=gw[0];
2999 else lsw=gw[0]&(((mr_small)1<<n)-1);
3000
3001 w->len=k1;
3002 if (rs1==0) gw[k1-1]^=lsw;
3003 else
3004 {
3005 w->len++;
3006 gw[k1]^=(lsw>>ls1);
3007 gw[k1-1]^=(lsw<<rs1);
3008 }
3009 if (rs2==0) gw[k2-1]^=lsw;
3010 else
3011 {
3012 gw[k2]^=(lsw>>ls2);
3013 gw[k2-1]^=(lsw<<rs2);
3014 }
3015 if (B)
3016 {
3017 if (rs3==0) gw[k3-1]^=lsw;
3018 else
3019 {
3020 gw[k3]^=(lsw>>ls3);
3021 gw[k3-1]^=(lsw<<rs3);
3022 }
3023 if (rs4==0) gw[k4-1]^=lsw;
3024 else
3025 {
3026 gw[k4]^=(lsw>>ls4);
3027 gw[k4-1]^=(lsw<<rs4);
3028 }
3029 }
3030 shiftrightbits(w,n);
3031 k-=n;
3032 }
3033 mr_lzero(w);
3034 return TRUE;
3035 }
3036
3037 */
3038
multi_inverse2(_MIPD_ int m,big * x,big * w)3039 BOOL multi_inverse2(_MIPD_ int m,big *x,big *w)
3040 { /* find w[i]=1/x[i] mod f, for i=0 to m-1 */
3041 int i;
3042 #ifdef MR_OS_THREADS
3043 miracl *mr_mip=get_mip();
3044 #endif
3045 if (m==0) return TRUE;
3046 if (m<0) return FALSE;
3047
3048 if (x==w)
3049 {
3050 mr_berror(_MIPP_ MR_ERR_BAD_PARAMETERS);
3051 return FALSE;
3052 }
3053 if (m==1)
3054 {
3055 inverse2(_MIPP_ x[0],w[0]);
3056 return TRUE;
3057 }
3058 convert(_MIPP_ 1,w[0]);
3059 copy(x[0],w[1]);
3060 for (i=2;i<m;i++)
3061 modmult2(_MIPP_ w[i-1],x[i-1],w[i]);
3062
3063 modmult2(_MIPP_ w[m-1],x[m-1],mr_mip->w6);
3064 if (size(mr_mip->w6)==0)
3065 {
3066 mr_berror(_MIPP_ MR_ERR_DIV_BY_ZERO);
3067 return FALSE;
3068 }
3069
3070 inverse2(_MIPP_ mr_mip->w6,mr_mip->w6); /* y=1/y */
3071
3072 copy(x[m-1],mr_mip->w5);
3073 modmult2(_MIPP_ w[m-1],mr_mip->w6,w[m-1]);
3074
3075 for (i=m-2;;i--)
3076 {
3077 if (i==0)
3078 {
3079 modmult2(_MIPP_ mr_mip->w5,mr_mip->w6,w[0]);
3080 break;
3081 }
3082 modmult2(_MIPP_ w[i],mr_mip->w5,w[i]);
3083 modmult2(_MIPP_ w[i],mr_mip->w6,w[i]);
3084 modmult2(_MIPP_ mr_mip->w5,x[i],mr_mip->w5);
3085 }
3086 return TRUE;
3087 }
3088
3089 #ifndef MR_STATIC
3090
int trace2(_MIPD_ big x)
{ /* trace of x: accumulate x + x^2 + x^4 + ... over M-1 squarings;
   * the result of the sum is 0 or 1, read from the low bit */
    int r;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    copy(x,mr_mip->w1);
    for (r=mr_mip->M-1;r>0;r--)
    {   /* w1 = w1^2 + x each round */
        modsquare2(_MIPP_ mr_mip->w1,mr_mip->w1);
        add2(mr_mip->w1,x,mr_mip->w1);
    }
    return (int)(mr_mip->w1->w[0]&1);
}
3105
3106 #endif
3107
3108 #ifndef MR_NO_RAND
3109
void rand2(_MIPD_ big x)
{ /* random field element: fill every word from the PRNG,
   * strip leading zeros, then reduce mod the field polynomial */
    int j,words;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    zero(x);
    words=1+mr_mip->M/MIRACL;
    x->len=words;
    for (j=words;j>0;j--) x->w[j-1]=brand(_MIPPO_ );
    mr_lzero(x);
    reduce2(_MIPP_ x,x);
}
3123
3124 #endif
3125
int parity2(big x)
{ /* least significant bit of x */
    if (x->len==0) return 0;    /* zero is stored with zero length */
    return (int)(x->w[0]&1);
}
3131
void halftrace2(_MIPD_ big b,big w)
{ /* half-trace of b: w accumulates b raised to successive powers of 4,
   * via (M-1)/2 rounds of w = w^4 + b. Only defined when M is odd;
   * does nothing for even M. */
    int round,M;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif

    M=mr_mip->M;
    if (M%2==0) return;     /* half-trace needs odd extension degree */

    copy(b,mr_mip->w1);     /* keep pristine b - w may alias b */
    copy(b,w);

    for (round=(M-1)/2;round>0;round--)
    {
        modsquare2(_MIPP_ w,w);
        modsquare2(_MIPP_ w,w);     /* w = w^4 */
        add2(w,mr_mip->w1,w);       /* ... + b */
    }
}
3152
BOOL quad2(_MIPD_ big b,big w)
{ /* Solves x^2 + x = b for a root w                 *
   * returns TRUE if a solution exists               *
   * the "other" solution is w+1                     *
   * Odd M: the root is simply the half-trace of b.  *
   * Even M: try trial elements (random, or counting *
   * up in MR_NO_RAND builds) until one yields a     *
   * usable candidate, then verify it.               */
    int i,M;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif

    M=mr_mip->M;
    copy(b,mr_mip->w1);                  /* keep b safe - w may alias b */
    if (M%2==1) halftrace2(_MIPP_ b,w); /* M is odd, so its the Half-Trace */

    else
    {
        zero(mr_mip->w2);
        forever
        {   /* w2 = trial element for this attempt */
#ifndef MR_NO_RAND
            rand2(_MIPP_ mr_mip->w2);
#else
            incr(_MIPP_ mr_mip->w2,1,mr_mip->w2);  /* deterministic fallback */
#endif
            /* build candidate root in w; w3 accumulates a trace-like sum
               of the trial element - NOTE(review): this looks like the
               standard even-degree solver of IEEE P1363 A.4.7, where the
               attempt is only usable if the final w3 is non-zero */
            zero(w);
            copy(mr_mip->w2,mr_mip->w3);
            for (i=1;i<M;i++)
            {
                modsquare2(_MIPP_ mr_mip->w3,mr_mip->w3);
                modmult2(_MIPP_ mr_mip->w3,mr_mip->w1,mr_mip->w4);
                modsquare2(_MIPP_ w,w);
                add2(w,mr_mip->w4,w);
                add2(mr_mip->w3,mr_mip->w2,mr_mip->w3);
            }
            if (size(mr_mip->w3)!=0) break;  /* usable trial found */
        }
    }

    /* verify: does w^2 + w really equal b? If not, no root exists */
    copy(w,mr_mip->w2);
    modsquare2(_MIPP_ mr_mip->w2,mr_mip->w2);
    add2(mr_mip->w2,w,mr_mip->w2);
    if (mr_compare(mr_mip->w1,mr_mip->w2)==0) return TRUE;
    return FALSE;
}
3196
3197 #ifndef MR_STATIC
3198
void gf2m_dotprod(_MIPD_ int n,big *x,big *y,big w)
{ /* dot product w = sum of x[i]*y[i] mod f - products are accumulated
   * unreduced and reduced only once at the end */
    int j=0;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    mr_mip->check=OFF;      /* permit double-length intermediates */
    zero(mr_mip->w5);

    while (j<n)
    {
        multiply2(_MIPP_ x[j],y[j],mr_mip->w0);
        add2(mr_mip->w5,mr_mip->w0,mr_mip->w5);
        j++;
    }

    reduce2(_MIPP_ mr_mip->w5,mr_mip->w5);  /* the only reduction */
    copy(mr_mip->w5,w);

    mr_mip->check=ON;
}
3219
3220 #endif
3221
BOOL prepare_basis(_MIPD_ int m,int a,int b,int c,BOOL check)
{ /* Install the field GF(2^m) defined by the irreducible trinomial
   * t^m + t^a + 1 (pass b=c=0) or pentanomial t^m + t^a + t^b + t^c + 1.
   * If check is TRUE the polynomial is also tested for irreducibility.
   * Returns FALSE on bad parameters, word overflow, or a reducible
   * polynomial. */
    int i,k,sh;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    if (mr_mip->ERNUM) return FALSE;

    if (b==0) c=0;   /* trinomial: c is meaningless, force it to 0 */
    if (m==mr_mip->M && a==mr_mip->AA && b==mr_mip->BB && c==mr_mip->CC)
        return TRUE; /* its already prepared... */

    MR_IN(138)
    /* require m > a > b; NOTE(review): c is not range-checked against b
       or 0 here - pentanomial callers must supply b > c > 0 themselves */
    if (m <=0 || a<=0 || a>=m || b>=a)
    {
        mr_berror(_MIPP_ MR_ERR_BAD_MODULUS);
        MR_OUT
        return FALSE;
    }

    mr_mip->M=m;
    mr_mip->AA=a;
    mr_mip->BB=0;        /* assume trinomial until b proves otherwise */
    mr_mip->CC=0;
    zero(mr_mip->modulus);
    convert(_MIPP_ 1,mr_mip->one);

    k=1+m/MIRACL;        /* words needed to hold a degree-m polynomial */

    if (k>mr_mip->nib)
    {   /* field too large for the allocated big-number size */
        mr_berror(_MIPP_ MR_ERR_OVERFLOW);
        MR_OUT
        return FALSE;
    }

    /* set the modulus bits: t^m, the constant 1, t^a, and optionally t^b, t^c */
    mr_mip->modulus->len=k;
    sh=m%MIRACL;
    mr_mip->modulus->w[k-1]=((mr_small)1<<sh);
    mr_mip->modulus->w[0]^=1;
    mr_mip->modulus->w[a/MIRACL]^=((mr_small)1<<(a%MIRACL));
    if (b!=0)
    {
        mr_mip->BB=b;
        mr_mip->CC=c;
        mr_mip->modulus->w[b/MIRACL]^=((mr_small)1<<(b%MIRACL));
        mr_mip->modulus->w[c/MIRACL]^=((mr_small)1<<(c%MIRACL));
    }

    if (!check)
    {
        MR_OUT
        return TRUE;
    }

    /* check for irreducibility of basis: verify gcd(t^(2^i)+t, f) == 1
       for i = 1 .. m/2, by repeated modular squaring of t */
    zero(mr_mip->w4);
    mr_mip->w4->len=1;
    mr_mip->w4->w[0]=2; /* f(t) = t */
    for (i=1;i<=m/2;i++)
    {
        modsquare2(_MIPP_ mr_mip->w4,mr_mip->w4);  /* w4 = t^(2^i) mod f */
        incr2(mr_mip->w4,2,mr_mip->w5);            /* w5 = w4 + t */
        gcd2(_MIPP_ mr_mip->w5,mr_mip->modulus,mr_mip->w6);
        if (size(mr_mip->w6)!=1)
        {   /* non-trivial common factor => f is reducible */
            mr_berror(_MIPP_ MR_ERR_NOT_IRREDUC);
            MR_OUT
            return FALSE;
        }
    }

    MR_OUT
    return TRUE;
}
3298
3299 #endif
3300