1 /*
2 * lbn16.c - Low-level bignum routines, 16-bit version.
3 *
4 * Copyright (c) 1995 Colin Plumb. All rights reserved.
5 *
6 * NOTE: the magic constants "16" and "32" appear in many places in this
7 * file, including inside identifiers. Because it is not possible to
8 * ask "#ifdef" of a macro expansion, it is not possible to use the
9 * preprocessor to conditionalize these properly. Thus, this file is
10 * intended to be edited with textual search and replace to produce
11 * alternate word size versions. Any reference to the number of bits
12 * in a word must be the string "16", and that string must not appear
13 * otherwise. Any reference to twice this number must appear as "32",
14 * which likewise must not appear otherwise. Is that clear?
15 *
16 * Remember, when doubling the bit size replace the larger number (32)
17 * first, then the smaller (16). When halving the bit size, do the
18 * opposite. Otherwise, things will get wierd. Also, be sure to replace
19 * every instance that appears. (:%s/foo/bar/g in vi)
20 *
21 * These routines work with a pointer to the least-significant end of
22 * an array of WORD16s. The BIG(x), LITTLE(y) and BIGLTTLE(x,y) macros
23 * defined in lbn.h (which expand to x on a big-edian machine and y on a
24 * little-endian machine) are used to conditionalize the code to work
25 * either way. If you have no assembly primitives, it doesn't matter.
26 * Note that on a big-endian machine, the least-significant-end pointer
27 * is ONE PAST THE END. The bytes are ptr[-1] through ptr[-len].
28 * On little-endian, they are ptr[0] through ptr[len-1]. This makes
29 * perfect sense if you consider pointers to point *between* bytes rather
30 * than at them.
31 *
32 * Because the array index values are unsigned integers, ptr[-i]
33 * may not work properly, since the index -i is evaluated as an unsigned,
34 * and if pointers are wider, zero-extension will produce a positive
35 * number rahter than the needed negative. The expression used in this
36 * code, *(ptr-i) will, however, work. (The array syntax is equivalent
37 * to *(ptr+-i), which is a pretty subtle difference.)
38 *
39 * Many of these routines will get very unhappy if fed zero-length inputs.
40 * They use assert() to enforce this. An higher layer of code must make
41 * sure that these aren't called with zero-length inputs.
42 *
43 * Any of these routines can be replaced with more efficient versions
44 * elsewhere, by just #defining their names. If one of the names
45 * is #defined, the C code is not compiled in and no declaration is
46 * made. Use the BNINCLUDE file to do that. Typically, you compile
47 * asm subroutines with the same name and just, e.g.
48 * #define lbnMulAdd1_16 lbnMulAdd1_16
49 *
50 * If you want to write asm routines, start with lbnMulAdd1_16().
51 * This is the workhorse of modular exponentiation. lbnMulN1_16() is
52 * also used a fair bit, although not as much and it's defined in terms
53 * of lbnMulAdd1_16 if that has a custom version. lbnMulSub1_16 and
54 * lbnDiv21_16 are used in the usual division and remainder finding.
55 * (Not the Montgomery reduction used in modular exponentiation, though.)
56 * Once you have lbnMulAdd1_16 defined, writing the other two should
57 * be pretty easy. (Just make sure you get the sign of the subtraction
58 * in lbnMulSub1_16 right - it's dest = dest - source * k.)
59 *
60 * The only definitions that absolutely need a double-word (BNWORD32)
61 * type are lbnMulAdd1_16 and lbnMulSub1_16; if those are provided,
62 * the rest follows. lbnDiv21_16, however, is a lot slower unless you
63 * have them, and lbnModQ_16 takes after it. That one is used quite a
64 * bit for prime sieving.
65 */
66
67 #ifndef HAVE_CONFIG_H
68 #define HAVE_CONFIG_H 0
69 #endif
70 #if HAVE_CONFIG_H
71 #include "bnconfig.h"
72 #endif
73
74 /*
75 * Some compilers complain about #if FOO if FOO isn't defined,
76 * so do the ANSI-mandated thing explicitly...
77 */
78 #ifndef NO_ASSERT_H
79 #define NO_ASSERT_H 0
80 #endif
81 #ifndef NO_STRING_H
82 #define NO_STRING_H 0
83 #endif
84 #ifndef HAVE_STRINGS_H
85 #define HAVE_STRINGS_H 0
86 #endif
87
88 #if !NO_ASSERT_H
89 #include <assert.h>
90 #else
91 #define assert(x) (void)0
92 #endif
93
94 #if !NO_STRING_H
95 #include <string.h> /* For memcpy */
96 #elif HAVE_STRINGS_H
97 #include <strings.h>
98 #endif
99
100 #include "lbn.h"
101 #include "lbn16.h"
102 #include "lbnmem.h"
103
104 #include "kludge.h"
105
106 #ifndef BNWORD16
107 #error 16-bit bignum library requires a 16-bit data type
108 #endif
109
110 /* If this is defined, include bnYield() calls */
111 #if BNYIELD
112 extern int (*bnYield)(void); /* From bn.c */
113 #endif
114
115 /*
116 * Most of the multiply (and Montgomery reduce) routines use an outer
117 * loop that iterates over one of the operands - a so-called operand
118 * scanning approach. One big advantage of this is that the assembly
119 * support routines are simpler. The loops can be rearranged to have
120 * an outer loop that iterates over the product, a so-called product
121 * scanning approach. This has the advantage of writing less data
122 * and doing fewer adds to memory, so is supposedly faster. Some
123 * code has been written using a product-scanning approach, but
124 * it appears to be slower, so it is turned off by default. Some
125 * experimentation would be appreciated.
126 *
127 * (The code is also annoying to get right and not very well commented,
128 * one of my pet peeves about math libraries. I'm sorry.)
129 */
130 #ifndef PRODUCT_SCAN
131 #define PRODUCT_SCAN 0
132 #endif
133
134 /*
135 * Copy an array of words. <Marvin mode on> Thrilling, isn't it? </Marvin>
136 * This is a good example of how the byte offsets and BIGLITTLE() macros work.
137 * Another alternative would have been
138 * memcpy(dest BIG(-len), src BIG(-len), len*sizeof(BNWORD16)), but I find that
139 * putting operators into conditional macros is confusing.
140 */
141 #ifndef lbnCopy_16
142 void
lbnCopy_16(BNWORD16 * dest,BNWORD16 const * src,unsigned len)143 lbnCopy_16(BNWORD16 *dest, BNWORD16 const *src, unsigned len)
144 {
145 memcpy(BIGLITTLE(dest-len,dest), BIGLITTLE(src-len,src),
146 len * sizeof(*src));
147 }
148 #endif /* !lbnCopy_16 */
149
150 /*
151 * Fill n words with zero. This does it manually rather than calling
152 * memset because it can assume alignment to make things faster while
153 * memset can't. Note how big-endian numbers are naturally addressed
154 * using predecrement, while little-endian is postincrement.
155 */
156 #ifndef lbnZero_16
157 void
lbnZero_16(BNWORD16 * num,unsigned len)158 lbnZero_16(BNWORD16 *num, unsigned len)
159 {
160 while (len--)
161 BIGLITTLE(*--num,*num++) = 0;
162 }
163 #endif /* !lbnZero_16 */
164
165 /*
166 * Negate an array of words.
167 * Negation is subtraction from zero. Negating low-order words
168 * entails doing nothing until a non-zero word is hit. Once that
169 * is negated, a borrow is generated and never dies until the end
170 * of the number is hit. Negation with borrow, -x-1, is the same as ~x.
171 * Repeat that until the end of the number.
172 *
173 * Doesn't return borrow out because that's pretty useless - it's
174 * always set unless the input is 0, which is easy to notice in
175 * normalized form.
176 */
177 #ifndef lbnNeg_16
178 void
lbnNeg_16(BNWORD16 * num,unsigned len)179 lbnNeg_16(BNWORD16 *num, unsigned len)
180 {
181 assert(len);
182
183 /* Skip low-order zero words */
184 while (BIGLITTLE(*--num,*num) == 0) {
185 if (!--len)
186 return;
187 LITTLE(num++;)
188 }
189 /* Negate the lowest-order non-zero word */
190 *num = -*num;
191 /* Complement all the higher-order words */
192 while (--len) {
193 BIGLITTLE(--num,++num);
194 *num = ~*num;
195 }
196 }
197 #endif /* !lbnNeg_16 */
198
199
200 /*
201 * lbnAdd1_16: add the single-word "carry" to the given number.
202 * Used for minor increments and propagating the carry after
203 * adding in a shorter bignum.
204 *
205 * Technique: If we have a double-width word, presumably the compiler
206 * can add using its carry in inline code, so we just use a larger
207 * accumulator to compute the carry from the first addition.
208 * If not, it's more complex. After adding the first carry, which may
209 * be > 1, compare the sum and the carry. If the sum wraps (causing a
210 * carry out from the addition), the result will be less than each of the
211 * inputs, since the wrap subtracts a number (2^16) which is larger than
212 * the other input can possibly be. If the sum is >= the carry input,
213 * return success immediately.
214 * In either case, if there is a carry, enter a loop incrementing words
215 * until one does not wrap. Since we are adding 1 each time, the wrap
216 * will be to 0 and we can test for equality.
217 */
218 #ifndef lbnAdd1_16 /* If defined, it's provided as an asm subroutine */
219 #ifdef BNWORD32
220 BNWORD16
lbnAdd1_16(BNWORD16 * num,unsigned len,BNWORD16 carry)221 lbnAdd1_16(BNWORD16 *num, unsigned len, BNWORD16 carry)
222 {
223 BNWORD32 t;
224 assert(len > 0); /* Alternative: if (!len) return carry */
225
226 t = (BNWORD32)BIGLITTLE(*--num,*num) + carry;
227 BIGLITTLE(*num,*num++) = (BNWORD16)t;
228 if ((t >> 16) == 0)
229 return 0;
230 while (--len) {
231 if (++BIGLITTLE(*--num,*num++) != 0)
232 return 0;
233 }
234 return 1;
235 }
236 #else /* no BNWORD32 */
237 BNWORD16
lbnAdd1_16(BNWORD16 * num,unsigned len,BNWORD16 carry)238 lbnAdd1_16(BNWORD16 *num, unsigned len, BNWORD16 carry)
239 {
240 assert(len > 0); /* Alternative: if (!len) return carry */
241
242 if ((BIGLITTLE(*--num,*num++) += carry) >= carry)
243 return 0;
244 while (--len) {
245 if (++BIGLITTLE(*--num,*num++) != 0)
246 return 0;
247 }
248 return 1;
249 }
250 #endif
251 #endif/* !lbnAdd1_16 */
252
253 /*
254 * lbnSub1_16: subtract the single-word "borrow" from the given number.
255 * Used for minor decrements and propagating the borrow after
256 * subtracting a shorter bignum.
257 *
258 * Technique: Similar to the add, above. If there is a double-length type,
259 * use that to generate the first borrow.
260 * If not, after subtracting the first borrow, which may be > 1, compare
261 * the difference and the *negative* of the carry. If the subtract wraps
262 * (causing a borrow out from the subtraction), the result will be at least
263 * as large as -borrow. If the result < -borrow, then no borrow out has
264 * appeared and we may return immediately, except when borrow == 0. To
265 * deal with that case, use the identity that -x = ~x+1, and instead of
266 * comparing < -borrow, compare for <= ~borrow.
267 * Either way, if there is a borrow out, enter a loop decrementing words
268 * until a non-zero word is reached.
269 *
270 * Note the cast of ~borrow to (BNWORD16). If the size of an int is larger
271 * than BNWORD16, C rules say the number is expanded for the arithmetic, so
272 * the inversion will be done on an int and the value won't be quite what
273 * is expected.
274 */
275 #ifndef lbnSub1_16 /* If defined, it's provided as an asm subroutine */
276 #ifdef BNWORD32
277 BNWORD16
lbnSub1_16(BNWORD16 * num,unsigned len,BNWORD16 borrow)278 lbnSub1_16(BNWORD16 *num, unsigned len, BNWORD16 borrow)
279 {
280 BNWORD32 t;
281 assert(len > 0); /* Alternative: if (!len) return borrow */
282
283 t = (BNWORD32)BIGLITTLE(*--num,*num) - borrow;
284 BIGLITTLE(*num,*num++) = (BNWORD16)t;
285 if ((t >> 16) == 0)
286 return 0;
287 while (--len) {
288 if ((BIGLITTLE(*--num,*num++))-- != 0)
289 return 0;
290 }
291 return 1;
292 }
293 #else /* no BNWORD32 */
294 BNWORD16
lbnSub1_16(BNWORD16 * num,unsigned len,BNWORD16 borrow)295 lbnSub1_16(BNWORD16 *num, unsigned len, BNWORD16 borrow)
296 {
297 assert(len > 0); /* Alternative: if (!len) return borrow */
298
299 if ((BIGLITTLE(*--num,*num++) -= borrow) <= (BNWORD16)~borrow)
300 return 0;
301 while (--len) {
302 if ((BIGLITTLE(*--num,*num++))-- != 0)
303 return 0;
304 }
305 return 1;
306 }
307 #endif
308 #endif /* !lbnSub1_16 */
309
310 /*
311 * lbnAddN_16: add two bignums of the same length, returning the carry (0 or 1).
312 * One of the building blocks, along with lbnAdd1, of adding two bignums of
313 * differing lengths.
314 *
315 * Technique: Maintain a word of carry. If there is no double-width type,
316 * use the same technique as in lbnAdd1, above, to maintain the carry by
317 * comparing the inputs. Adding the carry sources is used as an OR operator;
318 * at most one of the two comparisons can possibly be true. The first can
319 * only be true if carry == 1 and x, the result, is 0. In that case the
320 * second can't possibly be true.
321 */
322 #ifndef lbnAddN_16
323 #ifdef BNWORD32
324 BNWORD16
lbnAddN_16(BNWORD16 * num1,BNWORD16 const * num2,unsigned len)325 lbnAddN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len)
326 {
327 BNWORD32 t;
328
329 assert(len > 0);
330
331 t = (BNWORD32)BIGLITTLE(*--num1,*num1) + BIGLITTLE(*--num2,*num2++);
332 BIGLITTLE(*num1,*num1++) = (BNWORD16)t;
333 while (--len) {
334 t = (BNWORD32)BIGLITTLE(*--num1,*num1) +
335 (BNWORD32)BIGLITTLE(*--num2,*num2++) + (t >> 16);
336 BIGLITTLE(*num1,*num1++) = (BNWORD16)t;
337 }
338
339 return (BNWORD16)(t>>16);
340 }
341 #else /* no BNWORD32 */
342 BNWORD16
lbnAddN_16(BNWORD16 * num1,BNWORD16 const * num2,unsigned len)343 lbnAddN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len)
344 {
345 BNWORD16 x, carry = 0;
346
347 assert(len > 0); /* Alternative: change loop to test at start */
348
349 do {
350 x = BIGLITTLE(*--num2,*num2++);
351 carry = (x += carry) < carry;
352 carry += (BIGLITTLE(*--num1,*num1++) += x) < x;
353 } while (--len);
354
355 return carry;
356 }
357 #endif
358 #endif /* !lbnAddN_16 */
359
360 /*
361 * lbnSubN_16: add two bignums of the same length, returning the carry (0 or 1).
362 * One of the building blocks, along with subn1, of subtracting two bignums of
363 * differing lengths.
364 *
365 * Technique: If no double-width type is availble, maintain a word of borrow.
366 * First, add the borrow to the subtrahend (did you have to learn all those
367 * awful words in elementary school, too?), and if it overflows, set the
368 * borrow again. Then subtract the modified subtrahend from the next word
369 * of input, using the same technique as in subn1, above.
370 * Adding the borrows is used as an OR operator; at most one of the two
371 * comparisons can possibly be true. The first can only be true if
372 * borrow == 1 and x, the result, is 0. In that case the second can't
373 * possibly be true.
374 *
375 * In the double-word case, (BNWORD16)-(t>>16) is subtracted, rather than
376 * adding t>>16, because the shift would need to sign-extend and that's
377 * not guaranteed to happen in ANSI C, even with signed types.
378 */
379 #ifndef lbnSubN_16
380 #ifdef BNWORD32
381 BNWORD16
lbnSubN_16(BNWORD16 * num1,BNWORD16 const * num2,unsigned len)382 lbnSubN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len)
383 {
384 BNWORD32 t;
385
386 assert(len > 0);
387
388 t = (BNWORD32)BIGLITTLE(*--num1,*num1) - BIGLITTLE(*--num2,*num2++);
389 BIGLITTLE(*num1,*num1++) = (BNWORD16)t;
390
391 while (--len) {
392 t = (BNWORD32)BIGLITTLE(*--num1,*num1) -
393 (BNWORD32)BIGLITTLE(*--num2,*num2++) - (BNWORD16)-(t >> 16);
394 BIGLITTLE(*num1,*num1++) = (BNWORD16)t;
395 }
396
397 return -(BNWORD16)(t>>16);
398 }
399 #else
400 BNWORD16
lbnSubN_16(BNWORD16 * num1,BNWORD16 const * num2,unsigned len)401 lbnSubN_16(BNWORD16 *num1, BNWORD16 const *num2, unsigned len)
402 {
403 BNWORD16 x, borrow = 0;
404
405 assert(len > 0); /* Alternative: change loop to test at start */
406
407 do {
408 x = BIGLITTLE(*--num2,*num2++);
409 borrow = (x += borrow) < borrow;
410 borrow += (BIGLITTLE(*--num1,*num1++) -= x) > (BNWORD16)~x;
411 } while (--len);
412
413 return borrow;
414 }
415 #endif
416 #endif /* !lbnSubN_16 */
417
418 #ifndef lbnCmp_16
419 /*
420 * lbnCmp_16: compare two bignums of equal length, returning the sign of
421 * num1 - num2. (-1, 0 or +1).
422 *
423 * Technique: Change the little-endian pointers to big-endian pointers
424 * and compare from the most-significant end until a difference if found.
425 * When it is, figure out the sign of the difference and return it.
426 */
427 int
lbnCmp_16(BNWORD16 const * num1,BNWORD16 const * num2,unsigned len)428 lbnCmp_16(BNWORD16 const *num1, BNWORD16 const *num2, unsigned len)
429 {
430 BIGLITTLE(num1 -= len, num1 += len);
431 BIGLITTLE(num2 -= len, num2 += len);
432
433 while (len--) {
434 if (BIGLITTLE(*num1++ != *num2++, *--num1 != *--num2)) {
435 if (BIGLITTLE(num1[-1] < num2[-1], *num1 < *num2))
436 return -1;
437 else
438 return 1;
439 }
440 }
441 return 0;
442 }
443 #endif /* !lbnCmp_16 */
444
445 /*
446 * mul16_ppmmaa(ph,pl,x,y,a,b) is an optional routine that
447 * computes (ph,pl) = x * y + a + b. mul16_ppmma and mul16_ppmm
448 * are simpler versions. If you want to be lazy, all of these
449 * can be defined in terms of the others, so here we create any
450 * that have not been defined in terms of the ones that have been.
451 */
452
453 /* Define ones with fewer a's in terms of ones with more a's */
454 #if !defined(mul16_ppmma) && defined(mul16_ppmmaa)
455 #define mul16_ppmma(ph,pl,x,y,a) mul16_ppmmaa(ph,pl,x,y,a,0)
456 #endif
457
458 #if !defined(mul16_ppmm) && defined(mul16_ppmma)
459 #define mul16_ppmm(ph,pl,x,y) mul16_ppmma(ph,pl,x,y,0)
460 #endif
461
462 /*
463 * Use this definition to test the mul16_ppmm-based operations on machines
464 * that do not provide mul16_ppmm. Change the final "0" to a "1" to
465 * enable it.
466 */
467 #if !defined(mul16_ppmm) && defined(BNWORD32) && 0 /* Debugging */
468 #define mul16_ppmm(ph,pl,x,y) \
469 ({BNWORD32 _ = (BNWORD32)(x)*(y); (pl) = _; (ph) = _>>16;})
470 #endif
471
472 #if defined(mul16_ppmm) && !defined(mul16_ppmma)
473 #define mul16_ppmma(ph,pl,x,y,a) \
474 (mul16_ppmm(ph,pl,x,y), (ph) += ((pl) += (a)) < (a))
475 #endif
476
477 #if defined(mul16_ppmma) && !defined(mul16_ppmmaa)
478 #define mul16_ppmmaa(ph,pl,x,y,a,b) \
479 (mul16_ppmma(ph,pl,x,y,a), (ph) += ((pl) += (b)) < (b))
480 #endif
481
482 /*
483 * lbnMulN1_16: Multiply an n-word input by a 1-word input and store the
484 * n+1-word product. This uses either the mul16_ppmm and mul16_ppmma
485 * macros, or C multiplication with the BNWORD32 type. This uses mul16_ppmma
486 * if available, assuming you won't bother defining it unless you can do
487 * better than the normal multiplication.
488 */
489 #ifndef lbnMulN1_16
490 #ifdef lbnMulAdd1_16 /* If we have this asm primitive, use it. */
491 void
lbnMulN1_16(BNWORD16 * out,BNWORD16 const * in,unsigned len,BNWORD16 k)492 lbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
493 {
494 lbnZero_16(out, len);
495 BIGLITTLE(*(out-len),*(out+len)) = lbnMulAdd1_16(out, in, len, k);
496 }
497 #elif defined(mul16_ppmm)
498 void
lbnMulN1_16(BNWORD16 * out,BNWORD16 const * in,unsigned len,BNWORD16 k)499 lbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
500 {
501 BNWORD16 carry, carryin;
502
503 assert(len > 0);
504
505 BIG(--out;--in;);
506 mul16_ppmm(carry, *out, *in, k);
507 LITTLE(out++;in++;)
508
509 while (--len) {
510 BIG(--out;--in;)
511 carryin = carry;
512 mul16_ppmma(carry, *out, *in, k, carryin);
513 LITTLE(out++;in++;)
514 }
515 BIGLITTLE(*--out,*out) = carry;
516 }
517 #elif defined(BNWORD32)
518 void
lbnMulN1_16(BNWORD16 * out,BNWORD16 const * in,unsigned len,BNWORD16 k)519 lbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
520 {
521 BNWORD32 p;
522
523 assert(len > 0);
524
525 p = (BNWORD32)BIGLITTLE(*--in,*in++) * k;
526 BIGLITTLE(*--out,*out++) = (BNWORD16)p;
527
528 while (--len) {
529 p = (BNWORD32)BIGLITTLE(*--in,*in++) * k + (BNWORD16)(p >> 16);
530 BIGLITTLE(*--out,*out++) = (BNWORD16)p;
531 }
532 BIGLITTLE(*--out,*out) = (BNWORD16)(p >> 16);
533 }
534 #else
535 #error No 16x16 -> 32 multiply available for 16-bit bignum package
536 #endif
537 #endif /* lbnMulN1_16 */
538
539 /*
540 * lbnMulAdd1_16: Multiply an n-word input by a 1-word input and add the
541 * low n words of the product to the destination. *Returns the n+1st word
542 * of the product.* (That turns out to be more convenient than adding
543 * it into the destination and dealing with a possible unit carry out
544 * of *that*.) This uses either the mul16_ppmma and mul16_ppmmaa macros,
545 * or C multiplication with the BNWORD32 type.
546 *
547 * If you're going to write assembly primitives, this is the one to
548 * start with. It is by far the most commonly called function.
549 */
550 #ifndef lbnMulAdd1_16
551 #if defined(mul16_ppmm)
552 BNWORD16
lbnMulAdd1_16(BNWORD16 * out,BNWORD16 const * in,unsigned len,BNWORD16 k)553 lbnMulAdd1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
554 {
555 BNWORD16 prod, carry, carryin;
556
557 assert(len > 0);
558
559 BIG(--out;--in;);
560 carryin = *out;
561 mul16_ppmma(carry, *out, *in, k, carryin);
562 LITTLE(out++;in++;)
563
564 while (--len) {
565 BIG(--out;--in;);
566 carryin = carry;
567 mul16_ppmmaa(carry, prod, *in, k, carryin, *out);
568 *out = prod;
569 LITTLE(out++;in++;)
570 }
571
572 return carry;
573 }
574 #elif defined(BNWORD32)
575 BNWORD16
lbnMulAdd1_16(BNWORD16 * out,BNWORD16 const * in,unsigned len,BNWORD16 k)576 lbnMulAdd1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
577 {
578 BNWORD32 p;
579
580 assert(len > 0);
581
582 p = (BNWORD32)BIGLITTLE(*--in,*in++) * k + BIGLITTLE(*--out,*out);
583 BIGLITTLE(*out,*out++) = (BNWORD16)p;
584
585 while (--len) {
586 p = (BNWORD32)BIGLITTLE(*--in,*in++) * k +
587 (BNWORD16)(p >> 16) + BIGLITTLE(*--out,*out);
588 BIGLITTLE(*out,*out++) = (BNWORD16)p;
589 }
590
591 return (BNWORD16)(p >> 16);
592 }
593 #else
594 #error No 16x16 -> 32 multiply available for 16-bit bignum package
595 #endif
596 #endif /* lbnMulAdd1_16 */
597
598 /*
599 * lbnMulSub1_16: Multiply an n-word input by a 1-word input and subtract the
600 * n-word product from the destination. Returns the n+1st word of the product.
601 * This uses either the mul16_ppmm and mul16_ppmma macros, or
602 * C multiplication with the BNWORD32 type.
603 *
604 * This is rather uglier than adding, but fortunately it's only used in
605 * division which is not used too heavily.
606 */
607 #ifndef lbnMulSub1_16
608 #if defined(mul16_ppmm)
609 BNWORD16
lbnMulSub1_16(BNWORD16 * out,BNWORD16 const * in,unsigned len,BNWORD16 k)610 lbnMulSub1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
611 {
612 BNWORD16 prod, carry, carryin;
613
614 assert(len > 0);
615
616 BIG(--in;)
617 mul16_ppmm(carry, prod, *in, k);
618 LITTLE(in++;)
619 carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD16)~prod;
620
621 while (--len) {
622 BIG(--in;);
623 carryin = carry;
624 mul16_ppmma(carry, prod, *in, k, carryin);
625 LITTLE(in++;)
626 carry += (BIGLITTLE(*--out,*out++) -= prod) > (BNWORD16)~prod;
627 }
628
629 return carry;
630 }
631 #elif defined(BNWORD32)
632 BNWORD16
lbnMulSub1_16(BNWORD16 * out,BNWORD16 const * in,unsigned len,BNWORD16 k)633 lbnMulSub1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k)
634 {
635 BNWORD32 p;
636 BNWORD16 carry, t;
637
638 assert(len > 0);
639
640 p = (BNWORD32)BIGLITTLE(*--in,*in++) * k;
641 t = BIGLITTLE(*--out,*out);
642 carry = (BNWORD16)(p>>16) + ((BIGLITTLE(*out,*out++)=t-(BNWORD16)p) > t);
643
644 while (--len) {
645 p = (BNWORD32)BIGLITTLE(*--in,*in++) * k + carry;
646 t = BIGLITTLE(*--out,*out);
647 carry = (BNWORD16)(p>>16) +
648 ( (BIGLITTLE(*out,*out++)=t-(BNWORD16)p) > t );
649 }
650
651 return carry;
652 }
653 #else
654 #error No 16x16 -> 32 multiply available for 16-bit bignum package
655 #endif
656 #endif /* !lbnMulSub1_16 */
657
658 /*
659 * Shift n words left "shift" bits. 0 < shift < 16. Returns the
660 * carry, any bits shifted off the left-hand side (0 <= carry < 2^shift).
661 */
662 #ifndef lbnLshift_16
663 BNWORD16
lbnLshift_16(BNWORD16 * num,unsigned len,unsigned shift)664 lbnLshift_16(BNWORD16 *num, unsigned len, unsigned shift)
665 {
666 BNWORD16 x, carry;
667
668 assert(shift > 0);
669 assert(shift < 16);
670
671 carry = 0;
672 while (len--) {
673 BIG(--num;)
674 x = *num;
675 *num = (x<<shift) | carry;
676 LITTLE(num++;)
677 carry = x >> (16-shift);
678 }
679 return carry;
680 }
681 #endif /* !lbnLshift_16 */
682
683 /*
684 * An optimized version of the above, for shifts of 1.
685 * Some machines can use add-with-carry tricks for this.
686 */
687 #ifndef lbnDouble_16
688 BNWORD16
lbnDouble_16(BNWORD16 * num,unsigned len)689 lbnDouble_16(BNWORD16 *num, unsigned len)
690 {
691 BNWORD16 x, carry;
692
693 carry = 0;
694 while (len--) {
695 BIG(--num;)
696 x = *num;
697 *num = (x<<1) | carry;
698 LITTLE(num++;)
699 carry = x >> (16-1);
700 }
701 return carry;
702 }
703 #endif /* !lbnDouble_16 */
704
705 /*
706 * Shift n words right "shift" bits. 0 < shift < 16. Returns the
707 * carry, any bits shifted off the right-hand side (0 <= carry < 2^shift).
708 */
709 #ifndef lbnRshift_16
710 BNWORD16
lbnRshift_16(BNWORD16 * num,unsigned len,unsigned shift)711 lbnRshift_16(BNWORD16 *num, unsigned len, unsigned shift)
712 {
713 BNWORD16 x, carry = 0;
714
715 assert(shift > 0);
716 assert(shift < 16);
717
718 BIGLITTLE(num -= len, num += len);
719
720 while (len--) {
721 LITTLE(--num;)
722 x = *num;
723 *num = (x>>shift) | carry;
724 BIG(num++;)
725 carry = x << (16-shift);
726 }
727 return carry >> (16-shift);
728 }
729 #endif /* !lbnRshift_16 */
730
731 /*
732 * Multiply two numbers of the given lengths. prod and num2 may overlap,
733 * provided that the low len1 bits of prod are free. (This corresponds
734 * nicely to the place the result is returned from lbnMontReduce_16.)
735 *
736 * TODO: Use Karatsuba multiply. The overlap constraints may have
737 * to get rewhacked.
738 */
739 #ifndef lbnMul_16
740 void
lbnMul_16(BNWORD16 * prod,BNWORD16 const * num1,unsigned len1,BNWORD16 const * num2,unsigned len2)741 lbnMul_16(BNWORD16 *prod, BNWORD16 const *num1, unsigned len1,
742 BNWORD16 const *num2, unsigned len2)
743 {
744 /* Special case of zero */
745 if (!len1 || !len2) {
746 lbnZero_16(prod, len1+len2);
747 return;
748 }
749
750 /* Multiply first word */
751 lbnMulN1_16(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
752
753 /*
754 * Add in subsequent words, storing the most significant word,
755 * which is new each time.
756 */
757 while (--len2) {
758 BIGLITTLE(--prod,prod++);
759 BIGLITTLE(*(prod-len1-1),*(prod+len1)) =
760 lbnMulAdd1_16(prod, num1, len1, BIGLITTLE(*--num2,*num2++));
761 }
762 }
763 #endif /* !lbnMul_16 */
764
765 /*
766 * lbnMulX_16 is a square multiply - both inputs are the same length.
767 * It's normally just a macro wrapper around the general multiply,
768 * but might be implementable in assembly more efficiently (such as
769 * when product scanning).
770 */
771 #ifndef lbnMulX_16
772 #if defined(BNWORD32) && PRODUCT_SCAN
773 /*
774 * Test code to see whether product scanning is any faster. It seems
775 * to make the C code slower, so PRODUCT_SCAN is not defined.
776 */
777 static void
lbnMulX_16(BNWORD16 * prod,BNWORD16 const * num1,BNWORD16 const * num2,unsigned len)778 lbnMulX_16(BNWORD16 *prod, BNWORD16 const *num1, BNWORD16 const *num2,
779 unsigned len)
780 {
781 BNWORD32 x, y;
782 BNWORD16 const *p1, *p2;
783 unsigned carry;
784 unsigned i, j;
785
786 /* Special case of zero */
787 if (!len)
788 return;
789
790 x = (BNWORD32)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
791 BIGLITTLE(*--prod, *prod++) = (BNWORD16)x;
792 x >>= 16;
793
794 for (i = 1; i < len; i++) {
795 carry = 0;
796 p1 = num1;
797 p2 = BIGLITTLE(num2-i-1,num2+i+1);
798 for (j = 0; j <= i; j++) {
799 BIG(y = (BNWORD32)*--p1 * *p2++;)
800 LITTLE(y = (BNWORD32)*p1++ * *--p2;)
801 x += y;
802 carry += (x < y);
803 }
804 BIGLITTLE(*--prod,*prod++) = (BNWORD16)x;
805 x = (x >> 16) | (BNWORD32)carry << 16;
806 }
807 for (i = 1; i < len; i++) {
808 carry = 0;
809 p1 = BIGLITTLE(num1-i,num1+i);
810 p2 = BIGLITTLE(num2-len,num2+len);
811 for (j = i; j < len; j++) {
812 BIG(y = (BNWORD32)*--p1 * *p2++;)
813 LITTLE(y = (BNWORD32)*p1++ * *--p2;)
814 x += y;
815 carry += (x < y);
816 }
817 BIGLITTLE(*--prod,*prod++) = (BNWORD16)x;
818 x = (x >> 16) | (BNWORD32)carry << 16;
819 }
820
821 BIGLITTLE(*--prod,*prod) = (BNWORD16)x;
822 }
823 #else /* !defined(BNWORD32) || !PRODUCT_SCAN */
824 /* Default trivial macro definition */
825 #define lbnMulX_16(prod, num1, num2, len) lbnMul_16(prod, num1, len, num2, len)
826 #endif /* !defined(BNWORD32) || !PRODUCT_SCAN */
827 #endif /* !lbmMulX_16 */
828
829 #if !defined(lbnMontMul_16) && defined(BNWORD32) && PRODUCT_SCAN
830 /*
831 * Test code for product-scanning multiply. This seems to slow the C
832 * code down rather than speed it up.
833 * This does a multiply and Montgomery reduction together, using the
834 * same loops. The outer loop scans across the product, twice.
835 * The first pass computes the low half of the product and the
836 * Montgomery multipliers. These are stored in the product array,
837 * which contains no data as of yet. x and carry add up the columns
838 * and propagate carries forward.
839 *
840 * The second half multiplies the upper half, adding in the modulus
841 * times the Montgomery multipliers. The results of this multiply
842 * are stored.
843 */
844 static void
lbnMontMul_16(BNWORD16 * prod,BNWORD16 const * num1,BNWORD16 const * num2,BNWORD16 const * mod,unsigned len,BNWORD16 inv)845 lbnMontMul_16(BNWORD16 *prod, BNWORD16 const *num1, BNWORD16 const *num2,
846 BNWORD16 const *mod, unsigned len, BNWORD16 inv)
847 {
848 BNWORD32 x, y;
849 BNWORD16 const *p1, *p2, *pm;
850 BNWORD16 *pp;
851 BNWORD16 t;
852 unsigned carry;
853 unsigned i, j;
854
855 /* Special case of zero */
856 if (!len)
857 return;
858
859 /*
860 * This computes directly into the high half of prod, so just
861 * shift the pointer and consider prod only "len" elements long
862 * for the rest of the code.
863 */
864 BIGLITTLE(prod -= len, prod += len);
865
866 /* Pass 1 - compute Montgomery multipliers */
867 /* First iteration can have certain simplifications. */
868 x = (BNWORD32)BIGLITTLE(num1[-1] * num2[-1], num1[0] * num2[0]);
869 BIGLITTLE(prod[-1], prod[0]) = t = inv * (BNWORD16)x;
870 y = (BNWORD32)t * BIGLITTLE(mod[-1],mod[0]);
871 x += y;
872 /* Note: GCC 2.6.3 has a bug if you try to eliminate "carry" */
873 carry = (x < y);
874 assert((BNWORD16)x == 0);
875 x = x >> 16 | (BNWORD32)carry << 16;
876
877 for (i = 1; i < len; i++) {
878 carry = 0;
879 p1 = num1;
880 p2 = BIGLITTLE(num2-i-1,num2+i+1);
881 pp = prod;
882 pm = BIGLITTLE(mod-i-1,mod+i+1);
883 for (j = 0; j < i; j++) {
884 y = (BNWORD32)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
885 x += y;
886 carry += (x < y);
887 y = (BNWORD32)BIGLITTLE(*--pp * *pm++, *pp++ * *--pm);
888 x += y;
889 carry += (x < y);
890 }
891 y = (BNWORD32)BIGLITTLE(p1[-1] * p2[0], p1[0] * p2[-1]);
892 x += y;
893 carry += (x < y);
894 assert(BIGLITTLE(pp == prod-i, pp == prod+i));
895 BIGLITTLE(pp[-1], pp[0]) = t = inv * (BNWORD16)x;
896 assert(BIGLITTLE(pm == mod-1, pm == mod+1));
897 y = (BNWORD32)t * BIGLITTLE(pm[0],pm[-1]);
898 x += y;
899 carry += (x < y);
900 assert((BNWORD16)x == 0);
901 x = x >> 16 | (BNWORD32)carry << 16;
902 }
903
904 /* Pass 2 - compute reduced product and store */
905 for (i = 1; i < len; i++) {
906 carry = 0;
907 p1 = BIGLITTLE(num1-i,num1+i);
908 p2 = BIGLITTLE(num2-len,num2+len);
909 pm = BIGLITTLE(mod-i,mod+i);
910 pp = BIGLITTLE(prod-len,prod+len);
911 for (j = i; j < len; j++) {
912 y = (BNWORD32)BIGLITTLE(*--p1 * *p2++, *p1++ * *--p2);
913 x += y;
914 carry += (x < y);
915 y = (BNWORD32)BIGLITTLE(*--pm * *pp++, *pm++ * *--pp);
916 x += y;
917 carry += (x < y);
918 }
919 assert(BIGLITTLE(pm == mod-len, pm == mod+len));
920 assert(BIGLITTLE(pp == prod-i, pp == prod+i));
921 BIGLITTLE(pp[0],pp[-1]) = (BNWORD16)x;
922 x = (x >> 16) | (BNWORD32)carry << 16;
923 }
924
925 /* Last round of second half, simplified. */
926 BIGLITTLE(*(prod-len),*(prod+len-1)) = (BNWORD16)x;
927 carry = (x >> 16);
928
929 while (carry)
930 carry -= lbnSubN_16(prod, mod, len);
931 while (lbnCmp_16(prod, mod, len) >= 0)
932 (void)lbnSubN_16(prod, mod, len);
933 }
934 /* Suppress later definition */
935 #define lbnMontMul_16 lbnMontMul_16
936 #endif
937
938 #if !defined(lbnSquare_16) && defined(BNWORD32) && PRODUCT_SCAN
939 /*
940 * Trial code for product-scanning squaring. This seems to slow the C
941 * code down rather than speed it up.
942 */
943 void
lbnSquare_16(BNWORD16 * prod,BNWORD16 const * num,unsigned len)944 lbnSquare_16(BNWORD16 *prod, BNWORD16 const *num, unsigned len)
945 {
946 BNWORD32 x, y, z;
947 BNWORD16 const *p1, *p2;
948 unsigned carry;
949 unsigned i, j;
950
951 /* Special case of zero */
952 if (!len)
953 return;
954
955 /* Word 0 of product */
956 x = (BNWORD32)BIGLITTLE(num[-1] * num[-1], num[0] * num[0]);
957 BIGLITTLE(*--prod, *prod++) = (BNWORD16)x;
958 x >>= 16;
959
960 /* Words 1 through len-1 */
961 for (i = 1; i < len; i++) {
962 carry = 0;
963 y = 0;
964 p1 = num;
965 p2 = BIGLITTLE(num-i-1,num+i+1);
966 for (j = 0; j < (i+1)/2; j++) {
967 BIG(z = (BNWORD32)*--p1 * *p2++;)
968 LITTLE(z = (BNWORD32)*p1++ * *--p2;)
969 y += z;
970 carry += (y < z);
971 }
972 y += z = y;
973 carry += carry + (y < z);
974 if ((i & 1) == 0) {
975 assert(BIGLITTLE(--p1 == p2, p1 == --p2));
976 BIG(z = (BNWORD32)*p2 * *p2;)
977 LITTLE(z = (BNWORD32)*p1 * *p1;)
978 y += z;
979 carry += (y < z);
980 }
981 x += y;
982 carry += (x < y);
983 BIGLITTLE(*--prod,*prod++) = (BNWORD16)x;
984 x = (x >> 16) | (BNWORD32)carry << 16;
985 }
986 /* Words len through 2*len-2 */
987 for (i = 1; i < len; i++) {
988 carry = 0;
989 y = 0;
990 p1 = BIGLITTLE(num-i,num+i);
991 p2 = BIGLITTLE(num-len,num+len);
992 for (j = 0; j < (len-i)/2; j++) {
993 BIG(z = (BNWORD32)*--p1 * *p2++;)
994 LITTLE(z = (BNWORD32)*p1++ * *--p2;)
995 y += z;
996 carry += (y < z);
997 }
998 y += z = y;
999 carry += carry + (y < z);
1000 if ((len-i) & 1) {
1001 assert(BIGLITTLE(--p1 == p2, p1 == --p2));
1002 BIG(z = (BNWORD32)*p2 * *p2;)
1003 LITTLE(z = (BNWORD32)*p1 * *p1;)
1004 y += z;
1005 carry += (y < z);
1006 }
1007 x += y;
1008 carry += (x < y);
1009 BIGLITTLE(*--prod,*prod++) = (BNWORD16)x;
1010 x = (x >> 16) | (BNWORD32)carry << 16;
1011 }
1012
1013 /* Word 2*len-1 */
1014 BIGLITTLE(*--prod,*prod) = (BNWORD16)x;
1015 }
1016 /* Suppress later definition */
1017 #define lbnSquare_16 lbnSquare_16
1018 #endif
1019
1020 /*
1021 * Square a number, using optimized squaring to reduce the number of
1022 * primitive multiples that are executed. There may not be any
1023 * overlap of the input and output.
1024 *
1025 * Technique: Consider the partial products in the multiplication
1026 * of "abcde" by itself:
1027 *
1028 * a b c d e
1029 * * a b c d e
1030 * ==================
1031 * ae be ce de ee
1032 * ad bd cd dd de
1033 * ac bc cc cd ce
1034 * ab bb bc bd be
1035 * aa ab ac ad ae
1036 *
1037 * Note that everything above the main diagonal:
1038 * ae be ce de = (abcd) * e
1039 * ad bd cd = (abc) * d
1040 * ac bc = (ab) * c
1041 * ab = (a) * b
1042 *
1043 * is a copy of everything below the main diagonal:
1044 * de
1045 * cd ce
1046 * bc bd be
1047 * ab ac ad ae
1048 *
1049 * Thus, the sum is 2 * (off the diagonal) + diagonal.
1050 *
1051 * This is accumulated beginning with the diagonal (which
1052 * consist of the squares of the digits of the input), which is then
1053 * divided by two, the off-diagonal added, and multiplied by two
1054 * again. The low bit is simply a copy of the low bit of the
1055 * input, so it doesn't need special care.
1056 *
1057 * TODO: Merge the shift by 1 with the squaring loop.
1058 * TODO: Use Karatsuba. (a*W+b)^2 = a^2 * (W^2+W) + b^2 * (W+1) - (a-b)^2 * W.
1059 */
1060 #ifndef lbnSquare_16
1061 void
lbnSquare_16(BNWORD16 * prod,BNWORD16 const * num,unsigned len)1062 lbnSquare_16(BNWORD16 *prod, BNWORD16 const *num, unsigned len)
1063 {
1064 BNWORD16 t;
1065 BNWORD16 *prodx = prod; /* Working copy of the argument */
1066 BNWORD16 const *numx = num; /* Working copy of the argument */
1067 unsigned lenx = len; /* Working copy of the argument */
1068
1069 if (!len)
1070 return;
1071
1072 /* First, store all the squares */
1073 while (lenx--) {
1074 #ifdef mul16_ppmm
1075 BNWORD16 ph, pl;
1076 t = BIGLITTLE(*--numx,*numx++);
1077 mul16_ppmm(ph,pl,t,t);
1078 BIGLITTLE(*--prodx,*prodx++) = pl;
1079 BIGLITTLE(*--prodx,*prodx++) = ph;
1080 #elif defined(BNWORD32) /* use BNWORD32 */
1081 BNWORD32 p;
1082 t = BIGLITTLE(*--numx,*numx++);
1083 p = (BNWORD32)t * t;
1084 BIGLITTLE(*--prodx,*prodx++) = (BNWORD16)p;
1085 BIGLITTLE(*--prodx,*prodx++) = (BNWORD16)(p>>16);
1086 #else /* Use lbnMulN1_16 */
1087 t = BIGLITTLE(numx[-1],*numx);
1088 lbnMulN1_16(prodx, numx, 1, t);
1089 BIGLITTLE(--numx,numx++);
1090 BIGLITTLE(prodx -= 2, prodx += 2);
1091 #endif
1092 }
1093 /* Then, shift right 1 bit */
1094 (void)lbnRshift_16(prod, 2*len, 1);
1095
1096 /* Then, add in the off-diagonal sums */
1097 lenx = len;
1098 numx = num;
1099 prodx = prod;
1100 while (--lenx) {
1101 t = BIGLITTLE(*--numx,*numx++);
1102 BIGLITTLE(--prodx,prodx++);
1103 t = lbnMulAdd1_16(prodx, numx, lenx, t);
1104 lbnAdd1_16(BIGLITTLE(prodx-lenx,prodx+lenx), lenx+1, t);
1105 BIGLITTLE(--prodx,prodx++);
1106 }
1107
1108 /* Shift it back up */
1109 lbnDouble_16(prod, 2*len);
1110
1111 /* And set the low bit appropriately */
1112 BIGLITTLE(prod[-1],prod[0]) |= BIGLITTLE(num[-1],num[0]) & 1;
1113 }
1114 #endif /* !lbnSquare_16 */
1115
1116 /*
1117 * lbnNorm_16 - given a number, return a modified length such that the
1118 * most significant digit is non-zero. Zero-length input is okay.
1119 */
1120 #ifndef lbnNorm_16
1121 unsigned
lbnNorm_16(BNWORD16 const * num,unsigned len)1122 lbnNorm_16(BNWORD16 const *num, unsigned len)
1123 {
1124 BIGLITTLE(num -= len,num += len);
1125 while (len && BIGLITTLE(*num++,*--num) == 0)
1126 --len;
1127 return len;
1128 }
1129 #endif /* lbnNorm_16 */
1130
1131 /*
1132 * lbnBits_16 - return the number of significant bits in the array.
1133 * It starts by normalizing the array. Zero-length input is okay.
1134 * Then assuming there's anything to it, it fetches the high word,
1135 * generates a bit length by multiplying the word length by 16, and
1136 * subtracts off 16/2, 16/4, 16/8, ... bits if the high bits are clear.
1137 */
1138 #ifndef lbnBits_16
1139 unsigned
lbnBits_16(BNWORD16 const * num,unsigned len)1140 lbnBits_16(BNWORD16 const *num, unsigned len)
1141 {
1142 BNWORD16 t;
1143 unsigned i;
1144
1145 len = lbnNorm_16(num, len);
1146 if (len) {
1147 t = BIGLITTLE(*(num-len),*(num+(len-1)));
1148 assert(t);
1149 len *= 16;
1150 i = 16/2;
1151 do {
1152 if (t >> i)
1153 t >>= i;
1154 else
1155 len -= i;
1156 } while ((i /= 2) != 0);
1157 }
1158 return len;
1159 }
1160 #endif /* lbnBits_16 */
1161
1162 /*
1163 * If defined, use hand-rolled divide rather than compiler's native.
1164 * If the machine doesn't do it in line, the manual code is probably
1165 * faster, since it can assume normalization and the fact that the
1166 * quotient will fit into 16 bits, which a general 32-bit divide
1167 * in a compiler's run-time library can't do.
1168 */
1169 #ifndef BN_SLOW_DIVIDE_32
1170 /* Assume that divisors of more than thirty-two bits are slow */
1171 #define BN_SLOW_DIVIDE_32 (32 > 0x20)
1172 #endif
1173
1174 /*
1175 * Return (nh<<16|nl) % d, and place the quotient digit into *q.
1176 * It is guaranteed that nh < d, and that d is normalized (with its high
1177 * bit set). If we have a double-width type, it's easy. If not, ooh,
1178 * yuk!
1179 */
1180 #ifndef lbnDiv21_16
1181 #if defined(BNWORD32) && !BN_SLOW_DIVIDE_32
1182 BNWORD16
lbnDiv21_16(BNWORD16 * q,BNWORD16 nh,BNWORD16 nl,BNWORD16 d)1183 lbnDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d)
1184 {
1185 BNWORD32 n = (BNWORD32)nh << 16 | nl;
1186
1187 /* Divisor must be normalized */
1188 assert(d >> (16-1) == 1);
1189
1190 *q = n / d;
1191 return n % d;
1192 }
1193 #else
1194 /*
1195 * This is where it gets ugly.
1196 *
1197 * Do the division in two halves, using Algorithm D from section 4.3.1
1198 * of Knuth. Note Theorem B from that section, that the quotient estimate
1199 * is never more than the true quotient, and is never more than two
1200 * too low.
1201 *
1202 * The mapping onto conventional long division is (everything a half word):
1203 * _____________qh___ql_
1204 * dh dl ) nh.h nh.l nl.h nl.l
1205 * - (qh * d)
1206 * -----------
1207 * rrrr rrrr nl.l
1208 * - (ql * d)
1209 * -----------
1210 * rrrr rrrr
1211 *
1212 * The implicit 3/2-digit d*qh and d*ql subtractors are computed this way:
1213 * First, estimate a q digit so that nh/dh works. Subtracting qh*dh from
1214 * the (nh.h nh.l) list leaves a 1/2-word remainder r. Then compute the
1215 * low part of the subtractor, qh * dl. This also needs to be subtracted
1216 * from (nh.h nh.l nl.h) to get the final remainder. So we take the
1217 * remainder, which is (nh.h nh.l) - qh*dl, shift it and add in nl.h, and
1218 * try to subtract qh * dl from that. Since the remainder is 1/2-word
1219 * long, shifting and adding nl.h results in a single word r.
1220 * It is possible that the remainder we're working with, r, is less than
1221 * the product qh * dl, if we estimated qh too high. The estimation
1222 * technique can produce a qh that is too large (never too small), leading
1223 * to r which is too small. In that case, decrement the digit qh, add
1224 * shifted dh to r (to correct for that error), and subtract dl from the
1225 * product we're comparing r with. That's the "correct" way to do it, but
1226 * just adding dl to r instead of subtracting it from the product is
1227 * equivalent and a lot simpler. You just have to watch out for overflow.
1228 *
1229 * The process is repeated with (rrrr rrrr nl.l) for the low digit of the
1230 * quotient ql.
1231 *
1232 * The various uses of 16/2 for shifts are because of the note about
1233 * automatic editing of this file at the very top of the file.
1234 */
1235 #define highhalf(x) ( (x) >> 16/2 )
1236 #define lowhalf(x) ( (x) & (((BNWORD16)1 << 16/2)-1) )
1237 BNWORD16
lbnDiv21_16(BNWORD16 * q,BNWORD16 nh,BNWORD16 nl,BNWORD16 d)1238 lbnDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d)
1239 {
1240 BNWORD16 dh = highhalf(d), dl = lowhalf(d);
1241 BNWORD16 qh, ql, prod, r;
1242
1243 /* Divisor must be normalized */
1244 assert((d >> (16-1)) == 1);
1245
1246 /* Do first half-word of division */
1247 qh = nh / dh;
1248 r = nh % dh;
1249 prod = qh * dl;
1250
1251 /*
1252 * Add next half-word of numerator to remainder and correct.
1253 * qh may be up to two too large.
1254 */
1255 r = (r << (16/2)) | highhalf(nl);
1256 if (r < prod) {
1257 --qh; r += d;
1258 if (r >= d && r < prod) {
1259 --qh; r += d;
1260 }
1261 }
1262 r -= prod;
1263
1264 /* Do second half-word of division */
1265 ql = r / dh;
1266 r = r % dh;
1267 prod = ql * dl;
1268
1269 r = (r << (16/2)) | lowhalf(nl);
1270 if (r < prod) {
1271 --ql; r += d;
1272 if (r >= d && r < prod) {
1273 --ql; r += d;
1274 }
1275 }
1276 r -= prod;
1277
1278 *q = (qh << (16/2)) | ql;
1279
1280 return r;
1281 }
1282 #endif
1283 #endif /* lbnDiv21_16 */
1284
1285
1286 /*
1287 * In the division functions, the dividend and divisor are referred to
1288 * as "n" and "d", which stand for "numerator" and "denominator".
1289 *
1290 * The quotient is (nlen-dlen+1) digits long. It may be overlapped with
1291 * the high (nlen-dlen) words of the dividend, but one extra word is needed
1292 * on top to hold the top word.
1293 */
1294
1295 /*
1296 * Divide an n-word number by a 1-word number, storing the remainder
1297 * and n-1 words of the n-word quotient. The high word is returned.
1298 * It IS legal for rem to point to the same address as n, and for
1299 * q to point one word higher.
1300 *
1301 * TODO: If BN_SLOW_DIVIDE_32, add a divnhalf_16 which uses 16-bit
1302 * dividends if the divisor is half that long.
1303 * TODO: Shift the dividend on the fly to avoid the last division and
1304 * instead have a remainder that needs shifting.
1305 * TODO: Use reciprocals rather than dividing.
1306 */
1307 #ifndef lbnDiv1_16
1308 BNWORD16
lbnDiv1_16(BNWORD16 * q,BNWORD16 * rem,BNWORD16 const * n,unsigned len,BNWORD16 d)1309 lbnDiv1_16(BNWORD16 *q, BNWORD16 *rem, BNWORD16 const *n, unsigned len,
1310 BNWORD16 d)
1311 {
1312 unsigned shift;
1313 unsigned xlen;
1314 BNWORD16 r;
1315 BNWORD16 qhigh;
1316
1317 assert(len > 0);
1318 assert(d);
1319
1320 if (len == 1) {
1321 r = *n;
1322 *rem = r%d;
1323 return r/d;
1324 }
1325
1326 shift = 0;
1327 r = d;
1328 xlen = 16/2;
1329 do {
1330 if (r >> xlen)
1331 r >>= xlen;
1332 else
1333 shift += xlen;
1334 } while ((xlen /= 2) != 0);
1335 assert((d >> (16-1-shift)) == 1);
1336 d <<= shift;
1337
1338 BIGLITTLE(q -= len-1,q += len-1);
1339 BIGLITTLE(n -= len,n += len);
1340
1341 r = BIGLITTLE(*n++,*--n);
1342 if (r < d) {
1343 qhigh = 0;
1344 } else {
1345 qhigh = r/d;
1346 r %= d;
1347 }
1348
1349 xlen = len;
1350 while (--xlen)
1351 r = lbnDiv21_16(BIGLITTLE(q++,--q), r, BIGLITTLE(*n++,*--n), d);
1352
1353 /*
1354 * Final correction for shift - shift the quotient up "shift"
1355 * bits, and merge in the extra bits of quotient. Then reduce
1356 * the final remainder mod the real d.
1357 */
1358 if (shift) {
1359 d >>= shift;
1360 qhigh = (qhigh << shift) | lbnLshift_16(q, len-1, shift);
1361 BIGLITTLE(q[-1],*q) |= r/d;
1362 r %= d;
1363 }
1364 *rem = r;
1365
1366 return qhigh;
1367 }
1368 #endif
1369
1370 /*
1371 * This function performs a "quick" modulus of a number with a divisor
1372 * d which is guaranteed to be at most sixteen bits, i.e. less than 65536.
1373 * This applies regardless of the word size the library is compiled with.
1374 *
1375 * This function is important to prime generation, for sieving.
1376 */
1377 #ifndef lbnModQ_16
1378 /* If there's a custom lbnMod21_16, no normalization needed */
1379 #ifdef lbnMod21_16
1380 unsigned
lbnModQ_16(BNWORD16 const * n,unsigned len,unsigned d)1381 lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d)
1382 {
1383 unsigned i, shift;
1384 BNWORD16 r;
1385
1386 assert(len > 0);
1387
1388 BIGLITTLE(n -= len,n += len);
1389
1390 /* Try using a compare to avoid the first divide */
1391 r = BIGLITTLE(*n++,*--n);
1392 if (r >= d)
1393 r %= d;
1394 while (--len)
1395 r = lbnMod21_16(r, BIGLITTLE(*n++,*--n), d);
1396
1397 return r;
1398 }
1399 #elif defined(BNWORD32) && !BN_SLOW_DIVIDE_32
1400 unsigned
lbnModQ_16(BNWORD16 const * n,unsigned len,unsigned d)1401 lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d)
1402 {
1403 BNWORD16 r;
1404
1405 if (!--len)
1406 return BIGLITTLE(n[-1],n[0]) % d;
1407
1408 BIGLITTLE(n -= len,n += len);
1409 r = BIGLITTLE(n[-1],n[0]);
1410
1411 do {
1412 r = (BNWORD16)((((BNWORD32)r<<16) | BIGLITTLE(*n++,*--n)) % d);
1413 } while (--len);
1414
1415 return r;
1416 }
1417 #elif 16 >= 0x20
1418 /*
1419 * If the single word size can hold 65535*65536, then this function
1420 * is avilable.
1421 */
1422 #ifndef highhalf
1423 #define highhalf(x) ( (x) >> 16/2 )
1424 #define lowhalf(x) ( (x) & ((1 << 16/2)-1) )
1425 #endif
1426 unsigned
lbnModQ_16(BNWORD16 const * n,unsigned len,unsigned d)1427 lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d)
1428 {
1429 BNWORD16 r, x;
1430
1431 BIGLITTLE(n -= len,n += len);
1432
1433 r = BIGLITTLE(*n++,*--n);
1434 while (--len) {
1435 x = BIGLITTLE(*n++,*--n);
1436 r = (r%d << 16/2) | highhalf(x);
1437 r = (r%d << 16/2) | lowhalf(x);
1438 }
1439
1440 return r%d;
1441 }
1442 #else
1443 /* Default case - use lbnDiv21_16 */
1444 unsigned
lbnModQ_16(BNWORD16 const * n,unsigned len,unsigned d)1445 lbnModQ_16(BNWORD16 const *n, unsigned len, unsigned d)
1446 {
1447 unsigned i, shift;
1448 BNWORD16 r;
1449 BNWORD16 q;
1450
1451 assert(len > 0);
1452
1453 shift = 0;
1454 r = d;
1455 i = 16;
1456 while (i /= 2) {
1457 if (r >> i)
1458 r >>= i;
1459 else
1460 shift += i;
1461 }
1462 assert(d >> (16-1-shift) == 1);
1463 d <<= shift;
1464
1465 BIGLITTLE(n -= len,n += len);
1466
1467 r = BIGLITTLE(*n++,*--n);
1468 if (r >= d)
1469 r %= d;
1470
1471 while (--len)
1472 r = lbnDiv21_16(&q, r, BIGLITTLE(*n++,*--n), d);
1473
1474 /*
1475 * Final correction for shift - shift the quotient up "shift"
1476 * bits, and merge in the extra bits of quotient. Then reduce
1477 * the final remainder mod the real d.
1478 */
1479 if (shift)
1480 r %= d >> shift;
1481
1482 return r;
1483 }
1484 #endif
1485 #endif /* lbnModQ_16 */
1486
1487 /*
1488 * Reduce n mod d and return the quotient. That is, find:
1489 * q = n / d;
1490 * n = n % d;
1491 * d is altered during the execution of this subroutine by normalizing it.
1492 * It must already have its most significant word non-zero; it is shifted
1493 * so its most significant bit is non-zero.
1494 *
1495 * The quotient q is nlen-dlen+1 words long. To make it possible to
1496 * overlap the quptient with the input (you can store it in the high dlen
1497 * words), the high word of the quotient is *not* stored, but is returned.
1498 * (If all you want is the remainder, you don't care about it, anyway.)
1499 *
1500 * This uses algorithm D from Knuth (4.3.1), except that we do binary
1501 * (shift) normalization of the divisor. WARNING: This is hairy!
1502 *
1503 * This function is used for some modular reduction, but it is not used in
1504 * the modular exponentiation loops; they use Montgomery form and the
1505 * corresponding, more efficient, Montgomery reduction. This code
1506 * is needed for the conversion to Montgomery form, however, so it
1507 * has to be here and it might as well be reasonably efficient.
1508 *
1509 * The overall operation is as follows ("top" and "up" refer to the
1510 * most significant end of the number; "bottom" and "down", the least):
1511 *
1512 * - Shift the divisor up until the most significant bit is set.
1513 * - Shift the dividend up the same amount. This will produce the
1514 * correct quotient, and the remainder can be recovered by shifting
1515 * it back down the same number of bits. This may produce an overflow
1516 * word, but the word is always strictly less than the most significant
1517 * divisor word.
1518 * - Estimate the first quotient digit qhat:
1519 * - First take the top two words (one of which is the overflow) of the
1520 * dividend and divide by the top word of the divisor:
1521 * qhat = (nh,nm)/dh. This qhat is >= the correct quotient digit
1522 * and, since dh is normalized, it is at most two over.
1523 * - Second, correct by comparing the top three words. If
1524 * (dh,dl) * qhat > (nh,nm,ml), decrease qhat and try again.
1525 * The second iteration can be simpler because there can't be a third.
1526 * The computation can be simplified by subtracting dh*qhat from
1527 * both sides, suitably shifted. This reduces the left side to
1528 * dl*qhat. On the right, (nh,nm)-dh*qhat is simply the
1529 * remainder r from (nh,nm)%dh, so the right is (r,nl).
1530 * This produces qhat that is almost always correct and at
1531 * most (prob ~ 2/2^16) one too high.
1532 * - Subtract qhat times the divisor (suitably shifted) from the dividend.
1533 * If there is a borrow, qhat was wrong, so decrement it
1534 * and add the divisor back in (once).
1535 * - Store the final quotient digit qhat in the quotient array q.
1536 *
1537 * Repeat the quotient digit computation for successive digits of the
1538 * quotient until the whole quotient has been computed. Then shift the
1539 * divisor and the remainder down to correct for the normalization.
1540 *
1541 * TODO: Special case 2-word divisors.
1542 * TODO: Use reciprocals rather than dividing.
1543 */
1544 #ifndef divn_16
1545 BNWORD16
lbnDiv_16(BNWORD16 * q,BNWORD16 * n,unsigned nlen,BNWORD16 * d,unsigned dlen)1546 lbnDiv_16(BNWORD16 *q, BNWORD16 *n, unsigned nlen, BNWORD16 *d, unsigned dlen)
1547 {
1548 BNWORD16 nh,nm,nl; /* Top three words of the dividend */
1549 BNWORD16 dh,dl; /* Top two words of the divisor */
1550 BNWORD16 qhat; /* Extimate of quotient word */
1551 BNWORD16 r; /* Remainder from quotient estimate division */
1552 BNWORD16 qhigh; /* High word of quotient */
1553 unsigned i; /* Temp */
1554 unsigned shift; /* Bits shifted by normalization */
1555 unsigned qlen = nlen-dlen; /* Size of quotient (less 1) */
1556 #ifdef mul16_ppmm
1557 BNWORD16 t16;
1558 #elif defined(BNWORD32)
1559 BNWORD32 t32;
1560 #else /* use lbnMulN1_16 */
1561 BNWORD16 t2[2];
1562 #define t2high BIGLITTLE(t2[0],t2[1])
1563 #define t2low BIGLITTLE(t2[1],t2[0])
1564 #endif
1565
1566 assert(dlen);
1567 assert(nlen >= dlen);
1568
1569 /*
1570 * Special cases for short divisors. The general case uses the
1571 * top top 2 digits of the divisor (d) to estimate a quotient digit,
1572 * so it breaks if there are fewer digits available. Thus, we need
1573 * special cases for a divisor of length 1. A divisor of length
1574 * 2 can have a *lot* of administrivia overhead removed removed,
1575 * so it's probably worth special-casing that case, too.
1576 */
1577 if (dlen == 1)
1578 return lbnDiv1_16(q, BIGLITTLE(n-1,n), n, nlen,
1579 BIGLITTLE(d[-1],d[0]));
1580
1581 #if 0
1582 /*
1583 * @@@ This is not yet written... The general loop will do,
1584 * albeit less efficiently
1585 */
1586 if (dlen == 2) {
1587 /*
1588 * divisor two digits long:
1589 * use the 3/2 technique from Knuth, but we know
1590 * it's exact.
1591 */
1592 dh = BIGLITTLE(d[-1],d[0]);
1593 dl = BIGLITTLE(d[-2],d[1]);
1594 shift = 0;
1595 if ((sh & ((BNWORD16)1 << 16-1-shift)) == 0) {
1596 do {
1597 shift++;
1598 } while (dh & (BNWORD16)1<<16-1-shift) == 0);
1599 dh = dh << shift | dl >> (16-shift);
1600 dl <<= shift;
1601
1602
1603 }
1604
1605
1606 for (shift = 0; (dh & (BNWORD16)1 << 16-1-shift)) == 0; shift++)
1607 ;
1608 if (shift) {
1609 }
1610 dh = dh << shift | dl >> (16-shift);
1611 shift = 0;
1612 while (dh
1613 }
1614 #endif
1615
1616 dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
1617 assert(dh);
1618
1619 /* Normalize the divisor */
1620 shift = 0;
1621 r = dh;
1622 i = 16/2;
1623 do {
1624 if (r >> i)
1625 r >>= i;
1626 else
1627 shift += i;
1628 } while ((i /= 2) != 0);
1629
1630 nh = 0;
1631 if (shift) {
1632 lbnLshift_16(d, dlen, shift);
1633 dh = BIGLITTLE(*(d-dlen),*(d+(dlen-1)));
1634 nh = lbnLshift_16(n, nlen, shift);
1635 }
1636
1637 /* Assert that dh is now normalized */
1638 assert(dh >> (16-1));
1639
1640 /* Also get the second-most significant word of the divisor */
1641 dl = BIGLITTLE(*(d-(dlen-1)),*(d+(dlen-2)));
1642
1643 /*
1644 * Adjust pointers: n to point to least significant end of first
1645 * first subtract, and q to one the most-significant end of the
1646 * quotient array.
1647 */
1648 BIGLITTLE(n -= qlen,n += qlen);
1649 BIGLITTLE(q -= qlen,q += qlen);
1650
1651 /* Fetch the most significant stored word of the dividend */
1652 nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
1653
1654 /*
1655 * Compute the first digit of the quotient, based on the
1656 * first two words of the dividend (the most significant of which
1657 * is the overflow word h).
1658 */
1659 if (nh) {
1660 assert(nh < dh);
1661 r = lbnDiv21_16(&qhat, nh, nm, dh);
1662 } else if (nm >= dh) {
1663 qhat = nm/dh;
1664 r = nm % dh;
1665 } else { /* Quotient is zero */
1666 qhigh = 0;
1667 goto divloop;
1668 }
1669
1670 /* Now get the third most significant word of the dividend */
1671 nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
1672
1673 /*
1674 * Correct qhat, the estimate of quotient digit.
1675 * qhat can only be high, and at most two words high,
1676 * so the loop can be unrolled and abbreviated.
1677 */
1678 #ifdef mul16_ppmm
1679 mul16_ppmm(nm, t16, qhat, dl);
1680 if (nm > r || (nm == r && t16 > nl)) {
1681 /* Decrement qhat and adjust comparison parameters */
1682 qhat--;
1683 if ((r += dh) >= dh) {
1684 nm -= (t16 < dl);
1685 t16 -= dl;
1686 if (nm > r || (nm == r && t16 > nl))
1687 qhat--;
1688 }
1689 }
1690 #elif defined(BNWORD32)
1691 t32 = (BNWORD32)qhat * dl;
1692 if (t32 > ((BNWORD32)r << 16) + nl) {
1693 /* Decrement qhat and adjust comparison parameters */
1694 qhat--;
1695 if ((r += dh) > dh) {
1696 t32 -= dl;
1697 if (t32 > ((BNWORD32)r << 16) + nl)
1698 qhat--;
1699 }
1700 }
1701 #else /* Use lbnMulN1_16 */
1702 lbnMulN1_16(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
1703 if (t2high > r || (t2high == r && t2low > nl)) {
1704 /* Decrement qhat and adjust comparison parameters */
1705 qhat--;
1706 if ((r += dh) >= dh) {
1707 t2high -= (t2low < dl);
1708 t2low -= dl;
1709 if (t2high > r || (t2high == r && t2low > nl))
1710 qhat--;
1711 }
1712 }
1713 #endif
1714
1715 /* Do the multiply and subtract */
1716 r = lbnMulSub1_16(n, d, dlen, qhat);
1717 /* If there was a borrow, add back once. */
1718 if (r > nh) { /* Borrow? */
1719 (void)lbnAddN_16(n, d, dlen);
1720 qhat--;
1721 }
1722
1723 /* Remember the first quotient digit. */
1724 qhigh = qhat;
1725
1726 /* Now, the main division loop: */
1727 divloop:
1728 while (qlen--) {
1729
1730 /* Advance n */
1731 nh = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
1732 BIGLITTLE(++n,--n);
1733 nm = BIGLITTLE(*(n-dlen),*(n+(dlen-1)));
1734
1735 if (nh == dh) {
1736 qhat = ~(BNWORD16)0;
1737 /* Optimized computation of r = (nh,nm) - qhat * dh */
1738 r = nh + nm;
1739 if (r < nh)
1740 goto subtract;
1741 } else {
1742 assert(nh < dh);
1743 r = lbnDiv21_16(&qhat, nh, nm, dh);
1744 }
1745
1746 nl = BIGLITTLE(*(n-(dlen-1)),*(n+(dlen-2)));
1747 #ifdef mul16_ppmm
1748 mul16_ppmm(nm, t16, qhat, dl);
1749 if (nm > r || (nm == r && t16 > nl)) {
1750 /* Decrement qhat and adjust comparison parameters */
1751 qhat--;
1752 if ((r += dh) >= dh) {
1753 nm -= (t16 < dl);
1754 t16 -= dl;
1755 if (nm > r || (nm == r && t16 > nl))
1756 qhat--;
1757 }
1758 }
1759 #elif defined(BNWORD32)
1760 t32 = (BNWORD32)qhat * dl;
1761 if (t32 > ((BNWORD32)r<<16) + nl) {
1762 /* Decrement qhat and adjust comparison parameters */
1763 qhat--;
1764 if ((r += dh) >= dh) {
1765 t32 -= dl;
1766 if (t32 > ((BNWORD32)r << 16) + nl)
1767 qhat--;
1768 }
1769 }
1770 #else /* Use lbnMulN1_16 */
1771 lbnMulN1_16(BIGLITTLE(t2+2,t2), &dl, 1, qhat);
1772 if (t2high > r || (t2high == r && t2low > nl)) {
1773 /* Decrement qhat and adjust comparison parameters */
1774 qhat--;
1775 if ((r += dh) >= dh) {
1776 t2high -= (t2low < dl);
1777 t2low -= dl;
1778 if (t2high > r || (t2high == r && t2low > nl))
1779 qhat--;
1780 }
1781 }
1782 #endif
1783
1784 /*
1785 * As a point of interest, note that it is not worth checking
1786 * for qhat of 0 or 1 and installing special-case code. These
1787 * occur with probability 2^-16, so spending 1 cycle to check
1788 * for them is only worth it if we save more than 2^15 cycles,
1789 * and a multiply-and-subtract for numbers in the 1024-bit
1790 * range just doesn't take that long.
1791 */
1792 subtract:
1793 /*
1794 * n points to the least significant end of the substring
1795 * of n to be subtracted from. qhat is either exact or
1796 * one too large. If the subtract gets a borrow, it was
1797 * one too large and the divisor is added back in. It's
1798 * a dlen+1 word add which is guaranteed to produce a
1799 * carry out, so it can be done very simply.
1800 */
1801 r = lbnMulSub1_16(n, d, dlen, qhat);
1802 if (r > nh) { /* Borrow? */
1803 (void)lbnAddN_16(n, d, dlen);
1804 qhat--;
1805 }
1806 /* Store the quotient digit */
1807 BIGLITTLE(*q++,*--q) = qhat;
1808 }
1809 /* Tah dah! */
1810
1811 if (shift) {
1812 lbnRshift_16(d, dlen, shift);
1813 lbnRshift_16(n, dlen, shift);
1814 }
1815
1816 return qhigh;
1817 }
1818 #endif
1819
1820 /*
1821 * Find the negative multiplicative inverse of x (x must be odd!) modulo 2^16.
1822 *
1823 * This just performs Newton's iteration until it gets the
1824 * inverse. The initial estimate is always correct to 3 bits, and
1825 * sometimes 4. The number of valid bits doubles each iteration.
1826 * (To prove it, assume x * y == 1 (mod 2^n), and introduce a variable
1827 * for the error mod 2^2n. x * y == 1 + k*2^n (mod 2^2n) and follow
1828 * the iteration through.)
1829 */
1830 #ifndef lbnMontInv1_16
1831 BNWORD16
lbnMontInv1_16(BNWORD16 const x)1832 lbnMontInv1_16(BNWORD16 const x)
1833 {
1834 BNWORD16 y = x, z;
1835
1836 assert(x & 1);
1837
1838 while ((z = x*y) != 1)
1839 y *= 2 - z;
1840 return -y;
1841 }
1842 #endif /* !lbnMontInv1_16 */
1843
1844 #if defined(BNWORD32) && PRODUCT_SCAN
1845 /*
1846 * Test code for product-scanning Montgomery reduction.
1847 * This seems to slow the C code down rather than speed it up.
1848 *
1849 * The first loop computes the Montgomery multipliers, storing them over
1850 * the low half of the number n.
1851 *
1852 * The second half multiplies the upper half, adding in the modulus
1853 * times the Montgomery multipliers. The results of this multiply
1854 * are stored.
1855 */
1856 void
lbnMontReduce_16(BNWORD16 * n,BNWORD16 const * mod,unsigned mlen,BNWORD16 inv)1857 lbnMontReduce_16(BNWORD16 *n, BNWORD16 const *mod, unsigned mlen, BNWORD16 inv)
1858 {
1859 BNWORD32 x, y;
1860 BNWORD16 const *pm;
1861 BNWORD16 *pn;
1862 BNWORD16 t;
1863 unsigned carry;
1864 unsigned i, j;
1865
1866 /* Special case of zero */
1867 if (!mlen)
1868 return;
1869
1870 /* Pass 1 - compute Montgomery multipliers */
1871 /* First iteration can have certain simplifications. */
1872 t = BIGLITTLE(n[-1],n[0]);
1873 x = t;
1874 t *= inv;
1875 BIGLITTLE(n[-1], n[0]) = t;
1876 x += (BNWORD32)t * BIGLITTLE(mod[-1],mod[0]); /* Can't overflow */
1877 assert((BNWORD16)x == 0);
1878 x = x >> 16;
1879
1880 for (i = 1; i < mlen; i++) {
1881 carry = 0;
1882 pn = n;
1883 pm = BIGLITTLE(mod-i-1,mod+i+1);
1884 for (j = 0; j < i; j++) {
1885 y = (BNWORD32)BIGLITTLE(*--pn * *pm++, *pn++ * *--pm);
1886 x += y;
1887 carry += (x < y);
1888 }
1889 assert(BIGLITTLE(pn == n-i, pn == n+i));
1890 y = t = BIGLITTLE(pn[-1], pn[0]);
1891 x += y;
1892 carry += (x < y);
1893 BIGLITTLE(pn[-1], pn[0]) = t = inv * (BNWORD16)x;
1894 assert(BIGLITTLE(pm == mod-1, pm == mod+1));
1895 y = (BNWORD32)t * BIGLITTLE(pm[0],pm[-1]);
1896 x += y;
1897 carry += (x < y);
1898 assert((BNWORD16)x == 0);
1899 x = x >> 16 | (BNWORD32)carry << 16;
1900 }
1901
1902 BIGLITTLE(n -= mlen, n += mlen);
1903
1904 /* Pass 2 - compute upper words and add to n */
1905 for (i = 1; i < mlen; i++) {
1906 carry = 0;
1907 pm = BIGLITTLE(mod-i,mod+i);
1908 pn = n;
1909 for (j = i; j < mlen; j++) {
1910 y = (BNWORD32)BIGLITTLE(*--pm * *pn++, *pm++ * *--pn);
1911 x += y;
1912 carry += (x < y);
1913 }
1914 assert(BIGLITTLE(pm == mod-mlen, pm == mod+mlen));
1915 assert(BIGLITTLE(pn == n+mlen-i, pn == n-mlen+i));
1916 y = t = BIGLITTLE(*(n-i),*(n+i-1));
1917 x += y;
1918 carry += (x < y);
1919 BIGLITTLE(*(n-i),*(n+i-1)) = (BNWORD16)x;
1920 x = (x >> 16) | (BNWORD32)carry << 16;
1921 }
1922
1923 /* Last round of second half, simplified. */
1924 t = BIGLITTLE(*(n-mlen),*(n+mlen-1));
1925 x += t;
1926 BIGLITTLE(*(n-mlen),*(n+mlen-1)) = (BNWORD16)x;
1927 carry = (unsigned)(x >> 16);
1928
1929 while (carry)
1930 carry -= lbnSubN_16(n, mod, mlen);
1931 while (lbnCmp_16(n, mod, mlen) >= 0)
1932 (void)lbnSubN_16(n, mod, mlen);
1933 }
1934 #define lbnMontReduce_16 lbnMontReduce_16
1935 #endif
1936
1937 /*
1938 * Montgomery reduce n, modulo mod. This reduces modulo mod and divides by
1939 * 2^(16*mlen). Returns the result in the *top* mlen words of the argument n.
1940 * This is ready for another multiplication using lbnMul_16.
1941 *
1942 * Montgomery representation is a very useful way to encode numbers when
1943 * you're doing lots of modular reduction. What you do is pick a multiplier
1944 * R which is relatively prime to the modulus and very easy to divide by.
1945 * Since the modulus is odd, R is closen as a power of 2, so the division
1946 * is a shift. In fact, it's a shift of an integral number of words,
1947 * so the shift can be implicit - just drop the low-order words.
1948 *
1949 * Now, choose R *larger* than the modulus m, 2^(16*mlen). Then convert
1950 * all numbers a, b, etc. to Montgomery form M(a), M(b), etc using the
1951 * relationship M(a) = a*R mod m, M(b) = b*R mod m, etc. Note that:
1952 * - The Montgomery form of a number depends on the modulus m.
1953 * A fixed modulus m is assumed throughout this discussion.
1954 * - Since R is relaitvely prime to m, multiplication by R is invertible;
1955 * no information about the numbers is lost, they're just scrambled.
1956 * - Adding (and subtracting) numbers in this form works just as usual.
1957 * M(a+b) = (a+b)*R mod m = (a*R + b*R) mod m = (M(a) + M(b)) mod m
1958 * - Multiplying numbers in this form produces a*b*R*R. The problem
1959 * is to divide out the excess factor of R, modulo m as well as to
1960 * reduce to the given length mlen. It turns out that this can be
1961 * done *faster* than a normal divide, which is where the speedup
1962 * in Montgomery division comes from.
1963 *
1964 * Normal reduction chooses a most-significant quotient digit q and then
1965 * subtracts q*m from the number to be reduced. Choosing q is tricky
1966 * and involved (just look at lbnDiv_16 to see!) and is usually
1967 * imperfect, requiring a check for correction after the subtraction.
1968 *
1969 * Montgomery reduction *adds* a multiple of m to the *low-order* part
1970 * of the number to be reduced. This multiple is chosen to make the
1971 * low-order part of the number come out to zero. This can be done
1972 * with no trickery or error using a precomputed inverse of the modulus.
1973 * In this code, the "part" is one word, but any width can be used.
1974 *
1975 * Repeating this step sufficiently often results in a value which
1976 * is a multiple of R (a power of two, remember) but is still (since
1977 * the additions were to the low-order part and thus did not increase
1978 * the value of the number being reduced very much) still not much
1979 * larger than m*R. Then implicitly divide by R and subtract off
1980 * m until the result is in the correct range.
1981 *
1982 * Since the low-order part being cancelled is less than R, the
1983 * multiple of m added must have a multiplier which is at most R-1.
1984 * Assuming that the input is at most m*R-1, the final number is
1985 * at most m*(2*R-1)-1 = 2*m*R - m - 1, so subtracting m once from
1986 * the high-order part, equivalent to subtracting m*R from the
1987 * while number, produces a result which is at most m*R - m - 1,
1988 * which divided by R is at most m-1.
1989 *
1990 * To convert *to* Montgomery form, you need a regular remainder
1991 * routine, although you can just compute R*R (mod m) and do the
1992 * conversion using Montgomery multiplication. To convert *from*
1993 * Montgomery form, just Montgomery reduce the number to
1994 * remove the extra factor of R.
1995 *
1996 * TODO: Change to a full inverse and use Karatsuba's multiplication
1997 * rather than this word-at-a-time.
1998 */
1999 #ifndef lbnMontReduce_16
2000 void
lbnMontReduce_16(BNWORD16 * n,BNWORD16 const * mod,unsigned const mlen,BNWORD16 inv)2001 lbnMontReduce_16(BNWORD16 *n, BNWORD16 const *mod, unsigned const mlen,
2002 BNWORD16 inv)
2003 {
2004 BNWORD16 t;
2005 BNWORD16 c = 0;
2006 unsigned len = mlen;
2007
2008 /* inv must be the negative inverse of mod's least significant word */
2009 assert((BNWORD16)(inv * BIGLITTLE(mod[-1],mod[0])) == (BNWORD16)-1);
2010
2011 assert(len);
2012
2013 do {
2014 t = lbnMulAdd1_16(n, mod, mlen, inv * BIGLITTLE(n[-1],n[0]));
2015 c += lbnAdd1_16(BIGLITTLE(n-mlen,n+mlen), len, t);
2016 BIGLITTLE(--n,++n);
2017 } while (--len);
2018
2019 /*
2020 * All that adding can cause an overflow past the modulus size,
2021 * but it's unusual, and never by much, so a subtraction loop
2022 * is the right way to deal with it.
2023 * This subtraction happens infrequently - I've only ever seen it
2024 * invoked once per reduction, and then just under 22.5% of the time.
2025 */
2026 while (c)
2027 c -= lbnSubN_16(n, mod, mlen);
2028 while (lbnCmp_16(n, mod, mlen) >= 0)
2029 (void)lbnSubN_16(n, mod, mlen);
2030 }
2031 #endif /* !lbnMontReduce_16 */
2032
2033 /*
2034 * A couple of helpers that you might want to implement atomically
2035 * in asm sometime.
2036 */
2037 #ifndef lbnMontMul_16
2038 /*
2039 * Multiply "num1" by "num2", modulo "mod", all of length "len", and
2040 * place the result in the high half of "prod". "inv" is the inverse
2041 * of the least-significant word of the modulus, modulo 2^16.
2042 * This uses numbers in Montgomery form. Reduce using "len" and "inv".
2043 *
2044 * This is implemented as a macro to win on compilers that don't do
2045 * inlining, since it's so trivial.
2046 */
2047 #define lbnMontMul_16(prod, n1, n2, mod, len, inv) \
2048 (lbnMulX_16(prod, n1, n2, len), lbnMontReduce_16(prod, mod, len, inv))
2049 #endif /* !lbnMontMul_16 */
2050
2051 #ifndef lbnMontSquare_16
2052 /*
2053 * Square "n", modulo "mod", both of length "len", and place the result
2054 * in the high half of "prod". "inv" is the inverse of the least-significant
2055 * word of the modulus, modulo 2^16.
2056 * This uses numbers in Montgomery form. Reduce using "len" and "inv".
2057 *
2058 * This is implemented as a macro to win on compilers that don't do
2059 * inlining, since it's so trivial.
2060 */
2061 #define lbnMontSquare_16(prod, n, mod, len, inv) \
2062 (lbnSquare_16(prod, n, len), lbnMontReduce_16(prod, mod, len, inv))
2063
2064 #endif /* !lbnMontSquare_16 */
2065
2066 /*
2067 * Convert a number to Montgomery form - requires mlen + nlen words
2068 * of memory in "n".
2069 */
2070 void
lbnToMont_16(BNWORD16 * n,unsigned nlen,BNWORD16 * mod,unsigned mlen)2071 lbnToMont_16(BNWORD16 *n, unsigned nlen, BNWORD16 *mod, unsigned mlen)
2072 {
2073 /* Move n up "mlen" words */
2074 lbnCopy_16(BIGLITTLE(n-mlen,n+mlen), n, nlen);
2075 lbnZero_16(n, mlen);
2076 /* Do the division - dump the quotient in the high-order words */
2077 (void)lbnDiv_16(BIGLITTLE(n-mlen,n+mlen), n, mlen+nlen, mod, mlen);
2078 }
2079
2080 /*
2081 * Convert from Montgomery form. Montgomery reduction is all that is
2082 * needed.
2083 */
2084 void
lbnFromMont_16(BNWORD16 * n,BNWORD16 * mod,unsigned len)2085 lbnFromMont_16(BNWORD16 *n, BNWORD16 *mod, unsigned len)
2086 {
2087 /* Zero the high words of n */
2088 lbnZero_16(BIGLITTLE(n-len,n+len), len);
2089 lbnMontReduce_16(n, mod, len, lbnMontInv1_16(mod[BIGLITTLE(-1,0)]));
2090 /* Move n down len words */
2091 lbnCopy_16(n, BIGLITTLE(n-len,n+len), len);
2092 }
2093
2094 /*
2095 * The windowed exponentiation algorithm, precomputes a table of odd
2096 * powers of n up to 2^k. See the comment in bnExpMod_16 below for
2097 * an explanation of how it actually works works.
2098 *
2099 * It takes 2^(k-1)-1 multiplies to compute the table, and (e-1)/(k+1)
2100 * multiplies (on average) to perform the exponentiation. To minimize
2101 * the sum, k must vary with e. The optimal window sizes vary with the
2102 * exponent length. Here are some selected values and the boundary cases.
2103 * (An underscore _ has been inserted into some of the numbers to ensure
2104 * that magic strings like 16 do not appear in this table. It should be
2105 * ignored.)
2106 *
2107 * At e = 1 bits, k=1 (0.000000) is best
2108 * At e = 2 bits, k=1 (0.500000) is best
2109 * At e = 4 bits, k=1 (1.500000) is best
2110 * At e = 8 bits, k=2 (3.333333) < k=1 (3.500000)
2111 * At e = 1_6 bits, k=2 (6.000000) is best
2112 * At e = 26 bits, k=3 (9.250000) < k=2 (9.333333)
2113 * At e = 3_2 bits, k=3 (10.750000) is best
2114 * At e = 6_4 bits, k=3 (18.750000) is best
2115 * At e = 82 bits, k=4 (23.200000) < k=3 (23.250000)
2116 * At e = 128 bits, k=4 (3_2.400000) is best
2117 * At e = 242 bits, k=5 (55.1_66667) < k=4 (55.200000)
2118 * At e = 256 bits, k=5 (57.500000) is best
2119 * At e = 512 bits, k=5 (100.1_66667) is best
2120 * At e = 674 bits, k=6 (127.142857) < k=5 (127.1_66667)
2121 * At e = 1024 bits, k=6 (177.142857) is best
2122 * At e = 1794 bits, k=7 (287.125000) < k=6 (287.142857)
2123 * At e = 2048 bits, k=7 (318.875000) is best
2124 * At e = 4096 bits, k=7 (574.875000) is best
2125 *
2126 * The numbers in parentheses are the expected number of multiplications
2127 * needed to do the computation. The normal russian-peasant modular
2128 * exponentiation technique always uses (e-1)/2. For exponents as
2129 * small as 192 bits (below the range of current factoring algorithms),
2130 * half of the multiplies are eliminated, 45.2 as opposed to the naive
2131 * 95.5. Counting the 191 squarings as 3/4 a multiply each (squaring
2132 * proper is just over half of multiplying, but the Montgomery
2133 * reduction in each case is also a multiply), that's 143.25
2134 * multiplies, for totals of 188.45 vs. 238.75 - a 21% savings.
2135 * For larger exponents (like 512 bits), it's 483.92 vs. 639.25, a
2136 * 24.3% savings. It asymptotically approaches 25%.
2137 *
2138 * Um, actually there's a slightly more accurate way to count, which
2139 * really is the average number of multiplies required, averaged
2140 * uniformly over all 2^(e-1) e-bit numbers, from 2^(e-1) to (2^e)-1.
2141 * It's based on the recurrence that for the last b bits, b <= k, at
2142 * most one multiply is needed (and none at all 1/2^b of the time),
2143 * while when b > k, the odds are 1/2 each way that the bit will be
2144 * 0 (meaning no multiplies to reduce it to the b-1-bit case) and
2145 * 1/2 that the bit will be 1, starting a k-bit window and requiring
2146 * 1 multiply beyond the b-k-bit case. Since the most significant
2147 * bit is always 1, a k-bit window always starts there, and that
2148 * multiply is by 1, so it isn't a multiply at all. Thus, the
2149 * number of multiplies is simply that needed for the last e-k bits.
2150 * This recurrence produces:
2151 *
2152 * At e = 1 bits, k=1 (0.000000) is best
2153 * At e = 2 bits, k=1 (0.500000) is best
2154 * At e = 4 bits, k=1 (1.500000) is best
2155 * At e = 6 bits, k=2 (2.437500) < k=1 (2.500000)
2156 * At e = 8 bits, k=2 (3.109375) is best
2157 * At e = 1_6 bits, k=2 (5.777771) is best
2158 * At e = 24 bits, k=3 (8.437629) < k=2 (8.444444)
2159 * At e = 3_2 bits, k=3 (10.437492) is best
2160 * At e = 6_4 bits, k=3 (18.437500) is best
2161 * At e = 81 bits, k=4 (22.6_40000) < k=3 (22.687500)
2162 * At e = 128 bits, k=4 (3_2.040000) is best
2163 * At e = 241 bits, k=5 (54.611111) < k=4 (54.6_40000)
2164 * At e = 256 bits, k=5 (57.111111) is best
2165 * At e = 512 bits, k=5 (99.777778) is best
2166 * At e = 673 bits, k=6 (126.591837) < k=5 (126.611111)
2167 * At e = 1024 bits, k=6 (176.734694) is best
2168 * At e = 1793 bits, k=7 (286.578125) < k=6 (286.591837)
2169 * At e = 2048 bits, k=7 (318.453125) is best
2170 * At e = 4096 bits, k=7 (574.453125) is best
2171 *
2172 * This has the rollover points at 6, 24, 81, 241, 673 and 1793 instead
2173 * of 8, 26, 82, 242, 674, and 1794. Not a very big difference.
2174 * (The numbers past that are k=8 at 4609 and k=9 at 11521,
2175 * vs. one more in each case for the approximation.)
2176 *
2177 * Given that exponents for which k>7 are useful are uncommon,
2178 * a fixed size table for k <= 7 is used for simplicity.
2179 *
2180 * The basic number of squarings needed is e-1, although a k-bit
2181 * window (for k > 1) can save, on average, k-2 of those, too.
2182 * That savings currently isn't counted here. It would drive the
2183 * crossover points slightly lower.
2184 * (Actually, this win is also reduced in the DoubleExpMod case,
2185 * meaning we'd have to split the tables. Except for that, the
2186 * multiplies by powers of the two bases are independent, so
2187 * the same logic applies to each as the single case.)
2188 *
2189 * Table entry i is the largest number of bits in an exponent to
2190 * process with a window size of i+1. Entry 6 is the largest
2191 * possible unsigned number, so the window will never be more
2192 * than 7 bits, requiring 2^6 = 0x40 slots.
2193 */
2194 #define BNEXPMOD_MAX_WINDOW 7
2195 static unsigned const bnExpModThreshTable[BNEXPMOD_MAX_WINDOW] = {
2196 5, 23, 80, 240, 672, 1792, (unsigned)-1
2197 /* 7, 25, 81, 241, 673, 1793, (unsigned)-1 ### The old approximations */
2198 };
2199
2200 /*
2201 * Perform modular exponentiation, as fast as possible! This uses
2202 * Montgomery reduction, optimized squaring, and windowed exponentiation.
2203 * The modulus "mod" MUST be odd!
2204 *
2205 * This returns 0 on success, -1 on out of memory.
2206 *
2207 * The window algorithm:
2208 * The idea is to keep a running product of b1 = n^(high-order bits of exp),
2209 * and then keep appending exponent bits to it. The following patterns
2210 * apply to a 3-bit window (k = 3):
2211 * To append 0: square
2212 * To append 1: square, multiply by n^1
2213 * To append 10: square, multiply by n^1, square
2214 * To append 11: square, square, multiply by n^3
2215 * To append 100: square, multiply by n^1, square, square
2216 * To append 101: square, square, square, multiply by n^5
2217 * To append 110: square, square, multiply by n^3, square
2218 * To append 111: square, square, square, multiply by n^7
2219 *
2220 * Since each pattern involves only one multiply, the longer the pattern
2221 * the better, except that a 0 (no multiplies) can be appended directly.
2222 * We precompute a table of odd powers of n, up to 2^k, and can then
2223 * multiply k bits of exponent at a time. Actually, assuming random
2224 * exponents, there is on average one zero bit between needs to
2225 * multiply (1/2 of the time there's none, 1/4 of the time there's 1,
2226 * 1/8 of the time, there's 2, 1/16 of the time, there's 3, etc.), so
2227 * you have to do one multiply per k+1 bits of exponent.
2228 *
2229 * The loop walks down the exponent, squaring the result buffer as
2230 * it goes. There is a wbits+1 bit lookahead buffer, buf, that is
2231 * filled with the upcoming exponent bits. (What is read after the
2232 * end of the exponent is unimportant, but it is filled with zero here.)
2233 * When the most-significant bit of this buffer becomes set, i.e.
2234 * (buf & tblmask) != 0, we have to decide what pattern to multiply
2235 * by, and when to do it. We decide, remember to do it in future
2236 * after a suitable number of squarings have passed (e.g. a pattern
2237 * of "100" in the buffer requires that we multiply by n^1 immediately;
2238 * a pattern of "110" calls for multiplying by n^3 after one more
2239 * squaring), clear the buffer, and continue.
2240 *
2241 * When we start, there is one more optimization: the result buffer
2242 * is implcitly one, so squaring it or multiplying by it can be
2243 * optimized away. Further, if we start with a pattern like "100"
2244 * in the lookahead window, rather than placing n into the buffer
2245 * and then starting to square it, we have already computed n^2
2246 * to compute the odd-powers table, so we can place that into
2247 * the buffer and save a squaring.
2248 *
2249 * This means that if you have a k-bit window, to compute n^z,
2250 * where z is the high k bits of the exponent, 1/2 of the time
2251 * it requires no squarings. 1/4 of the time, it requires 1
2252 * squaring, ... 1/2^(k-1) of the time, it reqires k-2 squarings.
2253 * And the remaining 1/2^(k-1) of the time, the top k bits are a
2254 * 1 followed by k-1 0 bits, so it again only requires k-2
2255 * squarings, not k-1. The average of these is 1. Add that
2256 * to the one squaring we have to do to compute the table,
2257 * and you'll see that a k-bit window saves k-2 squarings
2258 * as well as reducing the multiplies. (It actually doesn't
2259 * hurt in the case k = 1, either.)
2260 *
2261 * n must have mlen words allocated. Although fewer may be in use
2262 * when n is passed in, all are in use on exit.
2263 */
2264 int
lbnExpMod_16(BNWORD16 * result,BNWORD16 const * n,unsigned nlen,BNWORD16 const * e,unsigned elen,BNWORD16 * mod,unsigned mlen)2265 lbnExpMod_16(BNWORD16 *result, BNWORD16 const *n, unsigned nlen,
2266 BNWORD16 const *e, unsigned elen, BNWORD16 *mod, unsigned mlen)
2267 {
2268 BNWORD16 *table[1 << (BNEXPMOD_MAX_WINDOW-1)];
2269 /* Table of odd powers of n */
2270 unsigned ebits; /* Exponent bits */
2271 unsigned wbits; /* Window size */
2272 unsigned tblmask; /* Mask of exponentiation window */
2273 BNWORD16 bitpos; /* Mask of current look-ahead bit */
2274 unsigned buf; /* Buffer of exponent bits */
2275 unsigned multpos; /* Where to do pending multiply */
2276 BNWORD16 const *mult; /* What to multiply by */
2277 unsigned i; /* Loop counter */
2278 int isone; /* Flag: accum. is implicitly one */
2279 BNWORD16 *a, *b; /* Working buffers/accumulators */
2280 BNWORD16 *t; /* Pointer into the working buffers */
2281 BNWORD16 inv; /* mod^-1 modulo 2^16 */
2282 int y; /* bnYield() result */
2283
2284 assert(mlen);
2285 assert(nlen <= mlen);
2286
2287 /* First, a couple of trivial cases. */
2288 elen = lbnNorm_16(e, elen);
2289 if (!elen) {
2290 /* x ^ 0 == 1 */
2291 lbnZero_16(result, mlen);
2292 BIGLITTLE(result[-1],result[0]) = 1;
2293 return 0;
2294 }
2295 ebits = lbnBits_16(e, elen);
2296 if (ebits == 1) {
2297 /* x ^ 1 == x */
2298 if (n != result)
2299 lbnCopy_16(result, n, nlen);
2300 if (mlen > nlen)
2301 lbnZero_16(BIGLITTLE(result-nlen,result+nlen),
2302 mlen-nlen);
2303 return 0;
2304 }
2305
2306 /* Okay, now move the exponent pointer to the most-significant word */
2307 e = BIGLITTLE(e-elen, e+elen-1);
2308
2309 /* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
2310 wbits = 0;
2311 while (ebits > bnExpModThreshTable[wbits])
2312 wbits++;
2313
2314 /* Allocate working storage: two product buffers and the tables. */
2315 LBNALLOC(a, BNWORD16, 2*mlen);
2316 if (!a)
2317 return -1;
2318 LBNALLOC(b, BNWORD16, 2*mlen);
2319 if (!b) {
2320 LBNFREE(a, 2*mlen);
2321 return -1;
2322 }
2323
2324 /* Convert to the appropriate table size: tblmask = 1<<(k-1) */
2325 tblmask = 1u << wbits;
2326
2327 /* We have the result buffer available, so use it. */
2328 table[0] = result;
2329
2330 /*
2331 * Okay, we now have a minimal-sized table - expand it.
2332 * This is allowed to fail! If so, scale back the table size
2333 * and proceed.
2334 */
2335 for (i = 1; i < tblmask; i++) {
2336 LBNALLOC(t, BNWORD16, mlen);
2337 if (!t) /* Out of memory! Quit the loop. */
2338 break;
2339 table[i] = t;
2340 }
2341
2342 /* If we stopped, with i < tblmask, shrink the tables appropriately */
2343 while (tblmask > i) {
2344 wbits--;
2345 tblmask >>= 1;
2346 }
2347 /* Free up our overallocations */
2348 while (--i > tblmask)
2349 LBNFREE(table[i], mlen);
2350
2351 /* Okay, fill in the table */
2352
2353 /* Compute the necessary modular inverse */
2354 inv = lbnMontInv1_16(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
2355
2356 /* Convert n to Montgomery form */
2357
2358 /* Move n up "mlen" words into a */
2359 t = BIGLITTLE(a-mlen, a+mlen);
2360 lbnCopy_16(t, n, nlen);
2361 lbnZero_16(a, mlen);
2362 /* Do the division - lose the quotient into the high-order words */
2363 (void)lbnDiv_16(t, a, mlen+nlen, mod, mlen);
2364 /* Copy into first table entry */
2365 lbnCopy_16(table[0], a, mlen);
2366
2367 /* Square a into b */
2368 lbnMontSquare_16(b, a, mod, mlen, inv);
2369
2370 /* Use high half of b to initialize the table */
2371 t = BIGLITTLE(b-mlen, b+mlen);
2372 for (i = 1; i < tblmask; i++) {
2373 lbnMontMul_16(a, t, table[i-1], mod, mlen, inv);
2374 lbnCopy_16(table[i], BIGLITTLE(a-mlen, a+mlen), mlen);
2375 #if BNYIELD
2376 if (bnYield && (y = bnYield()) < 0)
2377 goto yield;
2378 #endif
2379 }
2380
2381 /* We might use b = n^2 later... */
2382
2383 /* Initialze the fetch pointer */
2384 bitpos = (BNWORD16)1 << ((ebits-1) & (16-1)); /* Initialize mask */
2385
2386 /* This should point to the msbit of e */
2387 assert((*e & bitpos) != 0);
2388
2389 /*
2390 * Pre-load the window. Becuase the window size is
2391 * never larger than the exponent size, there is no need to
2392 * detect running off the end of e in here.
2393 *
2394 * The read-ahead is controlled by elen and the bitpos mask.
2395 * Note that this is *ahead* of ebits, which tracks the
2396 * most significant end of the window. The purpose of this
2397 * initialization is to get the two wbits+1 bits apart,
2398 * like they should be.
2399 *
2400 * Note that bitpos and e1len together keep track of the
2401 * lookahead read pointer in the exponent that is used here.
2402 */
2403 buf = 0;
2404 for (i = 0; i <= wbits; i++) {
2405 buf = (buf << 1) | ((*e & bitpos) != 0);
2406 bitpos >>= 1;
2407 if (!bitpos) {
2408 BIGLITTLE(e++,e--);
2409 bitpos = (BNWORD16)1 << (16-1);
2410 elen--;
2411 }
2412 }
2413 assert(buf & tblmask);
2414
2415 /*
2416 * Set the pending multiply positions to a location that will
2417 * never be encountered, thus ensuring that nothing will happen
2418 * until the need for a multiply appears and one is scheduled.
2419 */
2420 multpos = ebits; /* A NULL value */
2421 mult = 0; /* Force a crash if we use these */
2422
2423 /*
2424 * Okay, now begins the real work. The first step is
2425 * slightly magic, so it's done outside the main loop,
2426 * but it's very similar to what's inside.
2427 */
2428 ebits--; /* Start processing the first bit... */
2429 isone = 1;
2430
2431 /*
2432 * This is just like the multiply in the loop, except that
2433 * - We know the msbit of buf is set, and
2434 * - We have the extra value n^2 floating around.
2435 * So, do the usual computation, and if the result is that
2436 * the buffer should be multiplied by n^1 immediately
2437 * (which we'd normally then square), we multiply it
2438 * (which reduces to a copy, which reduces to setting a flag)
2439 * by n^2 and skip the squaring. Thus, we do the
2440 * multiply and the squaring in one step.
2441 */
2442 assert(buf & tblmask);
2443 multpos = ebits - wbits;
2444 while ((buf & 1) == 0) {
2445 buf >>= 1;
2446 multpos++;
2447 }
2448 /* Intermediates can wrap, but final must NOT */
2449 assert(multpos <= ebits);
2450 mult = table[buf>>1];
2451 buf = 0;
2452
2453 /* Special case: use already-computed value sitting in buffer */
2454 if (multpos == ebits)
2455 isone = 0;
2456
2457 /*
2458 * At this point, the buffer (which is the high half of b) holds
2459 * either 1 (implicitly, as the "isone" flag is set), or n^2.
2460 */
2461
2462 /*
2463 * The main loop. The procedure is:
2464 * - Advance the window
2465 * - If the most-significant bit of the window is set,
2466 * schedule a multiply for the appropriate time in the
2467 * future (may be immediately)
2468 * - Perform any pending multiples
2469 * - Check for termination
2470 * - Square the buffer
2471 *
2472 * At any given time, the acumulated product is held in
2473 * the high half of b.
2474 */
2475 for (;;) {
2476 ebits--;
2477
2478 /* Advance the window */
2479 assert(buf < tblmask);
2480 buf <<= 1;
2481 /*
2482 * This reads ahead of the current exponent position
2483 * (controlled by ebits), so we have to be able to read
2484 * past the lsb of the exponents without error.
2485 */
2486 if (elen) {
2487 buf |= ((*e & bitpos) != 0);
2488 bitpos >>= 1;
2489 if (!bitpos) {
2490 BIGLITTLE(e++,e--);
2491 bitpos = (BNWORD16)1 << (16-1);
2492 elen--;
2493 }
2494 }
2495
2496 /* Examine the window for pending multiplies */
2497 if (buf & tblmask) {
2498 multpos = ebits - wbits;
2499 while ((buf & 1) == 0) {
2500 buf >>= 1;
2501 multpos++;
2502 }
2503 /* Intermediates can wrap, but final must NOT */
2504 assert(multpos <= ebits);
2505 mult = table[buf>>1];
2506 buf = 0;
2507 }
2508
2509 /* If we have a pending multiply, do it */
2510 if (ebits == multpos) {
2511 /* Multiply by the table entry remembered previously */
2512 t = BIGLITTLE(b-mlen, b+mlen);
2513 if (isone) {
2514 /* Multiply by 1 is a trivial case */
2515 lbnCopy_16(t, mult, mlen);
2516 isone = 0;
2517 } else {
2518 lbnMontMul_16(a, t, mult, mod, mlen, inv);
2519 /* Swap a and b */
2520 t = a; a = b; b = t;
2521 }
2522 }
2523
2524 /* Are we done? */
2525 if (!ebits)
2526 break;
2527
2528 /* Square the input */
2529 if (!isone) {
2530 t = BIGLITTLE(b-mlen, b+mlen);
2531 lbnMontSquare_16(a, t, mod, mlen, inv);
2532 /* Swap a and b */
2533 t = a; a = b; b = t;
2534 }
2535 #if BNYIELD
2536 if (bnYield && (y = bnYield()) < 0)
2537 goto yield;
2538 #endif
2539 } /* for (;;) */
2540
2541 assert(!isone);
2542 assert(!buf);
2543
2544 /* DONE! */
2545
2546 /* Convert result out of Montgomery form */
2547 t = BIGLITTLE(b-mlen, b+mlen);
2548 lbnCopy_16(b, t, mlen);
2549 lbnZero_16(t, mlen);
2550 lbnMontReduce_16(b, mod, mlen, inv);
2551 lbnCopy_16(result, t, mlen);
2552 /*
2553 * Clean up - free intermediate storage.
2554 * Do NOT free table[0], which is the result
2555 * buffer.
2556 */
2557 y = 0;
2558 #if BNYIELD
2559 yield:
2560 #endif
2561 while (--tblmask)
2562 LBNFREE(table[tblmask], mlen);
2563 LBNFREE(b, 2*mlen);
2564 LBNFREE(a, 2*mlen);
2565
2566 return y; /* Success */
2567 }
2568
2569 #if 0
2570 /*
2571 * Compute and return n1^e1 * n2^e2 mod "mod".
2572 * result may be either input buffer, or something separate.
2573 * It must be "mlen" words long.
2574 *
2575 * There is a current position in the exponents, which is kept in e1bits.
2576 * (The exponents are swapped if necessary so e1 is the longer of the two.)
2577 * At any given time, the value in the accumulator is
2578 * n1^(e1>>e1bits) * n2^(e2>>e1bits) mod "mod".
2579 * As e1bits is counted down, this is updated, by squaring it and doing
2580 * any necessary multiplies.
2581 * To decide on the necessary multiplies, two windows, each w1bits+1 bits
2582 * wide, are maintained in buf1 and buf2, which read *ahead* of the
2583 * e1bits position (with appropriate handling of the case when e1bits
2584 * drops below w1bits+1). When the most-significant bit of either window
2585 * becomes set, indicating that something needs to be multiplied by
2586 * the accumulator or it will get out of sync, the window is examined
2587 * to see which power of n1 or n2 to multiply by, and when (possibly
2588 * later, if the power is greater than 1) the multiply should take
2589 * place. Then the multiply and its location are remembered and the
2590 * window is cleared.
2591 *
2592 * If we had every power of n1 in the table, the multiply would always
2593 * be w1bits steps in the future. But we only keep the odd powers,
2594 * so instead of waiting w1bits squarings and then multiplying
2595 * by n1^k, we wait w1bits-k squarings and multiply by n1.
2596 *
2597 * Actually, w2bits can be less than w1bits, but the window is the same
2598 * size, to make it easier to keep track of where we're reading. The
2599 * appropriate number of low-order bits of the window are just ignored.
2600 */
2601 int
2602 lbnDoubleExpMod_16(BNWORD16 *result,
2603 BNWORD16 const *n1, unsigned n1len,
2604 BNWORD16 const *e1, unsigned e1len,
2605 BNWORD16 const *n2, unsigned n2len,
2606 BNWORD16 const *e2, unsigned e2len,
2607 BNWORD16 *mod, unsigned mlen)
2608 {
2609 BNWORD16 *table1[1 << (BNEXPMOD_MAX_WINDOW-1)];
2610 /* Table of odd powers of n1 */
2611 BNWORD16 *table2[1 << (BNEXPMOD_MAX_WINDOW-1)];
2612 /* Table of odd powers of n2 */
2613 unsigned e1bits, e2bits; /* Exponent bits */
2614 unsigned w1bits, w2bits; /* Window sizes */
2615 unsigned tblmask; /* Mask of exponentiation window */
2616 BNWORD16 bitpos; /* Mask of current look-ahead bit */
2617 unsigned buf1, buf2; /* Buffer of exponent bits */
2618 unsigned mult1pos, mult2pos; /* Where to do pending multiply */
2619 BNWORD16 const *mult1, *mult2; /* What to multiply by */
2620 unsigned i; /* Loop counter */
2621 int isone; /* Flag: accum. is implicitly one */
2622 BNWORD16 *a, *b; /* Working buffers/accumulators */
2623 BNWORD16 *t; /* Pointer into the working buffers */
2624 BNWORD16 inv; /* mod^-1 modulo 2^16 */
2625 int y; /* bnYield() result */
2626
2627 assert(mlen);
2628 assert(n1len <= mlen);
2629 assert(n2len <= mlen);
2630
2631 /* First, a couple of trivial cases. */
2632 e1len = lbnNorm_16(e1, e1len);
2633 e2len = lbnNorm_16(e2, e2len);
2634
2635 /* Ensure that the first exponent is the longer */
2636 e1bits = lbnBits_16(e1, e1len);
2637 e2bits = lbnBits_16(e2, e2len);
2638 if (e1bits < e2bits) {
2639 i = e1len; e1len = e2len; e2len = i;
2640 i = e1bits; e1bits = e2bits; e2bits = i;
2641 t = (BNWORD16 *)n1; n1 = n2; n2 = t;
2642 t = (BNWORD16 *)e1; e1 = e2; e2 = t;
2643 }
2644 assert(e1bits >= e2bits);
2645
2646 /* Handle a trivial case */
2647 if (!e2len)
2648 return lbnExpMod_16(result, n1, n1len, e1, e1len, mod, mlen);
2649 assert(e2bits);
2650
2651 /* The code below fucks up if the exponents aren't at least 2 bits */
2652 if (e1bits == 1) {
2653 assert(e2bits == 1);
2654
2655 LBNALLOC(a, BNWORD16, n1len+n2len);
2656 if (!a)
2657 return -1;
2658
2659 lbnMul_16(a, n1, n1len, n2, n2len);
2660 /* Do a direct modular reduction */
2661 if (n1len + n2len >= mlen)
2662 (void)lbnDiv_16(a+mlen, a, n1len+n2len, mod, mlen);
2663 lbnCopy_16(result, a, mlen);
2664 LBNFREE(a, n1len+n2len);
2665 return 0;
2666 }
2667
2668 /* Okay, now move the exponent pointers to the most-significant word */
2669 e1 = BIGLITTLE(e1-e1len, e1+e1len-1);
2670 e2 = BIGLITTLE(e2-e2len, e2+e2len-1);
2671
2672 /* Look up appropriate k-1 for the exponent - tblmask = 1<<(k-1) */
2673 w1bits = 0;
2674 while (e1bits > bnExpModThreshTable[w1bits])
2675 w1bits++;
2676 w2bits = 0;
2677 while (e2bits > bnExpModThreshTable[w2bits])
2678 w2bits++;
2679
2680 assert(w1bits >= w2bits);
2681
2682 /* Allocate working storage: two product buffers and the tables. */
2683 LBNALLOC(a, BNWORD16, 2*mlen);
2684 if (!a)
2685 return -1;
2686 LBNALLOC(b, BNWORD16, 2*mlen);
2687 if (!b) {
2688 LBNFREE(a, 2*mlen);
2689 return -1;
2690 }
2691
2692 /* Convert to the appropriate table size: tblmask = 1<<(k-1) */
2693 tblmask = 1u << w1bits;
2694 /* Use buf2 for its size, temporarily */
2695 buf2 = 1u << w2bits;
2696
2697 LBNALLOC(t, BNWORD16, mlen);
2698 if (!t) {
2699 LBNFREE(b, 2*mlen);
2700 LBNFREE(a, 2*mlen);
2701 return -1;
2702 }
2703 table1[0] = t;
2704 table2[0] = result;
2705
2706 /*
2707 * Okay, we now have some minimal-sized tables - expand them.
2708 * This is allowed to fail! If so, scale back the table sizes
2709 * and proceed. We allocate both tables at the same time
2710 * so if it fails partway through, they'll both be a reasonable
2711 * size rather than one huge and one tiny.
2712 * When i passes buf2 (the number of entries in the e2 window,
2713 * which may be less than the number of entries in the e1 window),
2714 * stop allocating e2 space.
2715 */
2716 for (i = 1; i < tblmask; i++) {
2717 LBNALLOC(t, BNWORD16, mlen);
2718 if (!t) /* Out of memory! Quit the loop. */
2719 break;
2720 table1[i] = t;
2721 if (i < buf2) {
2722 LBNALLOC(t, BNWORD16, mlen);
2723 if (!t) {
2724 LBNFREE(table1[i], mlen);
2725 break;
2726 }
2727 table2[i] = t;
2728 }
2729 }
2730
2731 /* If we stopped, with i < tblmask, shrink the tables appropriately */
2732 while (tblmask > i) {
2733 w1bits--;
2734 tblmask >>= 1;
2735 }
2736 /* Free up our overallocations */
2737 while (--i > tblmask) {
2738 if (i < buf2)
2739 LBNFREE(table2[i], mlen);
2740 LBNFREE(table1[i], mlen);
2741 }
2742 /* And shrink the second window too, if needed */
2743 if (w2bits > w1bits) {
2744 w2bits = w1bits;
2745 buf2 = tblmask;
2746 }
2747
2748 /*
2749 * From now on, use the w2bits variable for the difference
2750 * between w1bits and w2bits.
2751 */
2752 w2bits = w1bits-w2bits;
2753
2754 /* Okay, fill in the tables */
2755
2756 /* Compute the necessary modular inverse */
2757 inv = lbnMontInv1_16(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
2758
2759 /* Convert n1 to Montgomery form */
2760
2761 /* Move n1 up "mlen" words into a */
2762 t = BIGLITTLE(a-mlen, a+mlen);
2763 lbnCopy_16(t, n1, n1len);
2764 lbnZero_16(a, mlen);
2765 /* Do the division - lose the quotient into the high-order words */
2766 (void)lbnDiv_16(t, a, mlen+n1len, mod, mlen);
2767 /* Copy into first table entry */
2768 lbnCopy_16(table1[0], a, mlen);
2769
2770 /* Square a into b */
2771 lbnMontSquare_16(b, a, mod, mlen, inv);
2772
2773 /* Use high half of b to initialize the first table */
2774 t = BIGLITTLE(b-mlen, b+mlen);
2775 for (i = 1; i < tblmask; i++) {
2776 lbnMontMul_16(a, t, table1[i-1], mod, mlen, inv);
2777 lbnCopy_16(table1[i], BIGLITTLE(a-mlen, a+mlen), mlen);
2778 #if BNYIELD
2779 if (bnYield && (y = bnYield()) < 0)
2780 goto yield;
2781 #endif
2782 }
2783
2784 /* Convert n2 to Montgomery form */
2785
2786 t = BIGLITTLE(a-mlen, a+mlen);
2787 /* Move n2 up "mlen" words into a */
2788 lbnCopy_16(t, n2, n2len);
2789 lbnZero_16(a, mlen);
2790 /* Do the division - lose the quotient into the high-order words */
2791 (void)lbnDiv_16(t, a, mlen+n2len, mod, mlen);
2792 /* Copy into first table entry */
2793 lbnCopy_16(table2[0], a, mlen);
2794
2795 /* Square it into a */
2796 lbnMontSquare_16(a, table2[0], mod, mlen, inv);
2797 /* Copy to b, low half */
2798 lbnCopy_16(b, t, mlen);
2799
2800 /* Use b to initialize the second table */
2801 for (i = 1; i < buf2; i++) {
2802 lbnMontMul_16(a, b, table2[i-1], mod, mlen, inv);
2803 lbnCopy_16(table2[i], t, mlen);
2804 #if BNYIELD
2805 if (bnYield && (y = bnYield()) < 0)
2806 goto yield;
2807 #endif
2808 }
2809
2810 /*
2811 * Okay, a recap: at this point, the low part of b holds
2812 * n2^2, the high part holds n1^2, and the tables are
2813 * initialized with the odd powers of n1 and n2 from 1
2814 * through 2*tblmask-1 and 2*buf2-1.
2815 *
2816 * We might use those squares in b later, or we might not.
2817 */
2818
2819 /* Initialze the fetch pointer */
2820 bitpos = (BNWORD16)1 << ((e1bits-1) & (16-1)); /* Initialize mask */
2821
2822 /* This should point to the msbit of e1 */
2823 assert((*e1 & bitpos) != 0);
2824
2825 /*
2826 * Pre-load the windows. Becuase the window size is
2827 * never larger than the exponent size, there is no need to
2828 * detect running off the end of e1 in here.
2829 *
2830 * The read-ahead is controlled by e1len and the bitpos mask.
2831 * Note that this is *ahead* of e1bits, which tracks the
2832 * most significant end of the window. The purpose of this
2833 * initialization is to get the two w1bits+1 bits apart,
2834 * like they should be.
2835 *
2836 * Note that bitpos and e1len together keep track of the
2837 * lookahead read pointer in the exponent that is used here.
2838 * e2len is not decremented, it is only ever compared with
2839 * e1len as *that* is decremented.
2840 */
2841 buf1 = buf2 = 0;
2842 for (i = 0; i <= w1bits; i++) {
2843 buf1 = (buf1 << 1) | ((*e1 & bitpos) != 0);
2844 if (e1len <= e2len)
2845 buf2 = (buf2 << 1) | ((*e2 & bitpos) != 0);
2846 bitpos >>= 1;
2847 if (!bitpos) {
2848 BIGLITTLE(e1++,e1--);
2849 if (e1len <= e2len)
2850 BIGLITTLE(e2++,e2--);
2851 bitpos = (BNWORD16)1 << (16-1);
2852 e1len--;
2853 }
2854 }
2855 assert(buf1 & tblmask);
2856
2857 /*
2858 * Set the pending multiply positions to a location that will
2859 * never be encountered, thus ensuring that nothing will happen
2860 * until the need for a multiply appears and one is scheduled.
2861 */
2862 mult1pos = mult2pos = e1bits; /* A NULL value */
2863 mult1 = mult2 = 0; /* Force a crash if we use these */
2864
2865 /*
2866 * Okay, now begins the real work. The first step is
2867 * slightly magic, so it's done outside the main loop,
2868 * but it's very similar to what's inside.
2869 */
2870 isone = 1; /* Buffer is implicitly 1, so replace * by copy */
2871 e1bits--; /* Start processing the first bit... */
2872
2873 /*
2874 * This is just like the multiply in the loop, except that
2875 * - We know the msbit of buf1 is set, and
2876 * - We have the extra value n1^2 floating around.
2877 * So, do the usual computation, and if the result is that
2878 * the buffer should be multiplied by n1^1 immediately
2879 * (which we'd normally then square), we multiply it
2880 * (which reduces to a copy, which reduces to setting a flag)
2881 * by n1^2 and skip the squaring. Thus, we do the
2882 * multiply and the squaring in one step.
2883 */
2884 assert(buf1 & tblmask);
2885 mult1pos = e1bits - w1bits;
2886 while ((buf1 & 1) == 0) {
2887 buf1 >>= 1;
2888 mult1pos++;
2889 }
2890 /* Intermediates can wrap, but final must NOT */
2891 assert(mult1pos <= e1bits);
2892 mult1 = table1[buf1>>1];
2893 buf1 = 0;
2894
2895 /* Special case: use already-computed value sitting in buffer */
2896 if (mult1pos == e1bits)
2897 isone = 0;
2898
2899 /*
2900 * The first multiply by a power of n2. Similar, but
2901 * we might not even want to schedule a multiply if e2 is
2902 * shorter than e1, and the window might be shorter so
2903 * we have to leave the low w2bits bits alone.
2904 */
2905 if (buf2 & tblmask) {
2906 /* Remember low-order bits for later */
2907 i = buf2 & ((1u << w2bits) - 1);
2908 buf2 >>= w2bits;
2909 mult2pos = e1bits - w1bits + w2bits;
2910 while ((buf2 & 1) == 0) {
2911 buf2 >>= 1;
2912 mult2pos++;
2913 }
2914 assert(mult2pos <= e1bits);
2915 mult2 = table2[buf2>>1];
2916 buf2 = i;
2917
2918 if (mult2pos == e1bits) {
2919 t = BIGLITTLE(b-mlen, b+mlen);
2920 if (isone) {
2921 lbnCopy_16(t, b, mlen); /* Copy low to high */
2922 isone = 0;
2923 } else {
2924 lbnMontMul_16(a, t, b, mod, mlen, inv);
2925 t = a; a = b; b = t;
2926 }
2927 }
2928 }
2929
2930 /*
2931 * At this point, the buffer (which is the high half of b)
2932 * holds either 1 (implicitly, as the "isone" flag is set),
2933 * n1^2, n2^2 or n1^2 * n2^2.
2934 */
2935
2936 /*
2937 * The main loop. The procedure is:
2938 * - Advance the windows
2939 * - If the most-significant bit of a window is set,
2940 * schedule a multiply for the appropriate time in the
2941 * future (may be immediately)
2942 * - Perform any pending multiples
2943 * - Check for termination
2944 * - Square the buffers
2945 *
2946 * At any given time, the acumulated product is held in
2947 * the high half of b.
2948 */
2949 for (;;) {
2950 e1bits--;
2951
2952 /* Advance the windows */
2953 assert(buf1 < tblmask);
2954 buf1 <<= 1;
2955 assert(buf2 < tblmask);
2956 buf2 <<= 1;
2957 /*
2958 * This reads ahead of the current exponent position
2959 * (controlled by e1bits), so we have to be able to read
2960 * past the lsb of the exponents without error.
2961 */
2962 if (e1len) {
2963 buf1 |= ((*e1 & bitpos) != 0);
2964 if (e1len <= e2len)
2965 buf2 |= ((*e2 & bitpos) != 0);
2966 bitpos >>= 1;
2967 if (!bitpos) {
2968 BIGLITTLE(e1++,e1--);
2969 if (e1len <= e2len)
2970 BIGLITTLE(e2++,e2--);
2971 bitpos = (BNWORD16)1 << (16-1);
2972 e1len--;
2973 }
2974 }
2975
2976 /* Examine the first window for pending multiplies */
2977 if (buf1 & tblmask) {
2978 mult1pos = e1bits - w1bits;
2979 while ((buf1 & 1) == 0) {
2980 buf1 >>= 1;
2981 mult1pos++;
2982 }
2983 /* Intermediates can wrap, but final must NOT */
2984 assert(mult1pos <= e1bits);
2985 mult1 = table1[buf1>>1];
2986 buf1 = 0;
2987 }
2988
2989 /*
2990 * Examine the second window for pending multiplies.
2991 * Window 2 can be smaller than window 1, but we
2992 * keep the same number of bits in buf2, so we need
2993 * to ignore any low-order bits in the buffer when
2994 * computing what to multiply by, and recompute them
2995 * later.
2996 */
2997 if (buf2 & tblmask) {
2998 /* Remember low-order bits for later */
2999 i = buf2 & ((1u << w2bits) - 1);
3000 buf2 >>= w2bits;
3001 mult2pos = e1bits - w1bits + w2bits;
3002 while ((buf2 & 1) == 0) {
3003 buf2 >>= 1;
3004 mult2pos++;
3005 }
3006 assert(mult2pos <= e1bits);
3007 mult2 = table2[buf2>>1];
3008 buf2 = i;
3009 }
3010
3011
3012 /* If we have a pending multiply for e1, do it */
3013 if (e1bits == mult1pos) {
3014 /* Multiply by the table entry remembered previously */
3015 t = BIGLITTLE(b-mlen, b+mlen);
3016 if (isone) {
3017 /* Multiply by 1 is a trivial case */
3018 lbnCopy_16(t, mult1, mlen);
3019 isone = 0;
3020 } else {
3021 lbnMontMul_16(a, t, mult1, mod, mlen, inv);
3022 /* Swap a and b */
3023 t = a; a = b; b = t;
3024 }
3025 }
3026
3027 /* If we have a pending multiply for e2, do it */
3028 if (e1bits == mult2pos) {
3029 /* Multiply by the table entry remembered previously */
3030 t = BIGLITTLE(b-mlen, b+mlen);
3031 if (isone) {
3032 /* Multiply by 1 is a trivial case */
3033 lbnCopy_16(t, mult2, mlen);
3034 isone = 0;
3035 } else {
3036 lbnMontMul_16(a, t, mult2, mod, mlen, inv);
3037 /* Swap a and b */
3038 t = a; a = b; b = t;
3039 }
3040 }
3041
3042 /* Are we done? */
3043 if (!e1bits)
3044 break;
3045
3046 /* Square the buffer */
3047 if (!isone) {
3048 t = BIGLITTLE(b-mlen, b+mlen);
3049 lbnMontSquare_16(a, t, mod, mlen, inv);
3050 /* Swap a and b */
3051 t = a; a = b; b = t;
3052 }
3053 #if BNYIELD
3054 if (bnYield && (y = bnYield()) < 0)
3055 goto yield;
3056 #endif
3057 } /* for (;;) */
3058
3059 assert(!isone);
3060 assert(!buf1);
3061 assert(!buf2);
3062
3063 /* DONE! */
3064
3065 /* Convert result out of Montgomery form */
3066 t = BIGLITTLE(b-mlen, b+mlen);
3067 lbnCopy_16(b, t, mlen);
3068 lbnZero_16(t, mlen);
3069 lbnMontReduce_16(b, mod, mlen, inv);
3070 lbnCopy_16(result, t, mlen);
3071
3072 /* Clean up - free intermediate storage */
3073 y = 0;
3074 #if BNYIELD
3075 yield:
3076 #endif
3077 buf2 = tblmask >> w2bits;
3078 while (--tblmask) {
3079 if (tblmask < buf2)
3080 LBNFREE(table2[tblmask], mlen);
3081 LBNFREE(table1[tblmask], mlen);
3082 }
3083 t = table1[0];
3084 LBNFREE(t, mlen);
3085 LBNFREE(b, 2*mlen);
3086 LBNFREE(a, 2*mlen);
3087
3088 return y; /* Success */
3089 }
3090 #endif
3091
3092 /*
3093 * 2^exp (mod mod). This is an optimized version for use in Fermat
3094 * tests. The input value of n is ignored; it is returned with
3095 * "mlen" words valid.
3096 */
3097 int
lbnTwoExpMod_16(BNWORD16 * n,BNWORD16 const * exp,unsigned elen,BNWORD16 * mod,unsigned mlen)3098 lbnTwoExpMod_16(BNWORD16 *n, BNWORD16 const *exp, unsigned elen,
3099 BNWORD16 *mod, unsigned mlen)
3100 {
3101 unsigned e; /* Copy of high words of the exponent */
3102 unsigned bits; /* Assorted counter of bits */
3103 BNWORD16 const *bitptr;
3104 BNWORD16 bitword, bitpos;
3105 BNWORD16 *a, *b, *a1;
3106 BNWORD16 inv;
3107 int y; /* Result of bnYield() */
3108
3109 assert(mlen);
3110
3111 bitptr = BIGLITTLE(exp-elen, exp+elen-1);
3112 bitword = *bitptr;
3113 assert(bitword);
3114
3115 /* Clear n for future use. */
3116 lbnZero_16(n, mlen);
3117
3118 bits = lbnBits_16(exp, elen);
3119
3120 /* First, a couple of trivial cases. */
3121 if (bits <= 1) {
3122 /* 2 ^ 0 == 1, 2 ^ 1 == 2 */
3123 BIGLITTLE(n[-1],n[0]) = (BNWORD16)1<<elen;
3124 return 0;
3125 }
3126
3127 /* Set bitpos to the most significant bit */
3128 bitpos = (BNWORD16)1 << ((bits-1) & (16-1));
3129
3130 /* Now, count the bits in the modulus. */
3131 bits = lbnBits_16(mod, mlen);
3132 assert(bits > 1); /* a 1-bit modulus is just stupid... */
3133
3134 /*
3135 * We start with 1<<e, where "e" is as many high bits of the
3136 * exponent as we can manage without going over the modulus.
3137 * This first loop finds "e".
3138 */
3139 e = 1;
3140 while (elen) {
3141 /* Consume the first bit */
3142 bitpos >>= 1;
3143 if (!bitpos) {
3144 if (!--elen)
3145 break;
3146 bitword = BIGLITTLE(*++bitptr,*--bitptr);
3147 bitpos = (BNWORD16)1<<(16-1);
3148 }
3149 e = (e << 1) | ((bitpos & bitword) != 0);
3150 if (e >= bits) { /* Overflow! Back out. */
3151 e >>= 1;
3152 break;
3153 }
3154 }
3155 /*
3156 * The bit in "bitpos" being examined by the bit buffer has NOT
3157 * been consumed yet. This may be past the end of the exponent,
3158 * in which case elen == 1.
3159 */
3160
3161 /* Okay, now, set bit "e" in n. n is already zero. */
3162 inv = (BNWORD16)1 << (e & (16-1));
3163 e /= 16;
3164 BIGLITTLE(n[-e-1],n[e]) = inv;
3165 /*
3166 * The effective length of n in words is now "e+1".
3167 * This is used a little bit later.
3168 */
3169
3170 if (!elen)
3171 return 0; /* That was easy! */
3172
3173 /*
3174 * We have now processed the first few bits. The next step
3175 * is to convert this to Montgomery form for further squaring.
3176 */
3177
3178 /* Allocate working storage: two product buffers */
3179 LBNALLOC(a, BNWORD16, 2*mlen);
3180 if (!a)
3181 return -1;
3182 LBNALLOC(b, BNWORD16, 2*mlen);
3183 if (!b) {
3184 LBNFREE(a, 2*mlen);
3185 return -1;
3186 }
3187
3188 /* Convert n to Montgomery form */
3189 inv = BIGLITTLE(mod[-1],mod[0]); /* LSW of modulus */
3190 assert(inv & 1); /* Modulus must be odd */
3191 inv = lbnMontInv1_16(inv);
3192 /* Move n (length e+1, remember?) up "mlen" words into b */
3193 /* Note that we lie about a1 for a bit - it's pointing to b */
3194 a1 = BIGLITTLE(b-mlen,b+mlen);
3195 lbnCopy_16(a1, n, e+1);
3196 lbnZero_16(b, mlen);
3197 /* Do the division - dump the quotient into the high-order words */
3198 (void)lbnDiv_16(a1, b, mlen+e+1, mod, mlen);
3199 /*
3200 * Now do the first squaring and modular reduction to put
3201 * the number up in a1 where it belongs.
3202 */
3203 lbnMontSquare_16(a, b, mod, mlen, inv);
3204 /* Fix up a1 to point to where it should go. */
3205 a1 = BIGLITTLE(a-mlen,a+mlen);
3206
3207 /*
3208 * Okay, now, a1 holds the number being accumulated, and
3209 * b is a scratch register. Start working:
3210 */
3211 for (;;) {
3212 /*
3213 * Is the bit set? If so, double a1 as well.
3214 * A modular doubling like this is very cheap.
3215 */
3216 if (bitpos & bitword) {
3217 /*
3218 * Double the number. If there was a carry out OR
3219 * the result is greater than the modulus, subract
3220 * the modulus.
3221 */
3222 if (lbnDouble_16(a1, mlen) ||
3223 lbnCmp_16(a1, mod, mlen) > 0)
3224 (void)lbnSubN_16(a1, mod, mlen);
3225 }
3226
3227 /* Advance to the next exponent bit */
3228 bitpos >>= 1;
3229 if (!bitpos) {
3230 if (!--elen)
3231 break; /* Done! */
3232 bitword = BIGLITTLE(*++bitptr,*--bitptr);
3233 bitpos = (BNWORD16)1<<(16-1);
3234 }
3235
3236 /*
3237 * The elen/bitword/bitpos bit buffer is known to be
3238 * non-empty, i.e. there is at least one more unconsumed bit.
3239 * Thus, it's safe to square the number.
3240 */
3241 lbnMontSquare_16(b, a1, mod, mlen, inv);
3242 /* Rename result (in b) back to a (a1, really). */
3243 a1 = b; b = a; a = a1;
3244 a1 = BIGLITTLE(a-mlen,a+mlen);
3245 #if BNYIELD
3246 if (bnYield && (y = bnYield()) < 0)
3247 goto yield;
3248 #endif
3249 }
3250
3251 /* DONE! Just a little bit of cleanup... */
3252
3253 /*
3254 * Convert result out of Montgomery form... this is
3255 * just a Montgomery reduction.
3256 */
3257 lbnCopy_16(a, a1, mlen);
3258 lbnZero_16(a1, mlen);
3259 lbnMontReduce_16(a, mod, mlen, inv);
3260 lbnCopy_16(n, a1, mlen);
3261
3262 /* Clean up - free intermediate storage */
3263 y = 0;
3264 #if BNYIELD
3265 yield:
3266 #endif
3267 LBNFREE(b, 2*mlen);
3268 LBNFREE(a, 2*mlen);
3269
3270 return y; /* Success */
3271 }
3272
3273
3274 /*
3275 * Returns a substring of the big-endian array of bytes representation
3276 * of the bignum array based on two parameters, the least significant
3277 * byte number (0 to start with the least significant byte) and the
3278 * length. I.e. the number returned is a representation of
3279 * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen).
3280 *
3281 * It is an error if the bignum is not at least buflen + lsbyte bytes
3282 * long.
3283 *
3284 * This code assumes that the compiler has the minimal intelligence
3285 * neded to optimize divides and modulo operations on an unsigned data
3286 * type with a power of two.
3287 */
3288 void
lbnExtractBigBytes_16(BNWORD16 const * n,unsigned char * buf,unsigned lsbyte,unsigned buflen)3289 lbnExtractBigBytes_16(BNWORD16 const *n, unsigned char *buf,
3290 unsigned lsbyte, unsigned buflen)
3291 {
3292 BNWORD16 t = 0; /* Needed to shut up uninitialized var warnings */
3293 unsigned shift;
3294
3295 lsbyte += buflen;
3296
3297 shift = (8 * lsbyte) % 16;
3298 lsbyte /= (16/8); /* Convert to word offset */
3299 BIGLITTLE(n -= lsbyte, n += lsbyte);
3300
3301 if (shift)
3302 t = BIGLITTLE(n[-1],n[0]);
3303
3304 while (buflen--) {
3305 if (!shift) {
3306 t = BIGLITTLE(*n++,*--n);
3307 shift = 16;
3308 }
3309 shift -= 8;
3310 *buf++ = (unsigned char)(t>>shift);
3311 }
3312 }
3313
3314 /*
3315 * Merge a big-endian array of bytes into a bignum array.
3316 * The array had better be big enough. This is
3317 * equivalent to extracting the entire bignum into a
3318 * large byte array, copying the input buffer into the
3319 * middle of it, and converting back to a bignum.
3320 *
3321 * The buf is "len" bytes long, and its *last* byte is at
3322 * position "lsbyte" from the end of the bignum.
3323 *
3324 * Note that this is a pain to get right. Fortunately, it's hardly
3325 * critical for efficiency.
3326 */
3327 void
lbnInsertBigBytes_16(BNWORD16 * n,unsigned char const * buf,unsigned lsbyte,unsigned buflen)3328 lbnInsertBigBytes_16(BNWORD16 *n, unsigned char const *buf,
3329 unsigned lsbyte, unsigned buflen)
3330 {
3331 BNWORD16 t = 0; /* Shut up uninitialized varibale warnings */
3332
3333 lsbyte += buflen;
3334
3335 BIGLITTLE(n -= lsbyte/(16/8), n += lsbyte/(16/8));
3336
3337 /* Load up leading odd bytes */
3338 if (lsbyte % (16/8)) {
3339 t = BIGLITTLE(*--n,*n++);
3340 t >>= (lsbyte * 8) % 16;
3341 }
3342
3343 /* The main loop - merge into t, storing at each word boundary. */
3344 while (buflen--) {
3345 t = (t << 8) | *buf++;
3346 if ((--lsbyte % (16/8)) == 0)
3347 BIGLITTLE(*n++,*--n) = t;
3348 }
3349
3350 /* Merge odd bytes in t into last word */
3351 lsbyte = (lsbyte * 8) % 16;
3352 if (lsbyte) {
3353 t <<= lsbyte;
3354 t |= (((BNWORD16)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
3355 BIGLITTLE(n[0],n[-1]) = t;
3356 }
3357
3358 return;
3359 }
3360
3361 /*
3362 * Returns a substring of the little-endian array of bytes representation
3363 * of the bignum array based on two parameters, the least significant
3364 * byte number (0 to start with the least significant byte) and the
3365 * length. I.e. the number returned is a representation of
3366 * (bn / 2^(8*lsbyte)) % 2 ^ (8*buflen).
3367 *
3368 * It is an error if the bignum is not at least buflen + lsbyte bytes
3369 * long.
3370 *
3371 * This code assumes that the compiler has the minimal intelligence
3372 * neded to optimize divides and modulo operations on an unsigned data
3373 * type with a power of two.
3374 */
3375 void
lbnExtractLittleBytes_16(BNWORD16 const * n,unsigned char * buf,unsigned lsbyte,unsigned buflen)3376 lbnExtractLittleBytes_16(BNWORD16 const *n, unsigned char *buf,
3377 unsigned lsbyte, unsigned buflen)
3378 {
3379 BNWORD16 t = 0; /* Needed to shut up uninitialized var warnings */
3380
3381 BIGLITTLE(n -= lsbyte/(16/8), n += lsbyte/(16/8));
3382
3383 if (lsbyte % (16/8)) {
3384 t = BIGLITTLE(*--n,*n++);
3385 t >>= (lsbyte % (16/8)) * 8 ;
3386 }
3387
3388 while (buflen--) {
3389 if ((lsbyte++ % (16/8)) == 0)
3390 t = BIGLITTLE(*--n,*n++);
3391 *buf++ = (unsigned char)t;
3392 t >>= 8;
3393 }
3394 }
3395
3396 /*
3397 * Merge a little-endian array of bytes into a bignum array.
3398 * The array had better be big enough. This is
3399 * equivalent to extracting the entire bignum into a
3400 * large byte array, copying the input buffer into the
3401 * middle of it, and converting back to a bignum.
3402 *
3403 * The buf is "len" bytes long, and its first byte is at
3404 * position "lsbyte" from the end of the bignum.
3405 *
3406 * Note that this is a pain to get right. Fortunately, it's hardly
3407 * critical for efficiency.
3408 */
3409 void
lbnInsertLittleBytes_16(BNWORD16 * n,unsigned char const * buf,unsigned lsbyte,unsigned buflen)3410 lbnInsertLittleBytes_16(BNWORD16 *n, unsigned char const *buf,
3411 unsigned lsbyte, unsigned buflen)
3412 {
3413 BNWORD16 t = 0; /* Shut up uninitialized varibale warnings */
3414
3415 /* Move to most-significant end */
3416 lsbyte += buflen;
3417 buf += buflen;
3418
3419 BIGLITTLE(n -= lsbyte/(16/8), n += lsbyte/(16/8));
3420
3421 /* Load up leading odd bytes */
3422 if (lsbyte % (16/8)) {
3423 t = BIGLITTLE(*--n,*n++);
3424 t >>= (lsbyte * 8) % 16;
3425 }
3426
3427 /* The main loop - merge into t, storing at each word boundary. */
3428 while (buflen--) {
3429 t = (t << 8) | *--buf;
3430 if ((--lsbyte % (16/8)) == 0)
3431 BIGLITTLE(*n++,*--n) = t;
3432 }
3433
3434 /* Merge odd bytes in t into last word */
3435 lsbyte = (lsbyte * 8) % 16;
3436 if (lsbyte) {
3437 t <<= lsbyte;
3438 t |= (((BNWORD16)1 << lsbyte) - 1) & BIGLITTLE(n[0],n[-1]);
3439 BIGLITTLE(n[0],n[-1]) = t;
3440 }
3441
3442 return;
3443 }
3444
3445 #ifdef DEADCODE /* This was a precursor to the more flexible lbnExtractBytes */
3446 /*
3447 * Convert a big-endian array of bytes to a bignum.
3448 * Returns the number of words in the bignum.
3449 * Note the expression "16/8" for the number of bytes per word.
3450 * This is so the word-size adjustment will work.
3451 */
3452 unsigned
lbnFromBytes_16(BNWORD16 * a,unsigned char const * b,unsigned blen)3453 lbnFromBytes_16(BNWORD16 *a, unsigned char const *b, unsigned blen)
3454 {
3455 BNWORD16 t;
3456 unsigned alen = (blen + (16/8-1))/(16/8);
3457 BIGLITTLE(a -= alen, a += alen);
3458
3459 while (blen) {
3460 t = 0;
3461 do {
3462 t = t << 8 | *b++;
3463 } while (--blen & (16/8-1));
3464 BIGLITTLE(*a++,*--a) = t;
3465 }
3466 return alen;
3467 }
3468 #endif
3469
3470 #if 0
3471 /*
3472 * Computes the GCD of a and b. Modifies both arguments; when it returns,
3473 * one of them is the GCD and the other is trash. The return value
3474 * indicates which: 0 for a, and 1 for b. The length of the retult is
3475 * returned in rlen. Both inputs must have one extra word of precision.
3476 * alen must be >= blen.
3477 *
3478 * TODO: use the binary algorithm (Knuth section 4.5.2, algorithm B).
3479 * This is based on taking out common powers of 2, then repeatedly:
3480 * gcd(2*u,v) = gcd(u,2*v) = gcd(u,v) - isolated powers of 2 can be deleted.
3481 * gcd(u,v) = gcd(u-v,v) - the numbers can be easily reduced.
3482 * It gets less reduction per step, but the steps are much faster than
3483 * the division case.
3484 */
3485 int
3486 lbnGcd_16(BNWORD16 *a, unsigned alen, BNWORD16 *b, unsigned blen,
3487 unsigned *rlen)
3488 {
3489 #if BNYIELD
3490 int y;
3491 #endif
3492 assert(alen >= blen);
3493
3494 while (blen != 0) {
3495 (void)lbnDiv_16(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
3496 alen = lbnNorm_16(a, blen);
3497 if (alen == 0) {
3498 *rlen = blen;
3499 return 1;
3500 }
3501 (void)lbnDiv_16(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
3502 blen = lbnNorm_16(b, alen);
3503 #if BNYIELD
3504 if (bnYield && (y = bnYield()) < 0)
3505 return y;
3506 #endif
3507 }
3508 *rlen = alen;
3509 return 0;
3510 }
3511
3512 /*
3513 * Invert "a" modulo "mod" using the extended Euclidean algorithm.
3514 * Note that this only computes one of the cosequences, and uses the
3515 * theorem that the signs flip every step and the absolute value of
3516 * the cosequence values are always bounded by the modulus to avoid
3517 * having to work with negative numbers.
3518 * gcd(a,mod) had better equal 1. Returns 1 if the GCD is NOT 1.
3519 * a must be one word longer than "mod". It is overwritten with the
3520 * result.
3521 * TODO: Use Richard Schroeppel's *much* faster algorithm.
3522 */
3523 int
3524 lbnInv_16(BNWORD16 *a, unsigned alen, BNWORD16 const *mod, unsigned mlen)
3525 {
3526 BNWORD16 *b; /* Hold a copy of mod during GCD reduction */
3527 BNWORD16 *p; /* Temporary for products added to t0 and t1 */
3528 BNWORD16 *t0, *t1; /* Inverse accumulators */
3529 BNWORD16 cy;
3530 unsigned blen, t0len, t1len, plen;
3531 int y;
3532
3533 alen = lbnNorm_16(a, alen);
3534 if (!alen)
3535 return 1; /* No inverse */
3536
3537 mlen = lbnNorm_16(mod, mlen);
3538
3539 assert (alen <= mlen);
3540
3541 /* Inverse of 1 is 1 */
3542 if (alen == 1 && BIGLITTLE(a[-1],a[0]) == 1) {
3543 lbnZero_16(BIGLITTLE(a-alen,a+alen), mlen-alen);
3544 return 0;
3545 }
3546
3547 /* Allocate a pile of space */
3548 LBNALLOC(b, BNWORD16, mlen+1);
3549 if (b) {
3550 /*
3551 * Although products are guaranteed to always be less than the
3552 * modulus, it can involve multiplying two 3-word numbers to
3553 * get a 5-word result, requiring a 6th word to store a 0
3554 * temporarily. Thus, mlen + 1.
3555 */
3556 LBNALLOC(p, BNWORD16, mlen+1);
3557 if (p) {
3558 LBNALLOC(t0, BNWORD16, mlen);
3559 if (t0) {
3560 LBNALLOC(t1, BNWORD16, mlen);
3561 if (t1)
3562 goto allocated;
3563 LBNFREE(t0, mlen);
3564 }
3565 LBNFREE(p, mlen+1);
3566 }
3567 LBNFREE(b, mlen+1);
3568 }
3569 return -1;
3570
3571 allocated:
3572
3573 /* Set t0 to 1 */
3574 t0len = 1;
3575 BIGLITTLE(t0[-1],t0[0]) = 1;
3576
3577 /* b = mod */
3578 lbnCopy_16(b, mod, mlen);
3579 /* blen = mlen (implicitly) */
3580
3581 /* t1 = b / a; b = b % a */
3582 cy = lbnDiv_16(t1, b, mlen, a, alen);
3583 *(BIGLITTLE(t1-(mlen-alen)-1,t1+(mlen-alen))) = cy;
3584 t1len = lbnNorm_16(t1, mlen-alen+1);
3585 blen = lbnNorm_16(b, alen);
3586
3587 /* while (b > 1) */
3588 while (blen > 1 || BIGLITTLE(b[-1],b[0]) != (BNWORD16)1) {
3589 /* q = a / b; a = a % b; */
3590 if (alen < blen || (alen == blen && lbnCmp_16(a, a, alen) < 0))
3591 assert(0);
3592 cy = lbnDiv_16(BIGLITTLE(a-blen,a+blen), a, alen, b, blen);
3593 *(BIGLITTLE(a-alen-1,a+alen)) = cy;
3594 plen = lbnNorm_16(BIGLITTLE(a-blen,a+blen), alen-blen+1);
3595 assert(plen);
3596 alen = lbnNorm_16(a, blen);
3597 if (!alen)
3598 goto failure; /* GCD not 1 */
3599
3600 /* t0 += q * t1; */
3601 assert(plen+t1len <= mlen+1);
3602 lbnMul_16(p, BIGLITTLE(a-blen,a+blen), plen, t1, t1len);
3603 plen = lbnNorm_16(p, plen + t1len);
3604 assert(plen <= mlen);
3605 if (plen > t0len) {
3606 lbnZero_16(BIGLITTLE(t0-t0len,t0+t0len), plen-t0len);
3607 t0len = plen;
3608 }
3609 cy = lbnAddN_16(t0, p, plen);
3610 if (cy) {
3611 if (t0len > plen) {
3612 cy = lbnAdd1_16(BIGLITTLE(t0-plen,t0+plen),
3613 t0len-plen, cy);
3614 }
3615 if (cy) {
3616 BIGLITTLE(t0[-t0len-1],t0[t0len]) = cy;
3617 t0len++;
3618 }
3619 }
3620
3621 /* if (a <= 1) return a ? t0 : FAIL; */
3622 if (alen <= 1 && BIGLITTLE(a[-1],a[0]) == (BNWORD16)1) {
3623 if (alen == 0)
3624 goto failure; /* FAIL */
3625 assert(t0len <= mlen);
3626 lbnCopy_16(a, t0, t0len);
3627 lbnZero_16(BIGLITTLE(a-t0len, a+t0len), mlen-t0len);
3628 goto success;
3629 }
3630
3631 /* q = b / a; b = b % a; */
3632 if (blen < alen || (blen == alen && lbnCmp_16(b, a, alen) < 0))
3633 assert(0);
3634 cy = lbnDiv_16(BIGLITTLE(b-alen,b+alen), b, blen, a, alen);
3635 *(BIGLITTLE(b-blen-1,b+blen)) = cy;
3636 plen = lbnNorm_16(BIGLITTLE(b-alen,b+alen), blen-alen+1);
3637 assert(plen);
3638 blen = lbnNorm_16(b, alen);
3639 if (!blen)
3640 goto failure; /* GCD not 1 */
3641
3642 /* t1 += q * t0; */
3643 assert(plen+t0len <= mlen+1);
3644 lbnMul_16(p, BIGLITTLE(b-alen,b+alen), plen, t0, t0len);
3645 plen = lbnNorm_16(p, plen + t0len);
3646 assert(plen <= mlen);
3647 if (plen > t1len) {
3648 lbnZero_16(BIGLITTLE(t1-t1len,t1+t1len), plen-t1len);
3649 t1len = plen;
3650 }
3651 cy = lbnAddN_16(t1, p, plen);
3652 if (cy) {
3653 if (t1len > plen) {
3654 cy = lbnAdd1_16(BIGLITTLE(t1-plen,t0+plen),
3655 t1len-plen, cy);
3656 }
3657 if (cy) {
3658 BIGLITTLE(t1[-t1len-1],t1[t1len]) = cy;
3659 t1len++;
3660 }
3661 }
3662 #if BNYIELD
3663 if (bnYield && (y = bnYield() < 0))
3664 goto yield;
3665 #endif
3666 }
3667
3668 if (!blen)
3669 goto failure; /* gcd(a, mod) != 1 -- FAIL */
3670
3671 /* return mod-t1 */
3672 lbnCopy_16(a, mod, mlen);
3673 assert(t1len <= mlen);
3674 cy = lbnSubN_16(a, t1, t1len);
3675 if (cy) {
3676 assert(mlen > t1len);
3677 cy = lbnSub1_16(BIGLITTLE(a-t1len, a+t1len), mlen-t1len, cy);
3678 assert(!cy);
3679 }
3680
3681 success:
3682 LBNFREE(t1, mlen);
3683 LBNFREE(t0, mlen);
3684 LBNFREE(p, mlen+1);
3685 LBNFREE(b, mlen+1);
3686
3687 return 0;
3688
3689 failure: /* GCD is not 1 - no inverse exists! */
3690 y = 1;
3691 #if BNYIELD
3692 yield:
3693 #endif
3694 LBNFREE(t1, mlen);
3695 LBNFREE(t0, mlen);
3696 LBNFREE(p, mlen+1);
3697 LBNFREE(b, mlen+1);
3698
3699 return y;
3700 }
3701
3702 /*
3703 * Precompute powers of "a" mod "mod". Compute them every "bits"
3704 * for "n" steps. This is sufficient to compute powers of g with
3705 * exponents up to n*bits bits long, i.e. less than 2^(n*bits).
3706 *
3707 * This assumes that the caller has already initialized "array" to point
3708 * to "n" buffers of size "mlen".
3709 */
3710 int
3711 lbnBasePrecompBegin_16(BNWORD16 **array, unsigned n, unsigned bits,
3712 BNWORD16 const *g, unsigned glen, BNWORD16 *mod, unsigned mlen)
3713 {
3714 BNWORD16 *a, *b; /* Temporary double-width accumulators */
3715 BNWORD16 *a1; /* Pointer to high half of a*/
3716 BNWORD16 inv; /* Montgomery inverse of LSW of mod */
3717 BNWORD16 *t;
3718 unsigned i;
3719
3720 glen = lbnNorm_16(g, glen);
3721 assert(glen);
3722
3723 assert (mlen == lbnNorm_16(mod, mlen));
3724 assert (glen <= mlen);
3725
3726 /* Allocate two temporary buffers, and the array slots */
3727 LBNALLOC(a, BNWORD16, mlen*2);
3728 if (!a)
3729 return -1;
3730 LBNALLOC(b, BNWORD16, mlen*2);
3731 if (!b) {
3732 LBNFREE(a, 2*mlen);
3733 return -1;
3734 }
3735
3736 /* Okay, all ready */
3737
3738 /* Convert n to Montgomery form */
3739 inv = BIGLITTLE(mod[-1],mod[0]); /* LSW of modulus */
3740 assert(inv & 1); /* Modulus must be odd */
3741 inv = lbnMontInv1_16(inv);
3742 /* Move g up "mlen" words into a (clearing the low mlen words) */
3743 a1 = BIGLITTLE(a-mlen,a+mlen);
3744 lbnCopy_16(a1, g, glen);
3745 lbnZero_16(a, mlen);
3746
3747 /* Do the division - dump the quotient into the high-order words */
3748 (void)lbnDiv_16(a1, a, mlen+glen, mod, mlen);
3749
3750 /* Copy the first value into the array */
3751 t = *array;
3752 lbnCopy_16(t, a, mlen);
3753 a1 = a; /* This first value is *not* shifted up */
3754
3755 /* Now compute the remaining n-1 array entries */
3756 assert(bits);
3757 assert(n);
3758 while (--n) {
3759 i = bits;
3760 do {
3761 /* Square a1 into b1 */
3762 lbnMontSquare_16(b, a1, mod, mlen, inv);
3763 t = b; b = a; a = t;
3764 a1 = BIGLITTLE(a-mlen, a+mlen);
3765 } while (--i);
3766 t = *++array;
3767 lbnCopy_16(t, a1, mlen);
3768 }
3769
3770 /* Hooray, we're done. */
3771 LBNFREE(b, 2*mlen);
3772 LBNFREE(a, 2*mlen);
3773 return 0;
3774 }
3775
3776 /*
3777 * result = base^exp (mod mod). "array" is a an array of pointers
3778 * to procomputed powers of base, each 2^bits apart. (I.e. array[i]
3779 * is base^(2^(i*bits))).
3780 *
3781 * The algorithm consists of:
3782 * a = b = (powers of g to be raised to the power 2^bits-1)
3783 * a *= b *= (powers of g to be raised to the power 2^bits-2)
3784 * ...
3785 * a *= b *= (powers of g to be raised to the power 1)
3786 *
3787 * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
3788 */
3789 int
3790 lbnBasePrecompExp_16(BNWORD16 *result, BNWORD16 const * const *array,
3791 unsigned bits, BNWORD16 const *exp, unsigned elen,
3792 BNWORD16 const *mod, unsigned mlen)
3793 {
3794 BNWORD16 *a, *b, *c, *t;
3795 BNWORD16 *a1, *b1;
3796 int anull, bnull; /* Null flags: values are implicitly 1 */
3797 unsigned i, j; /* Loop counters */
3798 unsigned mask; /* Exponent bits to examime */
3799 BNWORD16 const *eptr; /* Pointer into exp */
3800 BNWORD16 buf, curbits, nextword; /* Bit-buffer varaibles */
3801 BNWORD16 inv; /* Inverse of LSW of modulus */
3802 unsigned ewords; /* Words of exponent left */
3803 int bufbits; /* Number of valid bits */
3804 int y = 0;
3805
3806 mlen = lbnNorm_16(mod, mlen);
3807 assert (mlen);
3808
3809 elen = lbnNorm_16(exp, elen);
3810 if (!elen) {
3811 lbnZero_16(result, mlen);
3812 BIGLITTLE(result[-1],result[0]) = 1;
3813 return 0;
3814 }
3815 /*
3816 * This could be precomputed, but it's so cheap, and it would require
3817 * making the precomputation structure word-size dependent.
3818 */
3819 inv = lbnMontInv1_16(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
3820
3821 assert(elen);
3822
3823 /*
3824 * Allocate three temporary buffers. The current numbers generally
3825 * live in the upper halves of these buffers.
3826 */
3827 LBNALLOC(a, BNWORD16, mlen*2);
3828 if (a) {
3829 LBNALLOC(b, BNWORD16, mlen*2);
3830 if (b) {
3831 LBNALLOC(c, BNWORD16, mlen*2);
3832 if (c)
3833 goto allocated;
3834 LBNFREE(b, 2*mlen);
3835 }
3836 LBNFREE(a, 2*mlen);
3837 }
3838 return -1;
3839
3840 allocated:
3841
3842 anull = bnull = 1;
3843
3844 mask = (1u<<bits) - 1;
3845 for (i = mask; i; --i) {
3846 /* Set up bit buffer for walking the exponent */
3847 eptr = exp;
3848 buf = BIGLITTLE(*--eptr, *eptr++);
3849 ewords = elen-1;
3850 bufbits = 16;
3851 for (j = 0; ewords || buf; j++) {
3852 /* Shift down current buffer */
3853 curbits = buf;
3854 buf >>= bits;
3855 /* If necessary, add next word */
3856 bufbits -= bits;
3857 if (bufbits < 0 && ewords > 0) {
3858 nextword = BIGLITTLE(*--eptr, *eptr++);
3859 ewords--;
3860 curbits |= nextword << (bufbits+bits);
3861 buf = nextword >> -bufbits;
3862 bufbits += 16;
3863 }
3864 /* If appropriate, multiply b *= array[j] */
3865 if ((curbits & mask) == i) {
3866 BNWORD16 const *d = array[j];
3867
3868 b1 = BIGLITTLE(b-mlen-1,b+mlen);
3869 if (bnull) {
3870 lbnCopy_16(b1, d, mlen);
3871 bnull = 0;
3872 } else {
3873 lbnMontMul_16(c, b1, d, mod, mlen, inv);
3874 t = c; c = b; b = t;
3875 }
3876 #if BNYIELD
3877 if (bnYield && (y = bnYield() < 0))
3878 goto yield;
3879 #endif
3880 }
3881 }
3882
3883 /* Multiply a *= b */
3884 if (!bnull) {
3885 a1 = BIGLITTLE(a-mlen-1,a+mlen);
3886 b1 = BIGLITTLE(b-mlen-1,b+mlen);
3887 if (anull) {
3888 lbnCopy_16(a1, b1, mlen);
3889 anull = 0;
3890 } else {
3891 lbnMontMul_16(c, a1, b1, mod, mlen, inv);
3892 t = c; c = a; a = t;
3893 }
3894 }
3895 }
3896
3897 assert(!anull); /* If it were, elen would have been 0 */
3898
3899 /* Convert out of Montgomery form and return */
3900 a1 = BIGLITTLE(a-mlen-1,a+mlen);
3901 lbnCopy_16(a, a1, mlen);
3902 lbnZero_16(a1, mlen);
3903 lbnMontReduce_16(a, mod, mlen, inv);
3904 lbnCopy_16(result, a1, mlen);
3905
3906 #if BNYIELD
3907 yield:
3908 #endif
3909 LBNFREE(c, 2*mlen);
3910 LBNFREE(b, 2*mlen);
3911 LBNFREE(a, 2*mlen);
3912
3913 return y;
3914 }
3915
3916 /*
3917 * result = base1^exp1 *base2^exp2 (mod mod). "array1" and "array2" are
3918 * arrays of pointers to procomputed powers of the corresponding bases,
3919 * each 2^bits apart. (I.e. array1[i] is base1^(2^(i*bits))).
3920 *
3921 * Bits must be the same in both. (It could be made adjustable, but it's
3922 * a bit of a pain. Just make them both equal to the larger one.)
3923 *
3924 * The algorithm consists of:
3925 * a = b = (powers of base1 and base2 to be raised to the power 2^bits-1)
3926 * a *= b *= (powers of base1 and base2 to be raised to the power 2^bits-2)
3927 * ...
3928 * a *= b *= (powers of base1 and base2 to be raised to the power 1)
3929 *
3930 * All we do is walk the exponent 2^bits-1 times in groups of "bits" bits,
3931 */
3932 int
3933 lbnDoubleBasePrecompExp_16(BNWORD16 *result, unsigned bits,
3934 BNWORD16 const * const *array1, BNWORD16 const *exp1, unsigned elen1,
3935 BNWORD16 const * const *array2, BNWORD16 const *exp2,
3936 unsigned elen2, BNWORD16 const *mod, unsigned mlen)
3937 {
3938 BNWORD16 *a, *b, *c, *t;
3939 BNWORD16 *a1, *b1;
3940 int anull, bnull; /* Null flags: values are implicitly 1 */
3941 unsigned i, j, k; /* Loop counters */
3942 unsigned mask; /* Exponent bits to examime */
3943 BNWORD16 const *eptr; /* Pointer into exp */
3944 BNWORD16 buf, curbits, nextword; /* Bit-buffer varaibles */
3945 BNWORD16 inv; /* Inverse of LSW of modulus */
3946 unsigned ewords; /* Words of exponent left */
3947 int bufbits; /* Number of valid bits */
3948 int y = 0;
3949 BNWORD16 const * const *array;
3950
3951 mlen = lbnNorm_16(mod, mlen);
3952 assert (mlen);
3953
3954 elen1 = lbnNorm_16(exp1, elen1);
3955 if (!elen1) {
3956 return lbnBasePrecompExp_16(result, array2, bits, exp2, elen2,
3957 mod, mlen);
3958 }
3959 elen2 = lbnNorm_16(exp2, elen2);
3960 if (!elen2) {
3961 return lbnBasePrecompExp_16(result, array1, bits, exp1, elen1,
3962 mod, mlen);
3963 }
3964 /*
3965 * This could be precomputed, but it's so cheap, and it would require
3966 * making the precomputation structure word-size dependent.
3967 */
3968 inv = lbnMontInv1_16(mod[BIGLITTLE(-1,0)]); /* LSW of modulus */
3969
3970 assert(elen1);
3971 assert(elen2);
3972
3973 /*
3974 * Allocate three temporary buffers. The current numbers generally
3975 * live in the upper halves of these buffers.
3976 */
3977 LBNALLOC(a, BNWORD16, mlen*2);
3978 if (a) {
3979 LBNALLOC(b, BNWORD16, mlen*2);
3980 if (b) {
3981 LBNALLOC(c, BNWORD16, mlen*2);
3982 if (c)
3983 goto allocated;
3984 LBNFREE(b, 2*mlen);
3985 }
3986 LBNFREE(a, 2*mlen);
3987 }
3988 return -1;
3989
3990 allocated:
3991
3992 anull = bnull = 1;
3993
3994 mask = (1u<<bits) - 1;
3995 for (i = mask; i; --i) {
3996 /* Walk each exponent in turn */
3997 for (k = 0; k < 2; k++) {
3998 /* Set up the exponent for walking */
3999 array = k ? array2 : array1;
4000 eptr = k ? exp2 : exp1;
4001 ewords = (k ? elen2 : elen1) - 1;
4002 /* Set up bit buffer for walking the exponent */
4003 buf = BIGLITTLE(*--eptr, *eptr++);
4004 bufbits = 16;
4005 for (j = 0; ewords || buf; j++) {
4006 /* Shift down current buffer */
4007 curbits = buf;
4008 buf >>= bits;
4009 /* If necessary, add next word */
4010 bufbits -= bits;
4011 if (bufbits < 0 && ewords > 0) {
4012 nextword = BIGLITTLE(*--eptr, *eptr++);
4013 ewords--;
4014 curbits |= nextword << (bufbits+bits);
4015 buf = nextword >> -bufbits;
4016 bufbits += 16;
4017 }
4018 /* If appropriate, multiply b *= array[j] */
4019 if ((curbits & mask) == i) {
4020 BNWORD16 const *d = array[j];
4021
4022 b1 = BIGLITTLE(b-mlen-1,b+mlen);
4023 if (bnull) {
4024 lbnCopy_16(b1, d, mlen);
4025 bnull = 0;
4026 } else {
4027 lbnMontMul_16(c, b1, d, mod, mlen, inv);
4028 t = c; c = b; b = t;
4029 }
4030 #if BNYIELD
4031 if (bnYield && (y = bnYield() < 0))
4032 goto yield;
4033 #endif
4034 }
4035 }
4036 }
4037
4038 /* Multiply a *= b */
4039 if (!bnull) {
4040 a1 = BIGLITTLE(a-mlen-1,a+mlen);
4041 b1 = BIGLITTLE(b-mlen-1,b+mlen);
4042 if (anull) {
4043 lbnCopy_16(a1, b1, mlen);
4044 anull = 0;
4045 } else {
4046 lbnMontMul_16(c, a1, b1, mod, mlen, inv);
4047 t = c; c = a; a = t;
4048 }
4049 }
4050 }
4051
4052 assert(!anull); /* If it were, elen would have been 0 */
4053
4054 /* Convert out of Montgomery form and return */
4055 a1 = BIGLITTLE(a-mlen-1,a+mlen);
4056 lbnCopy_16(a, a1, mlen);
4057 lbnZero_16(a1, mlen);
4058 lbnMontReduce_16(a, mod, mlen, inv);
4059 lbnCopy_16(result, a1, mlen);
4060
4061 #if BNYIELD
4062 yield:
4063 #endif
4064 LBNFREE(c, 2*mlen);
4065 LBNFREE(b, 2*mlen);
4066 LBNFREE(a, 2*mlen);
4067
4068 return y;
4069 }
4070 #endif
4071