1/*
2 * alg_config - help determine optimal values for algorithm levels
3 *
4 * Copyright (C) 2006,2014,2021  Landon Curt Noll
5 *
6 * Calc is open software; you can redistribute it and/or modify it under
7 * the terms of the version 2.1 of the GNU Lesser General Public License
8 * as published by the Free Software Foundation.
9 *
10 * Calc is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
12 * or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU Lesser General
13 * Public License for more details.
14 *
15 * A copy of version 2.1 of the GNU Lesser General Public License is
16 * distributed with calc under the filename COPYING-LGPL.  You should have
17 * received a copy with calc; if not, write to Free Software Foundation, Inc.
18 * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
19 *
20 * Under source code control:	2006/06/07 14:10:11
21 * File existed as early as:	2006
22 *
23 * chongo <was here> /\oo/\	http://www.isthe.com/chongo/
24 * Share and enjoy!  :-)	http://www.isthe.com/chongo/tech/comp/calc/
25 */
26
27static test_time;	/* try for this many seconds in loop test */
28
29
30/*
31 * close_to_one - set to 1 if the ratio is close enough to 1
32 *
33 * given:
34 *	ratio	the ratio of time between two algorithms
35 *
36 * returns:
37 *	1	When ratio is near 1.0
38 *	0	otherwise
39 *
40 * We consider the range [0.995, 1.005] to be close enough to 1 to be call unity
41 * because of the precision of the CPU timing.
42 */
43define close_to_one(ratio)
44{
45    /* firewall */
46    if (!isreal(ratio)) {
47	quit "close: 1st arg: must be a real number";
48    }
49
50    /* check if the ratio is far from unity */
51    if ((ratio < 0.995) || (ratio > 1.005)) {
52	return 0;
53    }
54
55    /* we are close to unity */
56    return 1;
57}
58
59
60/*
61 * mul_loop - measure the CPU time to perform a set of multiply loops
62 *
63 * given:
64 *	repeat	number of multiply loops to perform
65 *	x	array of 5 values, each the same length in BASEB-bit words
66 *
67 *		NOTE: When their lengths are 1 BASEB-bit word, then a
68 *		      dummy loop of simple constants are used.  Thus the
69 *		      length == 1 is an approximation of loop overhead.
70 *
71 * returns:
72 *	approximate runtime to perform repeat the multiply loops
73 *
74 * NOTE: This is an internal support function that is normally
75 *	 not called directly from the command line.  Call the
76 *	 function best_mul2() instead.
77 */
78define mul_loop(repeat, x)
79{
80    local start;	/* start of execution */
81    local end;		/* end of execution */
82    local answer;	/* multiplicand */
83    local len;		/* length of each element */
84    local baseb_bytes;	/* bytes in a BASEB-bit word */
85    local i;
86
87    /* firewall */
88    if (!isint(repeat) || repeat < 0) {
89	quit "mul_loop: 1st arg: repeat must be an integer > 0";
90    }
91    if (size(*x) != 5) {
92	quit "mul_loop: 2nd arg matrix does not have 5 elements";
93    }
94    if (matdim(*x) != 1) {
95	quit "mul_loop: 2nd arg matrix is not 1 dimensional";
96    }
97    if (matmin(*x, 1) != 0) {
98	quit "mul_loop: 2nd arg matrix index range does not start with 0";
99    }
100    if (matmax(*x, 1) != 4) {
101	quit "mul_loop: 2nd arg matrix index range does not end with 4";
102    }
103
104    baseb_bytes = config("baseb") / 8;
105    len = sizeof((*x)[0]) / baseb_bytes;
106    for (i=1; i < 4; ++i) {
107	if ((sizeof((*x)[i]) / baseb_bytes) != len) {
108	    quit "mul_loop: 2nd arg matrix elements are not of "
109	         "equal BASEB-bit word length";
110	}
111    }
112
113    /* multiply pairwise, all sets of a given length */
114    start = usertime();
115    for (i=0; i < repeat; ++i) {
116
117	if (len == 1) {
118	    /* we use len == 1 to test this tester loop overhead */
119	    answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
120	    /**/
121	    answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
122	    /**/
123	    answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
124	    /**/
125	    answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
126	    /**/
127	    answer = 0 * 0; answer = 0 * 0; answer = 0 * 0; answer = 0 * 0;
128	} else {
129	    answer = (*x)[0] * (*x)[1];
130	    answer = (*x)[0] * (*x)[2];
131	    answer = (*x)[0] * (*x)[3];
132	    answer = (*x)[0] * (*x)[4];
133	    /**/
134	    answer = (*x)[1] * (*x)[0];
135	    answer = (*x)[1] * (*x)[2];
136	    answer = (*x)[1] * (*x)[3];
137	    answer = (*x)[1] * (*x)[4];
138	    /**/
139	    answer = (*x)[2] * (*x)[0];
140	    answer = (*x)[2] * (*x)[1];
141	    answer = (*x)[2] * (*x)[3];
142	    answer = (*x)[2] * (*x)[4];
143	    /**/
144	    answer = (*x)[3] * (*x)[0];
145	    answer = (*x)[3] * (*x)[1];
146	    answer = (*x)[3] * (*x)[2];
147	    answer = (*x)[3] * (*x)[4];
148	    /**/
149	    answer = (*x)[4] * (*x)[0];
150	    answer = (*x)[4] * (*x)[1];
151	    answer = (*x)[4] * (*x)[2];
152	    answer = (*x)[4] * (*x)[3];
153	}
154    }
155
156    /*
157     * return duration
158     */
159    end = usertime();
160    return end-start;
161}
162
163
164/*
165 * mul_ratio - ratio of rates of 1st and 2nd multiply algorithms
166 *
167 * given:
168 *	len	length in BASEB-bit words to multiply
169 *
170 * return:
171 *	ratio of (1st / 2nd) algorithm rate.
172 *
173 * When want to determine a rate to a precision of 1 part in 1000.
174 * Most systems today return CPU time to at least 10 msec precision.
175 * So to get rates to that precision, we need to time loops to at
176 * least 1000 times as long as the precision (10 msec * 1000)
177 * which usually requires timing of loops that last 10 seconds or more.
178 *
179 * NOTE: This is an internal support function that is normally
180 *	 not called directly from the command line.  Call the
181 *	 function best_mul2() instead.
182 */
183define mul_ratio(len)
184{
185    local mat x[5];		/* array of values for mul_loop to multiply */
186    local mat one[5];		/* array if single BASEB-bit values */
187    local baseb;		/* calc word size in bits */
188    local orig_cfg;		/* caller configuration */
189    local loops;		/* number of multiply loops to time */
190    local tlen;			/* time to perform some number of loops */
191    local tover;		/* est of time for loop overhead */
192    local alg1_rate;		/* loop rate of 1st algorithm */
193    local alg2_rate;		/* loop rate of 2nd algorithm */
194    local ret;			/* return ratio, or 1.0 */
195    local i;
196
197    /*
198     * firewall
199     */
200    if (!isint(len) || len < 2) {
201	quit "mul_ratio: 1st arg: len is not an integer > 1";
202    }
203
204    /*
205     * remember the caller's config state
206     */
207    orig_cfg = config("all");
208    config("mul2", 0),;
209    config("sq2", 0),;
210    config("pow2", 0),;
211    config("redc2", 0),;
212    config("tilde", 0),;
213
214    /*
215     * initialize x, the values we will multiply
216     *
217     * We want these tests to be repeatable as possible, so we will seed
218     * the PRNG in a deterministic way.
219     */
220    baseb = config("baseb");
221    srand(sha1(sha1(baseb, config("version"))));
222    for (i=0; i < 5; ++i) {
223	/* force the values to be a full len words long */
224	x[i] = ((1<<(((len-1) * baseb) + baseb-1)) |
225		    randbit(((len-1) * baseb) + baseb-2));
226	/* single BASEB-bit values */
227        one[i] = 1;
228    }
229
230    /*
231     * determine the number of loops needed to test 1st alg
232     */
233    config("mul2", 2^31-1),;
234    loops = 1/2;
235    do {
236	loops *= 2;
237	tlen = mul_loop(loops, &x);
238	if (config("user_debug") > 3) {
239	    printf("\t    alg1 loops %d took %.3f sec\n", loops, tlen);
240	}
241    } while (tlen < 1.0);
242
243    /*
244     * determine the 1st algorithm rate
245     */
246    loops = max(1, ceil(loops * test_time / tlen));
247    if (loops < 16) {
248	if (config("user_debug") > 1) {
249	    printf("    we must expand alg1 loop test time to about %d secs\n",
250		ceil(test_time * (16 / loops)));
251	}
252	loops = 16;
253    }
254    if (config("user_debug") > 3) {
255	printf("\t    will try alg1 %d loops\n", loops);
256    }
257    tlen = mul_loop(loops, &x);
258    if (config("user_debug") > 3) {
259	printf("\t    alg1 time = %.3f secs\n", tlen);
260    }
261    tover = mul_loop(loops, &one);
262    if (config("user_debug") > 3) {
263	printf("\t    alg1 overhead look %.3f secs\n", tover);
264    }
265    if (tlen <= tover) {
266	quit "mul_ratio: overhead >= loop time";
267    }
268    alg1_rate = loops / (tlen - tover);
269    if (config("user_debug") > 2) {
270	printf("\tmultiply alg1 rate = %.3f loopsets/sec\n", alg1_rate);
271    }
272    if (alg1_rate <= 0.0) {
273	quit "mul_ratio: alg1 rate was <= 0.0";
274    }
275
276    /*
277     * determine the number of loops needed to test 1st alg
278     */
279    config("mul2", 2),;
280    loops = 1/2;
281    do {
282	loops *= 2;
283	tlen = mul_loop(loops, &x);
284	if (config("user_debug") > 3) {
285	    printf("\t    alg2 loops %d took %.3f sec\n", loops, tlen);
286	}
287    } while (tlen < 1.0);
288
289    /*
290     * determine the 2nd algorithm rate
291     */
292    loops = max(1, ceil(loops * test_time / tlen));
293    if (loops < 16) {
294	if (config("user_debug") > 1) {
295	    printf("    we must expand alg2 loop test time to about %d secs\n",
296		ceil(test_time * (16 / loops)));
297	}
298	loops = 16;
299    }
300    tlen = mul_loop(loops, &x);
301    if (config("user_debug") > 3) {
302	printf("\t    alg2 time = %.3f secs\n", tlen);
303    }
304    tover = mul_loop(loops, &one);
305    if (config("user_debug") > 3) {
306	printf("\t    alg2 overhead look %.3f secs\n", tover);
307    }
308    if (tlen <= tover) {
309	quit "mul_ratio: overhead >= loop time";
310    }
311    alg2_rate = loops / (tlen - tover);
312    if (config("user_debug") > 2) {
313	printf("\tmultiply alg2 rate = %.3f loopsets/sec\n", alg2_rate);
314    }
315    if (alg2_rate <= 0.0) {
316	quit "mul_ratio: alg2 rate was <= 0.0";
317    }
318
319    /*
320     * restore old config
321     */
322    config("all", orig_cfg),;
323
324    /*
325     * return alg1 / alg2 rate ratio
326     */
327    ret = alg1_rate / alg2_rate;
328    if (config("user_debug") > 2) {
329	printf("\tprecise ratio is: %.f mul_ratio will return: %.3f\n",
330		alg1_rate / alg2_rate, ret);
331    }
332    return ret;
333}
334
335
336/*
337 * best_mul2 - determine the best config("mul2") parameter
338 *
339 * NOTE: Due to precision problems with CPU measurements, it is not
340 *	 unusual for the output of this function to vary slightly
341 *	 from run to run.
342 *
343 * NOTE: This function is designed to take a long time to run.
344 *	  We recommend setting:
345 *
346 *		config("user_debug", 2)
347 *
348 *	  so that yon can watch the progress of this function.
349 */
350define best_mul2()
351{
352    local ratio;	/* previously calculated alg1/alg2 ratio */
353    local low;		/* low loop value tested */
354    local high;		/* high loop value tested */
355    local mid;		/* between low and high */
356    local best_val;	/* value found with ratio closest to unity */
357    local best_ratio;	/* closest ratio found to unity */
358    local expand;	/* how fast to expand the length */
359
360    /*
361     * setup
362     */
363    printf("WARNING: This tool may not be computing the correct best value\n");
364    test_time = 5.0;
365    printf("The best_mul2() function will take a LONG time to run!\n");
366    printf("It is important that best_mul2() run on an otherwise idle host!\n");
367    if (config("user_debug") <= 0) {
368	printf("To monitor progress, set user_debug to 2: "
369	       "config(\"user_debug\", 2)\n");
370    }
371    printf("Starting with loop test time of %d secs\n", test_time);
372
373    /*
374     * firewall - must have a >1 ratio for the initial length
375     */
376    high = 8;
377    best_val = high;
378    if (config("user_debug") > 0) {
379	printf("testing multiply alg1/alg2 ratio for len = %d\n", high);
380    }
381    ratio = mul_ratio(high);
382    best_ratio = ratio;
383    if (config("user_debug") > 1) {
384	printf("    multiply alg1/alg2 ratio = %.6f\n", ratio);
385    }
386    if (ratio < 1.0) {
387	quit "best_mul2: tests imply mul2 should be < 16, which seems bogus";
388    }
389
390    /*
391     * expand lengths until the ratio flips
392     */
393    do {
394	/*
395	 * determine the parameters of the next ratio test
396	 *
397	 * We will multiplicatively expand our test level until
398	 * the ratio drops below 1.0.
399	 */
400	expand = 2;
401	low = high;
402	high *= expand;
403	if (config("user_debug") > 1) {
404	    printf("    expand the next test range by a factor of %d\n",
405	           expand);
406	}
407
408	/*
409	 * determine the alg1/alg2 test ratio for this new length
410	 */
411	if (high >= 2^31) {
412	    quit "best_mul2: test implies mul2 >= 2^31, which seems bogus";
413	}
414	if (config("user_debug") > 0) {
415	    printf("testing multiply alg1/alg2 ratio for len = %d\n", high);
416	}
417	ratio = mul_ratio(high);
418	if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
419	    best_val = high;
420	    best_ratio = ratio;
421	    if (config("user_debug") > 1) {
422		printf("    len %d has a new closest ratio to unity: %.6f\n",
423		       best_val, best_ratio);
424	    }
425	}
426	if (config("user_debug") > 1) {
427	    printf("    multiply alg1/alg2 ratio = %.6f\n", ratio);
428	}
429    } while (ratio > 1.0);
430
431    /*
432     * If we previously expanded more than by a factor of 2, then
433     * we may have jumped over the crossover point.  So now
434     * drop down powers of two until the ratio is again >= 1.0
435     */
436    if (expand > 2) {
437	do {
438
439	    /*
440	     * contract by 2
441	     */
442	    high /= 2;
443	    low = high / 2;
444	    if (config("user_debug") > 0) {
445		printf("re-testing multiply alg1/alg2 ratio for len = %d\n",
446		       high);
447	    }
448	    ratio = mul_ratio(high);
449	    if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
450		best_val = high;
451		best_ratio = ratio;
452		if (config("user_debug") > 1) {
453		    printf("    len %d has a new closest ratio "
454			   "to unity: %.6f\n",
455			   best_val, best_ratio);
456		}
457	    }
458	    if (config("user_debug") > 1) {
459		printf("    multiply alg1/alg2 ratio = %.6f\n", ratio);
460	    }
461
462	} while (ratio <= 1.0);
463
464	/* now that the ratio flipped again, use the previous range */
465	low = high;
466	high = high * 2;
467    }
468    if (config("user_debug") > 0) {
469	printf("Starting binary search between %d and %d\n", low, high);
470    }
471
472    /*
473     * binary search between low and high, for where ratio is just under 1.0
474     */
475    while (low+1 < high) {
476
477	/* try the mid-point */
478	mid = int((low+high)/2);
479	if (config("user_debug") > 0) {
480	    printf("testing multiply alg1/alg2 ratio for len = %d\n", mid);
481	}
482	ratio = mul_ratio(mid);
483	if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
484	    best_val = mid;
485	    best_ratio = ratio;
486	    if (config("user_debug") > 1) {
487		printf("    len %d has a new closest ratio to unity: %.6f\n",
488		       best_val, best_ratio);
489	    }
490	}
491	if (config("user_debug") > 1) {
492	    printf("    len %d multiply alg1/alg2 ratio = %.6f\n", mid, ratio);
493	}
494
495	/* stop search if near unity */
496	if (close_to_one(ratio)) {
497	    low = mid;
498	    high = mid;
499	    if (config("user_debug") > 0) {
500		printf("\twe are close enough to unity ratio at: %d\n", mid);
501	    }
502	    break;
503	}
504
505	/* bump lower range up if we went over */
506	if (ratio > 1.0) {
507	    if (config("user_debug") > 2) {
508		printf("\tmove low from %d up to %d\n",
509		    low, mid);
510	    }
511	    low = mid;
512
513	/* drop higher range down if we went under */
514	} else {
515	    if (config("user_debug") > 2) {
516		printf("\tmove high from %d down to %d\n",
517		     high, mid);
518	    }
519	    high = mid;
520	}
521
522	/* report on test loop progress */
523	if (config("user_debug") > 1) {
524	    printf("\tsetting low: %d high: %d diff: %d\n",
525		   low, high, high-low);
526	}
527    }
528
529    /*
530     * return on the suggested config("mul2") value
531     */
532    if (config("user_debug") > 0) {
533	printf("Best value for multiply is near %d\n", best_val);
534	printf("Best multiply alg1/alg2 ratio is: %.6f\n", best_ratio);
535	printf("We suggest placing this line in your .calcrc:\n");
536	printf("config(\"mul2\", %d),;\n", best_val);
537	printf("WARNING: It is believed that the output "
538	       "of this resource file is bogus!\n");
539	printf("WARNING: You may NOT wish to follow the above suggestion.\n");
540    }
541    return mid;
542}
543
544
545/*
546 * sq_loop - measure the CPU time to perform a set of square loops
547 *
548 * given:
549 *	repeat	number of square loops to perform
550 *	x	array of 5 values, each the same length in BASEB-bit words
551 *
552 *		NOTE: When their lengths are 1 BASEB-bit word, then a
553 *		      dummy loop of simple constants are used.  Thus the
554 *		      length == 1 is an approximation of loop overhead.
555 * returns:
556 *	approximate runtime to perform a square loop
557 *
558 * NOTE: This is an internal support function that is normally
559 *	 not called directly from the command line.  Call the
560 *	 function best_sq2() instead.
561 */
562define sq_loop(repeat, x)
563{
564    local start;	/* start of execution */
565    local end;		/* end of execution */
566    local answer;	/* squared value */
567    local len;		/* length of each element */
568    local baseb_bytes;	/* bytes in a BASEB-bit word */
569    local i;
570
571    /* firewall */
572    if (!isint(repeat) || repeat < 0) {
573	quit "sq_loop: 1st arg: repeat must be an integer > 0";
574    }
575    if (size(*x) != 5) {
576	quit "sq_loop: 2nd arg matrix does not have 5 elements";
577    }
578    if (matdim(*x) != 1) {
579	quit "sq_loop: 2nd arg matrix is not 1 dimensional";
580    }
581    if (matmin(*x, 1) != 0) {
582	quit "sq_loop: 2nd arg matrix index range does not start with 0";
583    }
584    if (matmax(*x, 1) != 4) {
585	quit "sq_loop: 2nd arg matrix index range does not end with 4";
586    }
587    baseb_bytes = config("baseb") / 8;
588    len = sizeof((*x)[0]) / baseb_bytes;
589    for (i=1; i < 4; ++i) {
590	if ((sizeof((*x)[i]) / baseb_bytes) != len) {
591	    quit "sq_loop: 2nd arg matrix elements are not of equal "
592	         "BASEB-bit word length";
593	}
594    }
595
596    /* square pairwise, all sets of a given length */
597    start = usertime();
598    for (i=0; i < repeat; ++i) {
599
600	if (len == 1) {
601	    /* we use len == 1 to test this tester loop overhead */
602	    answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
603	    answer = 0^2;
604	    /**/
605	    answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
606	    answer = 0^2;
607	    /**/
608	    answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
609	    answer = 0^2;
610	    /**/
611	    answer = 0^2; answer = 0^2; answer = 0^2; answer = 0^2;
612	    answer = 0^2;
613	} else {
614	    /* one square loop */
615	    answer = (*x)[0]^2;
616	    answer = (*x)[1]^2;
617	    answer = (*x)[2]^2;
618	    answer = (*x)[3]^2;
619	    answer = (*x)[4]^2;
620	    /**/
621	    answer = (*x)[0]^2;
622	    answer = (*x)[1]^2;
623	    answer = (*x)[2]^2;
624	    answer = (*x)[3]^2;
625	    answer = (*x)[4]^2;
626	    /**/
627	    answer = (*x)[0]^2;
628	    answer = (*x)[1]^2;
629	    answer = (*x)[2]^2;
630	    answer = (*x)[3]^2;
631	    answer = (*x)[4]^2;
632	    /**/
633	    answer = (*x)[0]^2;
634	    answer = (*x)[1]^2;
635	    answer = (*x)[2]^2;
636	    answer = (*x)[3]^2;
637	    answer = (*x)[4]^2;
638	}
639    }
640
641    /*
642     * return duration
643     */
644    end = usertime();
645    return end-start;
646}
647
648
649/*
650 * sq_ratio - ratio of rates of 1st and 2nd square algorithms
651 *
652 * given:
653 *	len	length in BASEB-bit words to square
654 *
655 * return:
656 *	ratio of (1st / 2nd) algorithm rates
657 *
658 * When want to determine a rate to a precision of 1 part in 1000.
659 * Most systems today return CPU time to at least 10 msec precision.
660 * So to get rates to that precision, we need to time loops to at
661 * least 1000 times as long as the precision (10 msec * 1000)
662 * which usually requires timing of loops that last 10 seconds or more.
663 *
664 * NOTE: This is an internal support function that is normally
665 *	 not called directly from the command line.  Call the
666 *	 function best_sq2() instead.
667 */
668define sq_ratio(len)
669{
670    local mat x[5];		/* array of values for sq_loop to square */
671    local mat one[5];		/* array if single BASEB-bit values */
672    local baseb;		/* calc word size in bits */
673    local orig_cfg;		/* caller configuration */
674    local loops;		/* number of square loops to time */
675    local tlen;			/* time to perform some number of loops */
676    local tover;		/* est of time for loop overhead */
677    local alg1_rate;		/* loop rate of 1st algorithm */
678    local alg2_rate;		/* loop rate of 2nd algorithm */
679    local ret;			/* return ratio, or 1.0 */
680    local i;
681
682    /*
683     * firewall
684     */
685    if (!isint(len) || len < 2) {
686	quit "sq_ratio: 1st arg: len is not an integer > 1";
687    }
688
689    /*
690     * remember the caller's config state
691     */
692    orig_cfg = config("all");
693    config("mul2", 0),;
694    config("sq2", 0),;
695    config("pow2", 0),;
696    config("redc2", 0),;
697    config("tilde", 0),;
698
699    /*
700     * initialize x, the values we will square
701     *
702     * We want these tests to be repeatable as possible, so we will seed
703     * the PRNG in a deterministic way.
704     */
705    baseb = config("baseb");
706    srand(sha1(sha1(baseb, config("version"))));
707    for (i=0; i < 5; ++i) {
708	/* force the values to be a full len words long */
709	x[i] = ((1<<(((len-1) * baseb) + baseb-1)) |
710		    randbit(((len-1) * baseb) + baseb-2));
711	/* single BASEB-bit values */
712        one[i] = 1;
713    }
714
715    /*
716     * determine the number of loops needed to test 1st alg
717     */
718    config("sq2", 2^31-1),;
719    loops = 1/2;
720    do {
721	loops *= 2;
722	tlen = sq_loop(loops, &x);
723	if (config("user_debug") > 3) {
724	    printf("\t    alg1 loops %d took %.3f sec\n", loops, tlen);
725	}
726    } while (tlen < 1.0);
727
728    /*
729     * determine the 1st algorithm rate
730     */
731    loops = max(1, ceil(loops * test_time / tlen));
732    if (loops < 16) {
733	if (config("user_debug") > 1) {
734	    printf("    we must expand alg1 loop test time to about %d secs\n",
735		ceil(test_time * (16 / loops)));
736	}
737	loops = 16;
738    }
739    tlen = sq_loop(loops, &x);
740    if (config("user_debug") > 3) {
741	printf("\t    alg1 time = %.3f secs\n", tlen);
742    }
743    tover = sq_loop(loops, &one);
744    if (config("user_debug") > 3) {
745	printf("\t    alg1 overhead look %.3f secs\n", tover);
746    }
747    if (tlen <= tover) {
748	quit "sq_ratio: overhead >= loop time";
749    }
750    alg1_rate = loops / (tlen - tover);
751    if (config("user_debug") > 2) {
752	printf("\tsquare alg1 rate = %.3f loopsets/sec\n", alg1_rate);
753    }
754    if (alg1_rate <= 0.0) {
755	quit "sq_ratio: alg1 rate was <= 0.0";
756    }
757
758    /*
759     * determine the number of loops needed to test 1st alg
760     */
761    config("sq2", 2),;
762    loops = 1/2;
763    do {
764	loops *= 2;
765	tlen = sq_loop(loops, &x);
766	if (config("user_debug") > 3) {
767	    printf("\t    alg2 loops %d took %.3f sec\n", loops, tlen);
768	}
769    } while (tlen < 1.0);
770
771    /*
772     * determine the 2nd algorithm rate
773     */
774    loops = max(1, ceil(loops * test_time / tlen));
775    if (loops < 16) {
776	if (config("user_debug") > 1) {
777	    printf("    we must expand alg2 loop test time to about %d secs\n",
778		ceil(test_time * (16 / loops)));
779	}
780	loops = 16;
781    }
782    tlen = sq_loop(loops, &x);
783    if (config("user_debug") > 3) {
784	printf("\t    alg2 time = %.3f secs\n", tlen);
785    }
786    tover = sq_loop(loops, &one);
787    if (config("user_debug") > 3) {
788	printf("\t    alg2 overhead look %.3f secs\n", tover);
789    }
790    if (tlen <= tover) {
791	quit "sq_ratio: overhead >= loop time";
792    }
793    alg2_rate = loops / (tlen - tover);
794    if (config("user_debug") > 2) {
795	printf("\tsquare alg2 rate = %.3f loopsets/sec\n", alg2_rate);
796    }
797    if (alg2_rate <= 0.0) {
798	quit "sq_ratio: alg2 rate was <= 0.0";
799    }
800
801    /*
802     * restore old config
803     */
804    config("all", orig_cfg),;
805
806    /*
807     * return alg1 / alg2 rate ratio
808     */
809    ret = alg1_rate / alg2_rate;
810    if (config("user_debug") > 2) {
811	printf("\tprecise ratio is: %.f sq_ratio will return: %.3f\n",
812		alg1_rate / alg2_rate, ret);
813    }
814    return ret;
815}
816
817
818/*
819 * best_sq2 - determine the best config("sq2") parameter
820 *
821 * NOTE: Due to precision problems with CPU measurements, it is not
822 *	 unusual for the output of this function to vary slightly
823 *	 from run to run.
824 *
825 * NOTE: This function is designed to take a long time to run.
826 *	  We recommend setting:
827 *
828 *		config("user_debug", 2)
829 *
830 *	  so that yon can watch the progress of this function.
831 */
832define best_sq2()
833{
834    local ratio;	/* previously calculated alg1/alg2 ratio */
835    local low;		/* low loop value tested */
836    local high;		/* high loop value tested */
837    local mid;		/* between low and high */
838    local best_val;	/* value found with ratio closest to unity */
839    local best_ratio;	/* closest ratio found to unity */
840    local expand;	/* how fast to expand the length */
841
842    /*
843     * setup
844     */
845    printf("WARNING: This tool may not be computing the correct best value\n");
846    test_time = 5.0;
847    printf("The best_sq2() function will take a LONG time to run!\n");
848    printf("It is important that best_sq2() run on an otherwise idle host!\n");
849    if (config("user_debug") <= 0) {
850	printf("To monitor progress, set user_debug to 2: "
851	       "config(\"user_debug\", 2)\n");
852    }
853    printf("Starting with loop test time of %d secs\n", test_time);
854
855    /*
856     * firewall - must have a >1 ratio for the initial length
857     */
858    high = 8;
859    best_val = high;
860    if (config("user_debug") > 0) {
861	printf("testing square alg1/alg2 ratio for len = %d\n", high);
862    }
863    ratio = sq_ratio(high);
864    best_ratio = ratio;
865    if (config("user_debug") > 1) {
866	printf("    square alg1/alg2 ratio = %.3f\n", ratio);
867    }
868    if (ratio < 1.0) {
869	quit "best_sq2: test implies sq2 < 16, which seems bogus";
870    }
871
872    /*
873     * expand lengths until the ratio flips
874     */
875    do {
876	/*
877	 * determine the parameters of the next ratio test
878	 *
879	 * We will multiplicatively expand our test level until
880	 * the ratio drops below 1.0.
881	 */
882	expand = 2;
883	low = high;
884	high *= expand;
885	if (config("user_debug") > 1) {
886	    printf("    expand the next test range by a factor of %d\n",
887		   expand);
888	}
889
890	/*
891	 * determine the alg1/alg2 test ratio for this new length
892	 */
893	if (high >= 2^31) {
894	    quit "best_sq2: tests imply sq2 >= 2^31, which seems bogus";
895	}
896	if (config("user_debug") > 0) {
897	    printf("testing square alg1/alg2 ratio for len = %d\n", high);
898	}
899	ratio = sq_ratio(high);
900	if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
901	    best_val = high;
902	    best_ratio = ratio;
903	    if (config("user_debug") > 1) {
904		printf("    len %d has a new closest ratio to unity: %.6f\n",
905		       best_val, best_ratio);
906	    }
907	}
908	if (config("user_debug") > 1) {
909	    printf("    square alg1/alg2 ratio = %.3f\n", ratio);
910	}
911    } while (ratio > 1.0);
912
913    /*
914     * If we previously expanded more than by a factor of 2, then
915     * we may have jumped over the crossover point.  So now
916     * drop down powers of two until the ratio is again >= 1.0
917     */
918    if (expand > 2) {
919	do {
920
921	    /*
922	     * contract by 2
923	     */
924	    high /= 2;
925	    low = high / 2;
926	    if (config("user_debug") > 0) {
927		printf("re-testing multiply alg1/alg2 ratio for len = %d\n",
928		       high);
929	    }
930	    ratio = mul_ratio(high);
931	    if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
932		best_val = high;
933		best_ratio = ratio;
934		if (config("user_debug") > 1) {
935		    printf("    len %d has a new closest ratio "
936			   "to unity: %.6f\n",
937			   best_val, best_ratio);
938		}
939	    }
940	    if (config("user_debug") > 1) {
941		printf("    multiply alg1/alg2 ratio = %.6f\n", ratio);
942	    }
943
944	} while (ratio <= 1.0);
945
946	/* now that the ratio flipped again, use the previous range */
947	low = high;
948	high = high * 2;
949    }
950    if (config("user_debug") > 0) {
951	printf("Starting binary search between %d and %d\n", low, high);
952    }
953
954    /*
955     * binary search between low and high, for where ratio is just under 1.0
956     */
957    while (low+1 < high) {
958
959	/* try the mid-point */
960	mid = int((low+high)/2);
961	if (config("user_debug") > 0) {
962	    printf("testing square alg1/alg2 ratio for len = %d\n", mid);
963	}
964	ratio = sq_ratio(mid);
965	if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
966	    best_val = mid;
967	    best_ratio = ratio;
968	    if (config("user_debug") > 1) {
969		printf("    len %d has a new closest ratio to unity: %.6f\n",
970		       best_val, best_ratio);
971	    }
972	}
973	if (config("user_debug") > 1) {
974	    printf("    len %d square alg1/alg2 ratio = %.6f\n", mid, ratio);
975	}
976
977	/* stop search if near unity */
978	if (close_to_one(ratio)) {
979	    low = mid;
980	    high = mid;
981	    if (config("user_debug") > 0) {
982		printf("\twe are close enough to unity ratio at: %d\n", mid);
983	    }
984	    break;
985	}
986
987	/* bump lower range up if we went over */
988	if (ratio > 1.0) {
989	    if (config("user_debug") > 2) {
990		printf("\tmove low from %d up to %d\n",
991		    low, mid);
992	    }
993	    low = mid;
994
995	/* drop higher range down if we went under */
996	} else {
997	    if (config("user_debug") > 2) {
998		printf("\tmove high from %d down to %d\n",
999		     high, mid);
1000	    }
1001	    high = mid;
1002	}
1003
1004	/* report on test loop progress */
1005	if (config("user_debug") > 1) {
1006	    printf("\tsetting low: %d high: %d diff: %d\n",
1007		   low, high, high-low);
1008	}
1009    }
1010
1011    /*
1012     * return on the suggested config("sq2") value
1013     */
1014    mid = int((low+high)/2);
1015    if (config("user_debug") > 0) {
1016	printf("Best value for square is near %d\n", best_val);
1017	printf("Best square alg1/alg2 ratio is: %.6f\n", best_ratio);
1018	printf("We suggest placing this line in your .calcrc:\n");
1019	printf("config(\"sq2\", %d),;\n", best_val);
1020	printf("WARNING: It is believed that the output "
1021	       "of this resource file is bogus!\n");
1022	printf("WARNING: You may NOT wish to follow the above suggestion.\n");
1023    }
1024    return mid;
1025}
1026
1027
1028/*
1029 * pow_loop - measure the CPU time to perform a set of pmod loops
1030 *
1031 * given:
1032 *	repeat	number of pmod loops to perform
1033 *	x	array of 5 values, each the same length in BASEB-bit words
1034 *
1035 *		NOTE: When their lengths are 1 BASEB-bit word, then a
1036 *		      dummy loop of simple constants are used.  Thus the
1037 *		      length == 1 is an approximation of loop overhead.
1038 *
1039 *	ex	exponent for pmod value
1040 *
1041 * returns:
1042 *	approximate runtime to perform a pmod loop
1043 *
1044 * NOTE: This is an internal support function that is normally
1045 *	 not called directly from the command line.  Call the
1046 *	 function best_pow2() instead.
1047 */
1048define pow_loop(repeat, x, ex)
1049{
1050    local start;	/* start of execution */
1051    local end;		/* end of execution */
1052    local answer;	/* pmod value */
1053    local len;		/* length of each element */
1054    local baseb_bytes;	/* bytes in a BASEB-bit word */
1055    local i;
1056
1057    /* firewall */
1058    if (!isint(repeat) || repeat < 0) {
1059	quit "pow_loop: 1st arg: repeat must be an integer > 0";
1060    }
1061    if (size(*x) != 5) {
1062	quit "pow_loop: 2nd arg matrix does not have 5 elements";
1063    }
1064    if (matdim(*x) != 1) {
1065	quit "pow_loop: 2nd arg matrix is not 1 dimensional";
1066    }
1067    if (matmin(*x, 1) != 0) {
1068	quit "pow_loop: 2nd arg matrix index range does not start with 0";
1069    }
1070    if (matmax(*x, 1) != 4) {
1071	quit "pow_loop: 2nd arg matrix index range does not end with 4";
1072    }
1073    baseb_bytes = config("baseb") / 8;
1074    len = sizeof((*x)[0]) / baseb_bytes;
1075    for (i=1; i < 4; ++i) {
1076	if ((sizeof((*x)[i]) / baseb_bytes) != len) {
1077	    quit "pow_loop: 2nd arg matrix elements are not of "
1078	         "equal BASEB-bit word length";
1079	}
1080    }
1081    if (!isint(ex) || ex < 3) {
1082	quit" pow_loop: 3rd arg ex is not an integer > 2";
1083    }
1084
1085    /* pmod pairwise, all sets of a given length */
1086    start = usertime();
1087    for (i=0; i < repeat; ++i) {
1088
1089	if (len == 1) {
1090	    /* we use len == 1 to test this tester loop overhead */
1091	    answer = pmod(0,0,0); answer = pmod(0,0,0);
1092	    answer = pmod(0,0,0); answer = pmod(0,0,0);
1093	    /**/
1094	    answer = pmod(0,0,0); answer = pmod(0,0,0);
1095	    answer = pmod(0,0,0); answer = pmod(0,0,0);
1096	    /**/
1097	    answer = pmod(0,0,0); answer = pmod(0,0,0);
1098	    answer = pmod(0,0,0); answer = pmod(0,0,0);
1099	    /**/
1100	    answer = pmod(0,0,0); answer = pmod(0,0,0);
1101	    answer = pmod(0,0,0); answer = pmod(0,0,0);
1102	    /**/
1103	    answer = pmod(0,0,0); answer = pmod(0,0,0);
1104	    answer = pmod(0,0,0); answer = pmod(0,0,0);
1105	    /**/
1106	    answer = pmod(0,0,0); answer = pmod(0,0,0);
1107	    answer = pmod(0,0,0); answer = pmod(0,0,0);
1108	} else {
1109	    answer = pmod((*x)[0], ex, (*x)[1]);
1110	    answer = pmod((*x)[0], ex, (*x)[2]);
1111	    answer = pmod((*x)[0], ex, (*x)[3]);
1112	    answer = pmod((*x)[0], ex, (*x)[4]);
1113	    /**/
1114	    answer = pmod((*x)[1], ex, (*x)[0]);
1115	    answer = pmod((*x)[1], ex, (*x)[2]);
1116	    answer = pmod((*x)[1], ex, (*x)[3]);
1117	    answer = pmod((*x)[1], ex, (*x)[4]);
1118	    /**/
1119	    answer = pmod((*x)[2], ex, (*x)[0]);
1120	    answer = pmod((*x)[2], ex, (*x)[1]);
1121	    answer = pmod((*x)[2], ex, (*x)[3]);
1122	    answer = pmod((*x)[2], ex, (*x)[4]);
1123	    /**/
1124	    answer = pmod((*x)[3], ex, (*x)[0]);
1125	    answer = pmod((*x)[3], ex, (*x)[1]);
1126	    answer = pmod((*x)[3], ex, (*x)[2]);
1127	    answer = pmod((*x)[3], ex, (*x)[4]);
1128	    /**/
1129	    answer = pmod((*x)[4], ex, (*x)[0]);
1130	    answer = pmod((*x)[4], ex, (*x)[1]);
1131	    answer = pmod((*x)[4], ex, (*x)[2]);
1132	    answer = pmod((*x)[4], ex, (*x)[3]);
1133	}
1134    }
1135
1136    /*
1137     * return duration
1138     */
1139    end = usertime();
1140    return end-start;
1141}
1142
1143
1144/*
1145 * pow_ratio - ratio of rates of 1st and 2nd pmod algorithms
1146 *
1147 * given:
1148 *	len	length in BASEB-bit words to pmod
1149 *
1150 * return:
1151 *	ratio of (1st / 2nd) algorithm rates
1152 *
1153 * When want to determine a rate to a precision of 1 part in 1000.
1154 * Most systems today return CPU time to at least 10 msec precision.
1155 * So to get rates to that precision, we need to time loops to at
1156 * least 1000 times as long as the precision (10 msec * 1000)
1157 * which usually requires timing of loops that last 10 seconds or more.
1158 *
1159 * NOTE: This is an internal support function that is normally
1160 *	 not called directly from the command line.  Call the
1161 *	 function best_pow2() instead.
1162 */
1163define pow_ratio(len)
1164{
1165    local mat x[5];		/* array of values for pow_loop to pmod */
1166    local mat one[5];		/* array if single BASEB-bit values */
1167    local baseb;		/* calc word size in bits */
1168    local orig_cfg;		/* caller configuration */
1169    local loops;		/* number of pmod loops to time */
1170    local tlen;			/* time to perform some number of loops */
1171    local tover;		/* est of time for loop overhead */
1172    local alg1_rate;		/* loop rate of 1st algorithm */
1173    local alg2_rate;		/* loop rate of 2nd algorithm */
1174    local ex;			/* exponent to use in pow_loop() */
1175    local ret;			/* return ratio, or 1.0 */
1176    local i;
1177
1178    /*
1179     * firewall
1180     */
1181    if (!isint(len) || len < 2) {
1182	quit "pow_ratio: 1st arg: len is not an integer > 1";
1183    }
1184
1185    /*
1186     * remember the caller's config state
1187     */
1188    orig_cfg = config("all");
1189    config("mul2", 0),;
1190    config("sq2", 0),;
1191    config("pow2", 0),;
1192    config("redc2", 0),;
1193    config("tilde", 0),;
1194
1195    /*
1196     * setup
1197     */
1198    ex = 7;
1199
1200    /*
1201     * initialize x, the values we will pmod
1202     *
1203     * We want these tests to be repeatable as possible, so we will seed
1204     * the PRNG in a deterministic way.
1205     */
1206    baseb = config("baseb");
1207    srand(sha1(sha1(ex, baseb, config("version"))));
1208    for (i=0; i < 5; ++i) {
1209	/* force the values to be a full len words long */
1210	x[i] = ((1<<(((len-1) * baseb) + baseb-1)) |
1211		    randbit(((len-1) * baseb) + baseb-2));
1212	/* single BASEB-bit values */
1213        one[i] = 1;
1214    }
1215
1216    /*
1217     * determine the number of loops needed to test 1st alg
1218     */
1219    config("pow2", 2^31-1),;
1220    config("redc2", 2^31-1),;
1221    loops = 1/2;
1222    do {
1223	loops *= 2;
1224	tlen = pow_loop(loops, &x, ex);
1225	if (config("user_debug") > 3) {
1226	    printf("\t    alg1 loops %d took %.3f sec\n", loops, tlen);
1227	}
1228    } while (tlen < 1.0);
1229
1230    /*
1231     * determine the 1st algorithm rate
1232     */
1233    loops = max(1, ceil(loops * test_time / tlen));
1234    if (loops < 16) {
1235	if (config("user_debug") > 1) {
1236	    printf("    we must expand alg1 loop test time to about %d secs\n",
1237		ceil(test_time * (16 / loops)));
1238	}
1239	loops = 16;
1240    }
1241    tlen = pow_loop(loops, &x, ex);
1242    if (config("user_debug") > 3) {
1243	printf("\t    alg1 time = %.3f secs\n", tlen);
1244    }
1245    tover = pow_loop(loops, &one, ex);
1246    if (config("user_debug") > 3) {
1247	printf("\t    alg1 overhead look %.3f secs\n", tover);
1248    }
1249    if (tlen <= tover) {
1250	quit "pow_ratio: overhead >= loop time";
1251    }
1252    alg1_rate = loops / (tlen - tover);
1253    if (config("user_debug") > 2) {
1254	printf("\tpmod alg1 rate = %.3f loopsets/sec\n", alg1_rate);
1255    }
1256    if (alg1_rate <= 0.0) {
1257	quit "pow_ratio: alg1 rate was <= 0.0";
1258    }
1259
1260    /*
1261     * determine the number of loops needed to test 1st alg
1262     */
1263    config("pow2", 2),;
1264    config("redc2", 2^31-1),;
1265    loops = 1/2;
1266    do {
1267	loops *= 2;
1268	tlen = pow_loop(loops, &x, ex);
1269	if (config("user_debug") > 3) {
1270	    printf("\t    alg2 loops %d took %.3f sec\n", loops, tlen);
1271	}
1272    } while (tlen < 1.0);
1273
1274    /*
1275     * determine the 2nd algorithm rate
1276     */
1277    loops = max(1, ceil(loops * test_time / tlen));
1278    if (loops < 16) {
1279	if (config("user_debug") > 1) {
1280	    printf("    we must expand alg2 loop test time to about %d secs\n",
1281		ceil(test_time * (16 / loops)));
1282	}
1283	loops = 16;
1284    }
1285    tlen = pow_loop(loops, &x, ex);
1286    if (config("user_debug") > 3) {
1287	printf("\t    alg2 time = %.3f secs\n", tlen);
1288    }
1289    tover = pow_loop(loops, &one, ex);
1290    if (config("user_debug") > 3) {
1291	printf("\t    alg2 overhead look %.3f secs\n", tover);
1292    }
1293    if (tlen <= tover) {
1294	quit "pow_ratio: overhead >= loop time";
1295    }
1296    alg2_rate = loops / (tlen - tover);
1297    if (config("user_debug") > 2) {
1298	printf("\tpmod alg2 rate = %.3f loopsets/sec\n", alg2_rate);
1299    }
1300    if (alg2_rate <= 0.0) {
1301	quit "pow_ratio: alg2 rate was <= 0.0";
1302    }
1303
1304    /*
1305     * restore old config
1306     */
1307    config("all", orig_cfg),;
1308
1309    /*
1310     * return alg1 / alg2 rate ratio
1311     */
1312    ret = alg1_rate / alg2_rate;
1313    if (config("user_debug") > 2) {
1314	printf("\tprecise ratio is: %.f pow_ratio will return: %.3f\n",
1315		alg1_rate / alg2_rate, ret);
1316    }
1317    return ret;
1318}
1319
1320
1321/*
1322 * best_pow2 - determine the best config("pow2") parameter w/o REDC2
1323 *
1324 * NOTE: Due to precision problems with CPU measurements, it is not
1325 *	 unusual for the output of this function to vary slightly
1326 *	 from run to run.
1327 *
1328 * NOTE: This function is designed to take a long time to run.
1329 *	  We recommend setting:
1330 *
1331 *		config("user_debug", 2)
1332 *
1333 *	  so that yon can watch the progress of this function.
1334 */
1335define best_pow2()
1336{
1337    local ratio;	/* previously calculated alg1/alg2 ratio */
1338    local low;		/* low loop value tested */
1339    local high;		/* high loop value tested */
1340    local mid;		/* between low and high */
1341    local best_val;	/* value found with ratio closest to unity */
1342    local best_ratio;	/* closest ratio found to unity */
1343    local expand;	/* how fast to expand the length */
1344    local looped;	/* 1 ==> we have expanded lengths before */
1345
1346    /*
1347     * setup
1348     */
1349    printf("WARNING: This tool may not be computing the correct best value\n");
1350    test_time = 60.0;
1351    printf("The best_pow2() function will take a LONG time to run!\n");
1352    printf("It is important that best_pow2() run on an otherwise idle host!\n");
1353    if (config("user_debug") <= 0) {
1354	printf("To monitor progress, set user_debug to 2: "
1355	       "config(\"user_debug\", 2)\n");
1356    }
1357    printf("Starting with loop test time of %d secs\n", test_time);
1358
1359    /*
1360     * firewall - must have a >1.02 ratio for the initial length
1361     *
1362     * We select 1.02 because of the precision of the CPU timing.  We
1363     * want to first move into an area where the 1st algorithm clearly
1364     * dominates.
1365     */
1366    low = 4;
1367    high = 4;
1368    best_val = high;
1369    best_ratio = 1e10;	/* not a real value */
1370    do {
1371	high *= 4;
1372	if (config("user_debug") > 0) {
1373	    printf("testing pmod alg1/alg2 ratio for len = %d\n", high);
1374	}
1375	ratio = pow_ratio(high);
1376	if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
1377	    best_val = high;
1378	    best_ratio = ratio;
1379	    if (config("user_debug") > 1) {
1380		printf("    len %d has a new closest ratio to unity: %.6f\n",
1381		       best_val, best_ratio);
1382	    }
1383	}
1384	if (config("user_debug") > 1) {
1385	    printf("    pmod alg1/alg2 ratio = %.3f\n", ratio);
1386	    if (ratio > 1.0 && ratio <= 1.02) {
1387	    printf("    while alg1 is slightly better than alg2, "
1388		   "it is not clearly better\n");
1389	    }
1390	}
1391    } while (ratio <= 1.02);
1392    if (config("user_debug") > 0) {
1393	printf("starting the pow2 search above %d\n", high);
1394    }
1395
1396    /*
1397     * expand lengths until the ratio flips
1398     */
1399    looped = 0;
1400    do {
1401	/*
1402	 * determine the parameters of the next ratio test
1403	 *
1404	 * We will multiplicatively expand our test level until
1405	 * the ratio drops below 1.0.
1406	 *
1407	 * NOTE: At low lengths, the ratios seen to be very small
1408	 *	 so we force an expansion of 4 to speed us on our
1409	 *	 way to larger lengths.  At these somewhat larger
1410	 *	 lengths, the ratios usually don't get faster than
1411	 *	 1.25, so we need to expand force a more rapid
1412	 *	 expansion than normal.  At lengths longer than
1413	 *	 2k, the time to test becomes very long, so we
1414	 *	 want to slow down at these higher lengths.
1415	 */
1416	expand = 2;
1417	if (looped) {
1418	    low = high;
1419	}
1420	high *= expand;
1421	if (config("user_debug") > 1) {
1422	    printf("    expand the next test range by a factor of %d\n",
1423		   expand);
1424	}
1425
1426	/*
1427	 * determine the alg1/alg2 test ratio for this new length
1428	 */
1429	if (high >= 2^31) {
1430	    quit "best_pow2: test implies pow2 >= 2^31, which seems bogus";
1431	}
1432	if (config("user_debug") > 0) {
1433	    printf("testing pmod alg1/alg2 ratio for len = %d\n", high);
1434	}
1435	ratio = pow_ratio(high);
1436	if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
1437	    best_val = high;
1438	    best_ratio = ratio;
1439	    if (config("user_debug") > 1) {
1440		printf("    len %d has a new closest ratio to unity: %.6f\n",
1441		       best_val, best_ratio);
1442	    }
1443	}
1444	if (config("user_debug") > 1) {
1445	    printf("    pmod alg1/alg2 ratio = %.6f\n", ratio);
1446	}
1447	looped = 1;
1448    } while (ratio > 1.0);
1449    if (config("user_debug") > 0) {
1450	printf("Starting binary search between %d and %d\n", low, high);
1451    }
1452
1453    /*
1454     * binary search between low and high, for where ratio is just under 1.0
1455     */
1456    while (low+1 < high) {
1457
1458	/* try the mid-point */
1459	mid = int((low+high)/2);
1460	if (config("user_debug") > 0) {
1461	    printf("testing pow2 alg1/alg2 ratio for len = %d\n", mid);
1462	}
1463	ratio = pow_ratio(mid);
1464	if (abs(ratio - 1.0) < abs(best_ratio - 1.0)) {
1465	    best_val = mid;
1466	    best_ratio = ratio;
1467	    if (config("user_debug") > 1) {
1468		printf("    len %d has a new closest ratio to unity: %.6f\n",
1469		       best_val, best_ratio);
1470	    }
1471	}
1472	if (config("user_debug") > 1) {
1473	    printf("    len %d pmod alg1/alg2 ratio = %.6f\n", mid, ratio);
1474	}
1475
1476	/* stop search if near unity */
1477	if (close_to_one(ratio)) {
1478	    low = mid;
1479	    high = mid;
1480	    if (config("user_debug") > 0) {
1481		printf("\twe are close enough to unity ratio at: %d\n", mid);
1482	    }
1483	    break;
1484	}
1485
1486	/* bump lower range up if we went over */
1487	if (ratio > 1.0) {
1488	    if (config("user_debug") > 2) {
1489		printf("\tmove low from %d up to %d\n",
1490		    low, mid);
1491	    }
1492	    low = mid;
1493
1494	/* drop higher range down if we went under */
1495	} else {
1496	    if (config("user_debug") > 2) {
1497		printf("\tmove high from %d down to %d\n",
1498		     high, mid);
1499	    }
1500	    high = mid;
1501	}
1502
1503	/* report on test loop progress */
1504	if (config("user_debug") > 1) {
1505	    printf("\tsetting low: %d high: %d diff: %d\n",
1506		   low, high, high-low);
1507	}
1508    }
1509
1510    /*
1511     * return on the suggested config("pow2") value
1512     */
1513    mid = int((low+high)/2);
1514    if (config("user_debug") > 0) {
1515	printf("Best value for pmod is near %d\n", best_val);
1516	printf("Best pmod alg1/alg2 ratio is: %.6f\n", best_ratio);
1517	printf("We suggest placing this line in your .calcrc:\n");
1518	printf("config(\"pow2\", %d),;\n", best_val);
1519	printf("WARNING: It is believed that the output "
1520	       "of this resource file is bogus!\n");
1521	printf("WARNING: You may NOT wish to follow the above suggestion.\n");
1522    }
1523    return mid;
1524}
1525