1;Last Modified:  16-APR-1992 09:06:30.46
2	.title  fprims  Fast Multiple Precision Primitives
3	.ident  /V1.7B/
4;+
5; **-FPRIMS-Fast Multiple Precision Primitives
6;
7; Facility:     PGP
8;
9; Language:     Macro-32
10;
11; Functional Description:
12;
13; This module contains fast multiple precision routines for operating on arrays
14; of long words. Error checking is minimised at the expense of speed.
15;
16; Restrictions:
17;
18; This code is shareable but NOT reentrant as written because of static data.
19; A reentrant version of this module could be written but it would be slower!
20;
21; Version:      1
22;
23; Original:     00A     Date:   17-Sep-1991     Author: Hugh A.J. Kennedy
24;
25; Based on FPRIMS.ASM written by Zhahai Stewart for the Intel 8086
26; architecture.
27;
28; Modification: 02A     Date:   27-Sep-1991     Author: Hugh A.J. Kennedy.
29;
30; Add fast multiply routine, P_SMUL.
31; Re-organise code slightly.
32; Ammend/clarify copyright and license statement.
33; Add checking for maximum precision exceeded, display a warning message
34; and bomb!
35;
36; Modification: 03A     Date:   16-Mar-1992     Author: Hugh A.J. Kennedy.
37;
38; Sniff for MSB in P_SMUL. In this way, avoid multiplies by leading zeroes
39; (not efficient).
40;
41; Modification: 05A     Date:   17-Mar-1992     Author: Hugh A.J. Kennedy.
42;
43; Encode entire double precision multiply in VAX assembler.
44; Correct some minor problems with handling embedded zeroes.
45;
46; Modification: 06A     Date:   17-Mar-1992     Author: Hugh A.J. Kennedy
47;
48; Align everything for speed. VAXen like stuff on 64-bit, or at least 32-bit
49; boundaries. Therefore, we align the add, subtract and rotate tables and then
50; we align the multiply loops. The extra NOPs used to pad these loops are of
51; negligable cost because they already exist in the memory buffer. When the
52; following instruction is aligned, it executes MUCH faster.
53;
54; Modification: 07A     Date:   24-Mar-1991     Author: Hugh A.J. Kennedy.
55;
56; Implement fast compare.
57;-
58
59	.sbttl  Copyright Notice And License To Use
60;
61;                 Copyright (c) 1991-1992, All Rights Reserved by
62;                            Hugh A.J. Kennedy.
63;
64;       A license to use and adapt this software without payment is hereby
65;       granted subject to the following conditions:
66;
67;       1) It may only be copied with the inclusion of this copyright
68;       notice in the program source with these associated conditions.
69;
70;       2) No title to or ownership of this software is hereby
71;       transferred.
72;
73;       3) The  information  in this software is subject  to change
74;       without notice and should  not be construed as a  commitment  by
75;       Hugh Kennedy.
76;
77;       4) The author assumes no liability for any damages arising from the
78;       use of this software, even if said damages  arises from defects in
79;       this software.
80;
81;       5) No warranty as to merchantability or fitness of purpose is
82;       expressed or implied.
83;
84;       6) Any modifications to this source must be clearly identified as
85;       such and added to the modification history.
86;
87;       7) These routines may not be incorporated in a commercial cryptographic
88;       product.
89;
90; If you can not comply with these conditions, you *must* contact the author
91; and obtain permission other wise you are in violation of copyright.
92
93	.sbttl  Misc Macros & Definitions
94;
95; Assembly Parameters
96;
97max_unit_prec   =       72                      ; Maximum unit precision
98supersniffer    =       1                       ; Enable bit msb locator.
99;
100; The following parameter is dependent on the kind of VAX you are running on
101; and should be defined if the execution time of the SOBGTR loop control
102; instruction and the appropriate operation (ADWC or SBWC) from cache is much
103; less than the execution time in main memory. If you have a slow VAX you
104; should comment the following line out to use a vector of instructions.
105;
106novector        =       1                       ; Use loops rather than vectors.
107
108.macro  ascid   .string
109;+
110; *-ASCID-Build An ASCII String Referenced By Descriptor
111;
112; Functional Description:
113;
114;       This macro is a little like the system supplied .ASCID directive
115; but it uses a separate program section to store the ASCII data.
116;
117; Arguments:
118;
119;       STRING          String to create
120;-
121	.nocross
122
123	.save_psect
124
125	.psect  puret
126
127$$$t0   =       .
128	.ascii  @.string@
129$$$t1   =       .-$$$t0
130
131	.restore_psect
132
133	.word   $$$t1
134	.byte   dsc$k_dtype_t
135	.byte   dsc$k_class_s
136	.address -
137		$$$t0
138	.cross
139
140.endm   ascid
141
142	.sbttl  Misc Data Areas
143;
144; Misc. Data Areas
145;
146	.psect  impurd,con,lcl,noshr,exe,rd,wrt,long
147
148;
149; This data is static and is used to hold the current precision established
150; by P_SETP for other calls to this library.
151;
152.if not_defined novector
153
154addoff:                                         ; Offset into add table.
155	.blkl   1                               ; also for sub and rot.
156.endc
157
158precis:                                         ; Precision in longwords.
159	.blkl   1
160
161	.psect  pure,con,rel,shr,exe,rd,nowrt,quad
162
163	.align  quad
164
165.if not_defined novector
166
167prectoobig:
168	ascid   <PGP (FPRIMS) - Requested precision (!ZL) exceeds capacity (!ZL)>
169
170.endc
171
172	.sbttl  Start of Code
173
174	.sbttl  P_CMP           Compare two very long integers
175;+
176; **-P_COMP-Compare two very long integers
177;
178; Functional Description:
179;
180; This procedure is invoked to compare two extended precision unsigned
181; integers.
182;
183; Calling Sequence
184;
185;       short P_CMP ( r1, r2)
186;
187; Parameters:
188;
189;       R1      ->      Extended Precision Integer 1
190;       R2      ->      Extended Precision Integer 2
191;
192; Implicit Inputs:
193;
194;       PRECIS          lr*r    Precision expresses in longs.
195;
196; Returns:
197;
198;       -1      if r1 < r2
199;        0      if r1 = r2
200;       +1      if r1 > r2
201;
202
203;-
204
205	.align  long
206
207	.entry  p_cmp,^m<r2>
208
209	movl    4(ap),r1                        ; R1 -> Sum.
210	movl    8(ap),r2                        ; R2 -> Addend.
211	movl    precis,r0                       ; R0 = Precision.
212	moval   (r1)[r0],r1                     ; Get MS longwords.
213	moval   (r2)[r0],r2                     ; Get MS longwords.
214.align  long    1                               ; Align loop with NOPS.
21510$:    cmpl    -(r1),-(r2)                     ; Compare.
216	bnequ   20$                             ; If ne, then exit loop.
217	sobgtr  r0,10$                          ; Loop until done.
218	ret                                     ; R0 = zero so R1 = R2.
21920$:
220	bgtru   30$                             ; If R1 > R2 then branch.
221	movw    #-1,r0                          ; Flag <.
222	ret
22330$:
224	movw    #1,r0                           ; Flag >.
225	ret
226
227	.sbttl  P_ADDC          Add two very long integers with carry
228;+
229; **-P_ADDC-Add very long integers
230;
231; Functional Description:
232;
233; This procedure is invoked to add two very long integers with carry. Each
234; integer is represented as an array of longwords, least significant first.
235;
236; Calling Sequence:
237;
238;       P_ADDC  sum,addend,carry
239;
240; Parameters:
241;
242;       sum             lm*r            Sum.
243;       addend          lr*r            Addend.
244;       carry           lr*v            Carry bit.
245;
246; Implicit Inputs:
247;
248;       Addoff          This is used as an offset into the various tables
249;                       of adds, subtracts and rotates to implement the
250;                       operation to the requested precsion.
251;
252; Status Returns:
253;
254;       R0      Resulting carry bit.
255;-
256
257	.align  long
258
259	.entry  p_addc,^m<r2,r3>
260
261	movl    4(ap),r1                        ; R1 -> Sum.
262	movl    8(ap),r2                        ; R2 -> Addend.
263
264.if defined novector
265
266	movl    precis,r3                       ; R3 = Precision.
267	subl3   12(ap),#0,r0                    ; Set carry bit.
268	.align  quad,1                          ; Align loop with NOPs
26910$:    adwc    (r2)+,(r1)+                     ; Add with carry one longword.
270	.align  quad,1                          ; Align next instruction.
271	sobgtr  r3,10$                          ; Loop until done.
272
273.iff ; novector
274
275	moval   10$,r3
276	addl2   addoff,r3                       ; Jump into table.
277	subl3   12(ap),#0,r0                    ; Set carry bit.
278	jmp     (r3)
279
280	.align  quad
281
28210$:
283	.rept   max_unit_prec
284$$$     =       .
285	adwc    (r2)+,(r1)+                     ; Add with carry one longword.
286	nop
287addsiz  =       .-$$$
288	.endr
289
290.endc ; novector
291
292	clrl    r0                              ; Assume carry clear.
293	bcc     20$                             ; Carry set?
294	incl    r0                              ; Flag carry was set.
29520$:    ret
296
297	.sbttl  P_SUBB  Subtract very long integers with borrow
298;+
299; **-P_SUBB-Subtract very long integers
300;
301; Functional Description:
302;
303; This procedure is invoked to add subtract very long integers with carry. Each
304; integer is represented as an array of longwords, least significant first.
305;
306; Calling Sequence:
307;
308;       P_SUBB  diff,sub,borrow
309;
310; Parameters:
311;
312;       diff            lm*r            Difference
313;       sub             lr*r            Subtrahend.
314;       borrow          lr*v            Borrow bit.
315;
316; Implicit Inputs:
317;
318;       Addoff          This is used as an offset into the various tables
319;                       of adds, subtracts and rotates to implement the
320;                       operation to the requested precsion.
321;
322; Status Returns:
323;
324;       R0      Resulting carry bit.
325;-
326
327	.align  long
328
329	.entry  p_subb,^m<r2,r3>
330
331	movl    4(ap),r1                        ; R1 -> Difference.
332	movl    8(ap),r2                        ; R2 -> Minuend.
333
334.if defined novector
335
336	movl    precis,r3                       ; R3 = No. of longs.
337	subl3   12(ap),#0,r0                    ; Set borrow bit.
338	.align  quad,1                          ; Align loop with NOPs.
33910$:    sbwc    (r2)+,(r1)+                     ; Subtract with borrow one long.
340	.align  quad,1                          ; Align with NOPs.
341	sobgtr  r3,10$                          ; Loop through.
342
343.iff ; novector
344
345	moval   10$,r3
346	addl2   addoff,r3                       ; Jump into table.
347	subl3   12(ap),#0,r0                    ; Set borrow bit.
348	jmp     (r3)
349
350	.align  quad
35110$:
352	.rept   max_unit_prec
353	sbwc    (r2)+,(r1)+                     ; Subtract w/carry one longword.
354	nop
355	.endr
356
357.endc ; novector
358
359	clrl    r0                              ; Assume carry clear.
360	bcc     20$                             ; Carry set?
361	incl    r0                              ; Flag carry was set.
36220$:    ret
363
364	.sbttl  P_ROTL  Rotate left a very long integer with carry.
365;+
366; **-P_ROTL-Rotate left one bit very long integers
367;
368; Functional Description:
369;
370; This procedure is invoked to rotate left one bit (e.g. divide by 2) very
371; long integers with carry. Each integer is represented as an array of
372; longwords, least significant first. Note that we use the add with carry
373; instruction here because the VAX (unlike the dear old PDP-11) lacks a
374; rotate instruction that includes the carry bit.
375;
376; Calling Sequence:
377;
378;       P_ROTL  num,carry
379;
380; Parameters:
381;
382;       num             lm*r            Number to be shifted
383;       carry           lr*v            Carry bit.
384;
385; Implicit Inputs:
386;
387;       Addoff          This is used as an offset into the various tables
388;                       of adds, subtracts and rotates to implement the
389;                       operation to the requested precsion.
390;
391; Status Returns:
392;
393;       R0      Resulting carry bit.
394;-
395
396	.align  long
397
398	.entry  p_rotl,^m<r3>
399
400	movl    4(ap),r1                        ; R1 -> Sum.
401
402.if defined novector
403
404	movl    precis,r3                       ; R3 = No. of longwords.
405	subl3   8(ap),#0,r0                     ; Set carry bit.
406	.align  quad,1                          ; Align loop with NOPs
40710$:    adwc    (r1),(r1)+                      ; Add to itself with carry.
408	.align  quad,1                          ; Align with NOPs.
409	sobgtr  r3,10$                          ; Loop until done.
410
411.iff ; novector
412
413	moval   10$,r3
414	addl2   addoff,r3                       ; Jump into table.
415	subl3   8(ap),#0,r0                     ; Set carry bit.
416	jmp     (r3)
417
418	.align  quad
41910$:
420	.rept   max_unit_prec
421	adwc    (r1),(r1)+                      ; *2+carry.
422	nop
423	.endr
424
425.endc ; novector
426	clrl    r0                              ; Assume carry clear.
427	bcc     20$                             ; Carry set?
428	incl    r0                              ; Flag carry was set.
42920$:    ret
430
431	.sbttl  P_DMUL  Extended Multiple Precision Multiply
432;*
433; **-P_DMUL-Extended Multiple Precision Multiply
434;
435; Functional Description:
436;
437; This procedure multiplies an unsigned single precision multiplier by a
438; single precision multiplicand. The product register is double precision.
439; It is expected that the length of the single precision multiplier and
440; multiplicand has been previously set by a call to P_SETP. Note that the
441; entire length of the product register is zeroed - so it must be a full
442; double precision size.
443;
444; Calling Sequence:
445;
446;       P_DMUL  prod, multiplicand, multiplier
447;
448; Parameters:
449;
450;       prod            lw*r    Product.
451;       multuplicand    lr*r    Multiplicand
452;       multiplier      lr*r    Multiplier
453;
454; Implicit Inputs:
455;
456;       PRECIS          lr*r    Precision expresses in longs.
457;
458; Status Returns:
459;
460;       None.
461;-
462
463	.align  long
464
465	.entry  p_dmul,^m<r2,r3,r4,r5,r6,r7,r8,r9,r10,r11>
466
467	movl    4(ap),r8                        ; R8 -> Product.
468	beql    49$                             ; If eq, not specified.
469	movl    precis,r10                      ; R10 = Precision (longs)
470	ashl    #3,r10,r2                       ; R0 = No. of bytes to zero.
471	movc5   #0,#0,#0,r2,(r8)                ; Zero product buffer.
472	movl    8(ap),r3                        ; R3 -> Multiplicand.
473	beql    49$                             ; If eq, not specified.
474	pushl   r3                              ; Save for posterity.
475	movl    12(ap),r11                      ; R11 -> Multiplier.
476	beql    49$                             ; If eq, not specified.
477	movl    r10,r12                         ; R12 = Multiplicand prec.
478
479.if     defined SUPERSNIFFER
480
481;
482; Here we calculate the effective maximum precision for the multiply by
483; locating the long containing the most significant bit of the multiplier
484; and the multiplicand.
485;
486	moval   (r11)[r10],r0                   ; Supersniffer...
487	.align  quad,1                          ; Align with nops
48845$:    tstl    -(r0)                           ; Examine next long.
489	bneq    50$                             ; If ne, then we found msb.
490	sobgtr  r10,45$                         ; Loop until done.
49149$:    ret                                     ; Multiplier = 0!
49250$:
493	moval   (r3)[r12],r0                    ; Supersniffer...
494	.align  quad,1                          ; Align with nops
49555$:    tstl    -(r0)                           ; Examine next long.
496	bneq    200$                            ; If ne, then we found msb.
497	sobgtr  r12,55$                         ; Loop until done.
498	ret                                     ; Multiplicand = 0!
499.iff
500
501	brb     200$
50249$:
503	ret
504
505.endc   ; SUPERSNIFFER
506
507;
508; Multiplier Loop
509;
510; R12 = Count of multiplicand longs to process.
511; R11 -> Next long of multiplier.
512; R10 = Count of multiplier longs to process.
513; R8 -> Next long of product.
514;
515	.align  quad,1                          ; Align with nops
516200$:   movl    r12,r5                          ; Multiplicand precision.
517	moval   (r8)+,r4                        ; R4 -> Next long of product.
518	movl    (sp),r3                         ; R3 -> 1st multiplier long.
519	movl    (r4),r0                         ; R0,R1 = Partial Sum.
520	movl    4(r4),r1
521	clrl    r7                              ; Zero look-ahead carry.
522;
523; Perform an extended multiply of two unsigned numbers. This means that
524; we have to compensate the hi-order product because either the multiplier
525; or the multiplicand may be apparently a negative number. EMUL is a signed
526; multiply - so we must be careful. Also, the EMUL longword addend is sign
527; extended before adding into the product so we have to add the hard way.
528;
529; R6 =          Current Multiplicand
530; R2 =          Multiplier
531; R4 ->         Current quadword of partial product.
532; R0,R1 =       Partial sum to which product is added
533; R7 =          Lookahead carry. This gets set if we try to carry after adding
534;               the partial product to the partial sum. This gets a little more
535;               complicated because here we are setting the high-order long of
536;               the next quadword to be operated on.
537;
538; Essentially the algorithm is as follows:
539;
540; 0) R0,R1 = (R4)               ; Save current partial sum.
541; 1) R6 = Next longword of multiplicand.
542; 2) (R4) = R6 * R2             ; quad result compensating for negative numbers)
543; 3) (R4) = (R4) + R0,R1        ; add back partial sum.
544; 4) R7 = Carry bit.
545; 5) R4 = R4 + 4                ; Point to next long.
546; 6) R1 = 4(R4) + R7            ; Propagate carry to high order of next partial
547;                               ; sum.
548; 7) Loop back to step 1 until multiplicand completely processed.
549;
550	movl    (r11)+,r2                       ; R2 = Multiplier.
551	beql    999$                            ; If eq, not specified.
552	blss    1500$                           ; This unfolds the compensation
553						; test out of the loop.
554;
555; This version of the multiply loop is entered when the multiplier is positive
556; saving three instructions per unit of precision.
557;
558	.align  quad,1                          ; Align with NOPs.
559500$:   movl    (r3)+,r6                        ; R6 = Current multplicand.
560	emul    r2,r6,#0,(r4)                   ; Multiply (64-bit result).
561;
562; Because we have removed leading zeroes, multiplication by zero is very
563; unlikely, 1 in 2^32 or so. It is therefore easier to perform the test after
564; the EMUL (looking at the zero product) that the multiplicand was zero so we
565; don't need any special case logic later to adjust the product pointer.
566;
567	beql    550$                            ; If result eq, skip.
568	tstl    r6                              ; Was multiplicand negative?
569	bgeq    550$                            ; No, skip.
570	addl2   r2,4(r4)                        ; Yes, compensate.
571550$:   addl2   r0,(r4)+                        ; Accumulate.
572	adwc    r1,(r4)+
573	movl    (r4),r1                         ; R1 = Next hi-end partial sum.
574	adwc    r7,r1                           ; Add carry if needed.
575	clrl    r7                              ; Reset lookahead register.
576	adwc    #0,r7                           ; Save lookahead carry.
577	movl    -(r4),r0                        ; R0 = Next lo-end partial.
578	sobgtr  r5,500$                         ; More units?
579999$:   sobgtr  r10,200$                        ; Nope, go get next multiplier
580	ret
581;
582; This version of the above multiply loop is entered when the multiplier is
583; negative - and we must compensate by adding the multiplicand to the hi-order
584; product. This saves a test and a conditional branch per unit of precision.
585;
586	.align  quad,1                          ; Align with NOPs.
5871500$:
588	movl    (r3)+,r6                        ; R6 = Current multplicand.
589	emul    r2,r6,#0,(r4)                   ; Multiply (64-bit result).
590;
591; Because we have removed leading zeroes, multiplication by zero is very
592; unlikely, 1 in 2^32 or so. It is therefore easier to perform the test after
593; the EMUL (looking at the zero product) that the multiplicand was zero so we
594; don't need any special case logic later to adjust the product pointer.
595;
596	beql    1560$                           ; If result eq, skip.
597	tstl    r6                              ; Was multiplicand negative?
598	bgeq    1550$                           ; No, skip.
599	addl2   r2,4(r4)                        ; Yes, compensate.
6001550$:
601; As documented above, we unfolded the following to save instructions
602;       tstl    r2                              ; Multiplier negative?
603;       bgeq    1560$                           ; No, skip.
604	addl2   r6,4(r4)                        ; Yes, compensate.
6051560$:  addl2   r0,(r4)+                        ; Accumulate.
606	adwc    r1,(r4)+                        ; R1 = High-end partial sum.
607	movl    (r4),r1                         ; R1 = Next hi-end partial sum.
608	adwc    r7,r1                           ; Add carry if needed.
609	clrl    r7                              ; Reset lookahead register.
610	adwc    #0,r7                           ; Save lookahead carry.
611	movl    -(r4),r0                        ; R0 = Next lo-end partial.
612	sobgtr  r5,1500$                        ; More units?
613	sobgtr  r10,200$                        ; Nope, go get next multiplier
614	ret
615
616	.sbttl  P_SETP          Set Precison.
617;+
618; **-P_SETP-Set Precision
619;
620; Functional Description:
621;
622; This procedure is invoked to set the operating precision of the package.
623;
624; Calling Sequence:
625;
626;       P_SETP  nbits
627;
628; Parameters:
629;
630;       nbits           rw*v    Number of bits in number.
631;
632; Implicit Outputs:
633;
634;       Precis          Set to the number of longwords required to implement
635;                       the requested precision.
636;       Addoff          This is used as an offset into the various tables
637;                       of adds, subtracts and rotates to implement the
638;                       operation to the requested precsion.
639;
640; Status Returns:
641;
642;       None.
643;
644; Side Effects:
645;
646;       If the maximum precision set in 32-bit units by the assembly
647;       parameter "max_unit_prec" is exceeded, a message to that effect will
648;       be displayed and the program will terminate with a fatal error.
649;-
650
651	.entry  p_setp,^m<>
652
653	movzwl  4(ap),r1                        ; R1 = No. of bits.
654	addl2   #31,r1                          ; Round up to next long word.
655	ashl    #-5,r1,r1                       ; R1 = No. of 32 bit words.
656	movl    r1,precis                       ; Save precision.
657
658.if not_defined novector
659
660	subl3   r1,#max_unit_prec,r0            ; R0 = Number of steps reqd.
661	blss    10$                             ; If > 0 then exit.
662	mull3   #addsiz,r0,addoff               ; Get add table offset.
663
664.iftf ; novector
665
666	ret
667
668.ift ; novector
669
67010$:                                            ; Table size exceeded!
671	movab   -80(sp),sp                      ; Output buffer.
672	pushab  (sp)                            ; Build descriptor
673	movzwl  #80,-(sp)
674	clrl    -(sp)                           ; Receive return length.
675	pushl   #max_unit_prec                  ; Compiled max table size.
676	pushl   r1                              ; Requested table size.
677	pushaq  8+4(sp)                         ; -> Output buffer descriptor.
678	pushaw  12(sp)                          ; -> Returned length.
679	pushaq  prectoobig                      ; -> FAO control string.
680	calls   #5,g^sys$fao                    ; Format output string.
681	movl    (sp)+,(sp)                      ; Set actual buffer size.
682	pushaq  (sp)                            ; -> Output buffer descr.
683	calls   #1,g^lib$put_output             ; Output message.
684	$exit_s -                               ; Exit with severe error.
685		code=#4
686
687.endc ; novector
688
689	.end
690