1; This Source Code Form is subject to the terms of the Mozilla Public
2; License, v. 2.0. If a copy of the MPL was not distributed with this
3; file, You can obtain one at http://mozilla.org/MPL/2.0/.
4
5#ifdef __LP64__
6        .LEVEL   2.0W
7#else
8;       .LEVEL   1.1
9;       .ALLOW   2.0N
10        .LEVEL   2.0
11#endif
12        .SPACE   $TEXT$,SORT=8
13        .SUBSPA  $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24
14
15; ***************************************************************
16;
17;                 maxpy_[little/big]
18;
19; ***************************************************************
20
21; There is no default -- you must specify one or the other.
22#define LITTLE_WORDIAN 1
23
24#ifdef LITTLE_WORDIAN
25#define EIGHT 8
26#define SIXTEEN 16
27#define THIRTY_TWO 32
28#define UN_EIGHT -8
29#define UN_SIXTEEN -16
30#define UN_TWENTY_FOUR -24
31#endif
32
33#ifdef BIG_WORDIAN
34#define EIGHT -8
35#define SIXTEEN -16
36#define THIRTY_TWO -32
37#define UN_EIGHT 8
38#define UN_SIXTEEN 16
39#define UN_TWENTY_FOUR 24
40#endif
41
42; This performs a multiple-precision integer version of "daxpy",
43; Using the selected addressing direction.  "Little-wordian" means that
44; the least significant word of a number is stored at the lowest address.
45; "Big-wordian" means that the most significant word is at the lowest
46; address.  Either way, the incoming address of the vector is that
47; of the least significant word.  That means that, for little-wordian
48; addressing, we move the address upward as we propagate carries
49; from the least significant word to the most significant.  For
50; big-wordian we move the address downward.
51
52; We use the following registers:
53;
54;     r2   return PC, of course
55;     r26 = arg1 =  length
56;     r25 = arg2 =  address of scalar
57;     r24 = arg3 =  multiplicand vector
58;     r23 = arg4 =  result vector
59;
60;     fr9 = scalar loaded once only from r25
61
62; The cycle counts shown in the bodies below are simply the result of a
63; scheduling by hand.  The actual PCX-U hardware does it differently.
64; The intention is that the overall speed is the same.
65
66; The pipeline startup and shutdown code is constructed in the usual way,
67; by taking the loop bodies and removing unnecessary instructions.
68; We have left the comments describing cycle numbers in the code.
69; These are intended for reference when comparing with the main loop,
70; and have no particular relationship to actual cycle numbers.
71
72#ifdef LITTLE_WORDIAN
73maxpy_little
74#else
75maxpy_big
76#endif
77        .PROC
78        .CALLINFO FRAME=120,ENTRY_GR=4
79        .ENTRY
80        STW,MA  %r3,128(%sp)
81        STW     %r4,-124(%sp)
82
83        ADDIB,< -1,%r26,$L0         ; If N = 0, exit immediately.
84        FLDD    0(%r25),%fr9        ; fr9 = scalar
85
86; First startup
87
88        FLDD    0(%r24),%fr24       ; Cycle 1
89        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
90        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
91        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
92        CMPIB,> 3,%r26,$N_IS_SMALL  ; Pick out cases N = 1, 2, or 3
93        XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
94        FLDD    EIGHT(%r24),%fr28   ; Cycle 8
95        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
96        FSTD    %fr24,-96(%sp)
97        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
98        FSTD    %fr25,-80(%sp)
99        LDO     SIXTEEN(%r24),%r24  ; Cycle 12
100        FSTD    %fr31,-64(%sp)
101        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
102        FSTD    %fr27,-48(%sp)
103
104; Second startup
105
106        XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
107        FSTD    %fr30,-56(%sp)
108        FLDD    0(%r24),%fr24
109
110        FSTD    %fr26,-88(%sp)      ; Cycle 2
111
112        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
113        FSTD    %fr28,-104(%sp)
114
115        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
116        LDD     -96(%sp),%r3
117        FSTD    %fr29,-72(%sp)
118
119        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
120        LDD     -64(%sp),%r19
121        LDD     -80(%sp),%r21
122
123        XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
124        LDD     -56(%sp),%r20
125        ADD     %r21,%r3,%r3
126
127        ADD,DC  %r20,%r19,%r19      ; Cycle 7
128        LDD     -88(%sp),%r4
129        SHRPD   %r3,%r0,32,%r21
130        LDD     -48(%sp),%r1
131
132        FLDD    EIGHT(%r24),%fr28   ; Cycle 8
133        LDD     -104(%sp),%r31
134        ADD,DC  %r0,%r0,%r20
135        SHRPD   %r19,%r3,32,%r3
136
137        LDD     -72(%sp),%r29       ; Cycle 9
138        SHRPD   %r20,%r19,32,%r20
139        ADD     %r21,%r1,%r1
140
141        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
142        ADD,DC  %r3,%r4,%r4
143        FSTD    %fr24,-96(%sp)
144
145        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
146        ADD,DC  %r0,%r20,%r20
147        LDD     0(%r23),%r3
148        FSTD    %fr25,-80(%sp)
149
150        LDO     SIXTEEN(%r24),%r24  ; Cycle 12
151        FSTD    %fr31,-64(%sp)
152
153        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
154        ADD     %r0,%r0,%r0         ; clear the carry bit
155        ADDIB,<= -4,%r26,$ENDLOOP   ; actually happens in cycle 12
156        FSTD    %fr27,-48(%sp)
157;        MFCTL   %cr16,%r21         ; for timing
158;        STD     %r21,-112(%sp)
159
160; Here is the loop.
161
162$LOOP   XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
163        ADD,DC  %r29,%r4,%r4
164        FSTD    %fr30,-56(%sp)
165        FLDD    0(%r24),%fr24
166
167        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
168        ADD,DC  %r0,%r20,%r20
169        FSTD    %fr26,-88(%sp)
170
171        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
172        ADD     %r3,%r1,%r1
173        FSTD    %fr28,-104(%sp)
174        LDD     UN_EIGHT(%r23),%r21
175
176        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
177        ADD,DC  %r21,%r4,%r28
178        FSTD    %fr29,-72(%sp)
179        LDD     -96(%sp),%r3
180
181        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
182        ADD,DC  %r20,%r31,%r22
183        LDD     -64(%sp),%r19
184        LDD     -80(%sp),%r21
185
186        XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
187        ADD     %r21,%r3,%r3
188        LDD     -56(%sp),%r20
189        STD     %r1,UN_SIXTEEN(%r23)
190
191        ADD,DC  %r20,%r19,%r19      ; Cycle 7
192        SHRPD   %r3,%r0,32,%r21
193        LDD     -88(%sp),%r4
194        LDD     -48(%sp),%r1
195
196        ADD,DC  %r0,%r0,%r20        ; Cycle 8
197        SHRPD   %r19,%r3,32,%r3
198        FLDD    EIGHT(%r24),%fr28
199        LDD     -104(%sp),%r31
200
201        SHRPD   %r20,%r19,32,%r20   ; Cycle 9
202        ADD     %r21,%r1,%r1
203        STD     %r28,UN_EIGHT(%r23)
204        LDD     -72(%sp),%r29
205
206        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
207        ADD,DC  %r3,%r4,%r4
208        FSTD    %fr24,-96(%sp)
209
210        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
211        ADD,DC  %r0,%r20,%r20
212        FSTD    %fr25,-80(%sp)
213        LDD     0(%r23),%r3
214
215        LDO     SIXTEEN(%r24),%r24  ; Cycle 12
216        FSTD    %fr31,-64(%sp)
217
218        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
219        ADD     %r22,%r1,%r1
220        ADDIB,> -2,%r26,$LOOP       ; actually happens in cycle 12
221        FSTD    %fr27,-48(%sp)
222
223$ENDLOOP
224
225; Shutdown code, first stage.
226
227;        MFCTL   %cr16,%r21         ; for timing
228;        STD     %r21,UN_SIXTEEN(%r23)
229;        LDD     -112(%sp),%r21
230;        STD     %r21,UN_EIGHT(%r23)
231
232        XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
233        ADD,DC  %r29,%r4,%r4
234        CMPIB,= 0,%r26,$ONEMORE
235        FSTD    %fr30,-56(%sp)
236
237        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
238        ADD,DC  %r0,%r20,%r20
239        FSTD    %fr26,-88(%sp)
240
241        ADD     %r3,%r1,%r1         ; Cycle 3
242        FSTD    %fr28,-104(%sp)
243        LDD     UN_EIGHT(%r23),%r21
244
245        ADD,DC  %r21,%r4,%r28       ; Cycle 4
246        FSTD    %fr29,-72(%sp)
247        STD     %r28,UN_EIGHT(%r23) ; moved up from cycle 9
248        LDD     -96(%sp),%r3
249
250        ADD,DC  %r20,%r31,%r22      ; Cycle 5
251        STD     %r1,UN_SIXTEEN(%r23)
252$JOIN4
253        LDD     -64(%sp),%r19
254        LDD     -80(%sp),%r21
255
256        ADD     %r21,%r3,%r3        ; Cycle 6
257        LDD     -56(%sp),%r20
258
259        ADD,DC  %r20,%r19,%r19      ; Cycle 7
260        SHRPD   %r3,%r0,32,%r21
261        LDD     -88(%sp),%r4
262        LDD     -48(%sp),%r1
263
264        ADD,DC  %r0,%r0,%r20        ; Cycle 8
265        SHRPD   %r19,%r3,32,%r3
266        LDD     -104(%sp),%r31
267
268        SHRPD   %r20,%r19,32,%r20   ; Cycle 9
269        ADD     %r21,%r1,%r1
270        LDD     -72(%sp),%r29
271
272        ADD,DC  %r3,%r4,%r4         ; Cycle 10
273
274        ADD,DC  %r0,%r20,%r20       ; Cycle 11
275        LDD     0(%r23),%r3
276
277        ADD     %r22,%r1,%r1        ; Cycle 13
278
279; Shutdown code, second stage.
280
281        ADD,DC  %r29,%r4,%r4        ; Cycle 1
282
283        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
284        ADD,DC  %r0,%r20,%r20
285
286        LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
287        ADD     %r3,%r1,%r1
288
289        ADD,DC  %r21,%r4,%r28       ; Cycle 4
290
291        ADD,DC  %r20,%r31,%r22      ; Cycle 5
292
293        STD     %r1,UN_SIXTEEN(%r23); Cycle 6
294
295        STD     %r28,UN_EIGHT(%r23) ; Cycle 9
296
297        LDD     0(%r23),%r3         ; Cycle 11
298
299; Shutdown code, third stage.
300
301        LDO     SIXTEEN(%r23),%r23
302        ADD     %r3,%r22,%r1
303$JOIN1  ADD,DC  %r0,%r0,%r21
304        CMPIB,*= 0,%r21,$L0         ; if no overflow, exit
305        STD     %r1,UN_SIXTEEN(%r23)
306
307; Final carry propagation
308
309$FINAL1 LDO     EIGHT(%r23),%r23
310        LDD     UN_SIXTEEN(%r23),%r21
311        ADDI    1,%r21,%r21
312        CMPIB,*= 0,%r21,$FINAL1     ; Keep looping if there is a carry.
313        STD     %r21,UN_SIXTEEN(%r23)
314        B       $L0
315        NOP
316
317; Here is the code that handles the difficult cases N=1, N=2, and N=3.
318; We do the usual trick -- branch out of the startup code at appropriate
319; points, and branch into the shutdown code.
320
321$N_IS_SMALL
322        CMPIB,= 0,%r26,$N_IS_ONE
323        FSTD    %fr24,-96(%sp)      ; Cycle 10
324        FLDD    EIGHT(%r24),%fr28   ; Cycle 8
325        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
326        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
327        FSTD    %fr25,-80(%sp)
328        FSTD    %fr31,-64(%sp)      ; Cycle 12
329        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
330        FSTD    %fr27,-48(%sp)
331        XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
332        CMPIB,= 2,%r26,$N_IS_THREE
333        FSTD    %fr30,-56(%sp)
334
335; N = 2
336        FSTD    %fr26,-88(%sp)      ; Cycle 2
337        FSTD    %fr28,-104(%sp)     ; Cycle 3
338        LDD     -96(%sp),%r3        ; Cycle 4
339        FSTD    %fr29,-72(%sp)
340        B       $JOIN4
341        ADD     %r0,%r0,%r22
342
343$N_IS_THREE
344        FLDD    SIXTEEN(%r24),%fr24
345        FSTD    %fr26,-88(%sp)      ; Cycle 2
346        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
347        FSTD    %fr28,-104(%sp)
348        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
349        LDD     -96(%sp),%r3
350        FSTD    %fr29,-72(%sp)
351        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
352        LDD     -64(%sp),%r19
353        LDD     -80(%sp),%r21
354        B       $JOIN3
355        ADD     %r0,%r0,%r22
356
357$N_IS_ONE
358        FSTD    %fr25,-80(%sp)
359        FSTD    %fr27,-48(%sp)
360        FSTD    %fr26,-88(%sp)      ; Cycle 2
361        B       $JOIN5
362        ADD     %r0,%r0,%r22
363
364; We came out of the unrolled loop with wrong parity.  Do one more
365; single cycle.  This is quite tricky, because of the way the
366; carry chains and SHRPD chains have been chopped up.
367
368$ONEMORE
369
370        FLDD    0(%r24),%fr24
371
372        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
373        ADD,DC  %r0,%r20,%r20
374        FSTD    %fr26,-88(%sp)
375
376        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
377        FSTD    %fr28,-104(%sp)
378        LDD     UN_EIGHT(%r23),%r21
379        ADD     %r3,%r1,%r1
380
381        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
382        ADD,DC  %r21,%r4,%r28
383        STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9
384        LDD     -96(%sp),%r3
385        FSTD    %fr29,-72(%sp)
386
387        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
388        ADD,DC  %r20,%r31,%r22
389        LDD     -64(%sp),%r19
390        LDD     -80(%sp),%r21
391
392        STD     %r1,UN_SIXTEEN(%r23); Cycle 6
393$JOIN3
394        XMPYU   %fr9L,%fr24R,%fr24
395        LDD     -56(%sp),%r20
396        ADD     %r21,%r3,%r3
397
398        ADD,DC  %r20,%r19,%r19      ; Cycle 7
399        LDD     -88(%sp),%r4
400        SHRPD   %r3,%r0,32,%r21
401        LDD     -48(%sp),%r1
402
403        LDD     -104(%sp),%r31      ; Cycle 8
404        ADD,DC  %r0,%r0,%r20
405        SHRPD   %r19,%r3,32,%r3
406
407        LDD     -72(%sp),%r29       ; Cycle 9
408        SHRPD   %r20,%r19,32,%r20
409        ADD     %r21,%r1,%r1
410
411        ADD,DC  %r3,%r4,%r4         ; Cycle 10
412        FSTD    %fr24,-96(%sp)
413
414        ADD,DC  %r0,%r20,%r20       ; Cycle 11
415        LDD     0(%r23),%r3
416        FSTD    %fr25,-80(%sp)
417
418        ADD     %r22,%r1,%r1        ; Cycle 13
419        FSTD    %fr27,-48(%sp)
420
421; Shutdown code, stage 1-1/2.
422
423        ADD,DC  %r29,%r4,%r4        ; Cycle 1
424
425        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
426        ADD,DC  %r0,%r20,%r20
427        FSTD    %fr26,-88(%sp)
428
429        LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
430        ADD     %r3,%r1,%r1
431
432        ADD,DC  %r21,%r4,%r28       ; Cycle 4
433        STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9
434
435        ADD,DC  %r20,%r31,%r22      ; Cycle 5
436        STD     %r1,UN_SIXTEEN(%r23)
437$JOIN5
438        LDD     -96(%sp),%r3        ; moved from cycle 4
439        LDD     -80(%sp),%r21
440        ADD     %r21,%r3,%r3        ; Cycle 6
441        ADD,DC  %r0,%r0,%r19        ; Cycle 7
442        LDD     -88(%sp),%r4
443        SHRPD   %r3,%r0,32,%r21
444        LDD     -48(%sp),%r1
445        SHRPD   %r19,%r3,32,%r3     ; Cycle 8
446        ADD     %r21,%r1,%r1        ; Cycle 9
447        ADD,DC  %r3,%r4,%r4         ; Cycle 10
448        LDD     0(%r23),%r3         ; Cycle 11
449        ADD     %r22,%r1,%r1        ; Cycle 13
450
451; Shutdown code, stage 2-1/2.
452
453        ADD,DC  %r0,%r4,%r4         ; Cycle 1
454        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
455        LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
456        ADD     %r3,%r1,%r1
457        STD     %r1,UN_SIXTEEN(%r23)
458        ADD,DC  %r21,%r4,%r1
459        B       $JOIN1
460        LDO     EIGHT(%r23),%r23
461
462; exit
463
464$L0
465        LDW     -124(%sp),%r4
466        BVE     (%r2)
467        .EXIT
468        LDW,MB  -128(%sp),%r3
469
470        .PROCEND
471
472; ***************************************************************
473;
474;                 add_diag_[little/big]
475;
476; ***************************************************************
477
478; The arguments are as follows:
479;     r2   return PC, of course
480;     r26 = arg1 =  length
481;     r25 = arg2 =  vector to square
482;     r24 = arg3 =  result vector
483
484#ifdef LITTLE_WORDIAN
485add_diag_little
486#else
487add_diag_big
488#endif
489        .PROC
490        .CALLINFO FRAME=120,ENTRY_GR=4
491        .ENTRY
492        STW,MA  %r3,128(%sp)
493        STW     %r4,-124(%sp)
494
495        ADDIB,< -1,%r26,$Z0         ; If N=0, exit immediately.
496        NOP
497
498; Startup code
499
500        FLDD    0(%r25),%fr7        ; Cycle 2 (alternate body)
501        XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4
502        XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5
503        XMPYU   %fr7L,%fr7L,%fr30
504        LDO     SIXTEEN(%r25),%r25  ; Cycle 6
505        FSTD    %fr29,-88(%sp)
506        FSTD    %fr27,-72(%sp)      ; Cycle 7
507        CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body)
508        FSTD    %fr30,-96(%sp)
509        FLDD    UN_EIGHT(%r25),%fr7 ; Cycle 2
510        LDD     -88(%sp),%r22       ; Cycle 3
511        LDD     -72(%sp),%r31       ; Cycle 4
512        XMPYU   %fr7R,%fr7R,%fr28
513        XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5
514        XMPYU   %fr7L,%fr7L,%fr31
515        LDD     -96(%sp),%r20       ; Cycle 6
516        FSTD    %fr28,-80(%sp)
517        ADD     %r0,%r0,%r0         ; clear the carry bit
518        ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7
519        FSTD    %fr24,-64(%sp)
520
521; Here is the loop.  It is unrolled twice, modelled after the "alternate body" and then the "main body".
522
523$DIAGLOOP
524        SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body)
525        LDO     SIXTEEN(%r25),%r25
526        LDD     0(%r24),%r1
527        FSTD    %fr31,-104(%sp)
528        SHRPD   %r0,%r31,31,%r4     ; Cycle 2
529        ADD,DC  %r22,%r3,%r3
530        FLDD    UN_SIXTEEN(%r25),%fr7
531        ADD,DC  %r0,%r20,%r20       ; Cycle 3
532        ADD     %r1,%r3,%r3
533        XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4
534        LDD     -80(%sp),%r21
535        STD     %r3,0(%r24)
536        XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5
537        XMPYU   %fr7L,%fr7L,%fr30
538        LDD     -64(%sp),%r29
539        LDD     EIGHT(%r24),%r1
540        ADD,DC  %r4,%r20,%r20       ; Cycle 6
541        LDD     -104(%sp),%r19
542        FSTD    %fr29,-88(%sp)
543        ADD     %r20,%r1,%r1        ; Cycle 7
544        FSTD    %fr27,-72(%sp)
545        SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
546        LDO     THIRTY_TWO(%r24),%r24
547        LDD     UN_SIXTEEN(%r24),%r28
548        FSTD    %fr30,-96(%sp)
549        SHRPD   %r0,%r29,31,%r3     ; Cycle 2
550        ADD,DC  %r21,%r4,%r4
551        FLDD    UN_EIGHT(%r25),%fr7
552        STD     %r1,UN_TWENTY_FOUR(%r24)
553        ADD,DC  %r0,%r19,%r19       ; Cycle 3
554        ADD     %r28,%r4,%r4
555        XMPYU   %fr7R,%fr7R,%fr28   ; Cycle 4
556        LDD     -88(%sp),%r22
557        STD     %r4,UN_SIXTEEN(%r24)
558        XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5
559        XMPYU   %fr7L,%fr7L,%fr31
560        LDD     -72(%sp),%r31
561        LDD     UN_EIGHT(%r24),%r28
562        ADD,DC  %r3,%r19,%r19       ; Cycle 6
563        LDD     -96(%sp),%r20
564        FSTD    %fr28,-80(%sp)
565        ADD     %r19,%r28,%r28      ; Cycle 7
566        FSTD    %fr24,-64(%sp)
567        ADDIB,> -2,%r26,$DIAGLOOP   ; Cycle 8
568        STD     %r28,UN_EIGHT(%r24)
569
570$ENDDIAGLOOP
571
572        ADD,DC  %r0,%r22,%r22
573        CMPIB,= 0,%r26,$ONEMOREDIAG
574        SHRPD   %r31,%r0,31,%r3
575
576; Shutdown code, first stage.
577
578        FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)
579        LDD     0(%r24),%r28
580        SHRPD   %r0,%r31,31,%r4     ; Cycle 2
581        ADD     %r3,%r22,%r3
582        ADD,DC  %r0,%r20,%r20       ; Cycle 3
583        LDD     -80(%sp),%r21
584        ADD     %r3,%r28,%r3
585        LDD     -64(%sp),%r29       ; Cycle 4
586        STD     %r3,0(%r24)
587        LDD     EIGHT(%r24),%r1     ; Cycle 5
588        LDO     SIXTEEN(%r25),%r25  ; Cycle 6
589        LDD     -104(%sp),%r19
590        ADD,DC  %r4,%r20,%r20
591        ADD     %r20,%r1,%r1        ; Cycle 7
592        ADD,DC  %r0,%r21,%r21       ; Cycle 8
593        STD     %r1,EIGHT(%r24)
594
595; Shutdown code, second stage.
596
597        SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
598        LDO     THIRTY_TWO(%r24),%r24
599        LDD     UN_SIXTEEN(%r24),%r1
600        SHRPD   %r0,%r29,31,%r3      ; Cycle 2
601        ADD     %r4,%r21,%r4
602        ADD,DC  %r0,%r19,%r19       ; Cycle 3
603        ADD     %r4,%r1,%r4
604        STD     %r4,UN_SIXTEEN(%r24); Cycle 4
605        LDD     UN_EIGHT(%r24),%r28 ; Cycle 5
606        ADD,DC  %r3,%r19,%r19       ; Cycle 6
607        ADD     %r19,%r28,%r28      ; Cycle 7
608        ADD,DC  %r0,%r0,%r22        ; Cycle 8
609        CMPIB,*= 0,%r22,$Z0         ; if no overflow, exit
610        STD     %r28,UN_EIGHT(%r24)
611
612; Final carry propagation
613
614$FDIAG2
615        LDO     EIGHT(%r24),%r24
616        LDD     UN_EIGHT(%r24),%r26
617        ADDI    1,%r26,%r26
618        CMPIB,*= 0,%r26,$FDIAG2     ; Keep looping if there is a carry.
619        STD     %r26,UN_EIGHT(%r24)
620
621        B   $Z0
622        NOP
623
624; Here is the code that handles the difficult case N=1.
625; We do the usual trick -- branch out of the startup code at appropriate
626; points, and branch into the shutdown code.
627
628$DIAG_N_IS_ONE
629
630        LDD     -88(%sp),%r22
631        LDD     -72(%sp),%r31
632        B       $JOINDIAG
633        LDD     -96(%sp),%r20
634
635; We came out of the unrolled loop with wrong parity.  Do one more
636; single cycle.  This is the "alternate body".  It will, of course,
637; give us opposite registers from the other case, so we need
638; completely different shutdown code.
639
640$ONEMOREDIAG
641        FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)
642        LDD     0(%r24),%r28
643        FLDD    0(%r25),%fr7        ; Cycle 2
644        SHRPD   %r0,%r31,31,%r4
645        ADD     %r3,%r22,%r3
646        ADD,DC  %r0,%r20,%r20       ; Cycle 3
647        LDD     -80(%sp),%r21
648        ADD     %r3,%r28,%r3
649        LDD     -64(%sp),%r29       ; Cycle 4
650        STD     %r3,0(%r24)
651        XMPYU   %fr7R,%fr7R,%fr29
652        LDD     EIGHT(%r24),%r1     ; Cycle 5
653        XMPYU   %fr7L,%fr7R,%fr27
654        XMPYU   %fr7L,%fr7L,%fr30
655        LDD     -104(%sp),%r19      ; Cycle 6
656        FSTD    %fr29,-88(%sp)
657        ADD,DC  %r4,%r20,%r20
658        FSTD    %fr27,-72(%sp)      ; Cycle 7
659        ADD     %r20,%r1,%r1
660        ADD,DC  %r0,%r21,%r21       ; Cycle 8
661        STD     %r1,EIGHT(%r24)
662
663; Shutdown code, first stage.
664
665        SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
666        LDO     THIRTY_TWO(%r24),%r24
667        FSTD    %fr30,-96(%sp)
668        LDD     UN_SIXTEEN(%r24),%r1
669        SHRPD   %r0,%r29,31,%r3     ; Cycle 2
670        ADD     %r4,%r21,%r4
671        ADD,DC  %r0,%r19,%r19       ; Cycle 3
672        LDD     -88(%sp),%r22
673        ADD     %r4,%r1,%r4
674        LDD     -72(%sp),%r31       ; Cycle 4
675        STD     %r4,UN_SIXTEEN(%r24)
676        LDD     UN_EIGHT(%r24),%r28 ; Cycle 5
677        LDD     -96(%sp),%r20       ; Cycle 6
678        ADD,DC  %r3,%r19,%r19
679        ADD     %r19,%r28,%r28      ; Cycle 7
680        ADD,DC  %r0,%r22,%r22       ; Cycle 8
681        STD     %r28,UN_EIGHT(%r24)
682
683; Shutdown code, second stage.
684
685$JOINDIAG
686        SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body)
687        LDD     0(%r24),%r28
688        SHRPD   %r0,%r31,31,%r4     ; Cycle 2
689        ADD     %r3,%r22,%r3
690        ADD,DC  %r0,%r20,%r20       ; Cycle 3
691        ADD     %r3,%r28,%r3
692        STD     %r3,0(%r24)         ; Cycle 4
693        LDD     EIGHT(%r24),%r1     ; Cycle 5
694        ADD,DC  %r4,%r20,%r20
695        ADD     %r20,%r1,%r1        ; Cycle 7
696        ADD,DC  %r0,%r0,%r21        ; Cycle 8
697        CMPIB,*= 0,%r21,$Z0         ; if no overflow, exit
698        STD     %r1,EIGHT(%r24)
699
700; Final carry propagation
701
702$FDIAG1
703        LDO     EIGHT(%r24),%r24
704        LDD     EIGHT(%r24),%r26
705        ADDI    1,%r26,%r26
706        CMPIB,*= 0,%r26,$FDIAG1    ; Keep looping if there is a carry.
707        STD     %r26,EIGHT(%r24)
708
709$Z0
710        LDW     -124(%sp),%r4
711        BVE     (%r2)
712        .EXIT
713        LDW,MB  -128(%sp),%r3
714        .PROCEND
715;	.ALLOW
716
717        .SPACE         $TEXT$
718        .SUBSPA        $CODE$
719#ifdef LITTLE_WORDIAN
720#ifdef __GNUC__
721; GNU-as (as of 2.19) does not support LONG_RETURN
722        .EXPORT        maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
723        .EXPORT        add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
724#else
725        .EXPORT        maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
726        .EXPORT        add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
727#endif
728#else
729        .EXPORT        maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
730        .EXPORT        add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
731#endif
732        .END
733
734
735; How to use "maxpy_PA20_little" and "maxpy_PA20_big"
736;
737; The routine "maxpy_PA20_little" or "maxpy_PA20_big"
738; performs a 64-bit x any-size multiply, and adds the
739; result to an area of memory.  That is, it performs
740; something like
741;
742;      A B C D
743;    *       Z
744;   __________
745;    P Q R S T
746;
747; and then adds the "PQRST" vector into an area of memory,
748; handling all carries.
749;
750; Digression on nomenclature and endian-ness:
751;
752; Each of the capital letters in the above represents a 64-bit
753; quantity.  That is, you could think of the discussion as
754; being in terms of radix-16-quintillion arithmetic.  The data
755; type being manipulated is "unsigned long long int".  This
756; requires the 64-bit extension of the HP-UX C compiler,
757; available at release 10.  You need these compiler flags to
758; enable these extensions:
759;
760;       -Aa +e +DA2.0 +DS2.0
761;
762; (The first specifies ANSI C, the second enables the
763; extensions, which are beyond ANSI C, and the third and
764; fourth tell the compiler to use whatever features of the
765; PA2.0 architecture it wishes, in order to made the code more
766; efficient.  Since the presence of the assembly code will
767; make the program unable to run on anything less than PA2.0,
768; you might as well gain the performance enhancements in the C
769; code as well.)
770;
771; Questions of "endian-ness" often come up, usually in the
772; context of byte ordering in a word.  These routines have a
773; similar issue, that could be called "wordian-ness".
774; Independent of byte ordering (PA is always big-endian), one
775; can make two choices when representing extremely large
776; numbers as arrays of 64-bit doublewords in memory.
777;
778; "Little-wordian" layout means that the least significant
779; word of a number is stored at the lowest address.
780;
781;   MSW     LSW
782;    |       |
783;    V       V
784;
785;    A B C D E
786;
787;    ^     ^ ^
788;    |     | |____ address 0
789;    |     |
790;    |     |_______address 8
791;    |
792;    address 32
793;
794; "Big-wordian" means that the most significant word is at the
795; lowest address.
796;
797;   MSW     LSW
798;    |       |
799;    V       V
800;
801;    A B C D E
802;
803;    ^     ^ ^
804;    |     | |____ address 32
805;    |     |
806;    |     |_______address 24
807;    |
808;    address 0
809;
810; When you compile the file, you must specify one or the other, with
811; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN".
812;
813;     Incidentally, you assemble this file as part of your
814;     project with the same C compiler as the rest of the program.
815;     My "makefile" for a superprecision arithmetic package has
816;     the following stuff:
817;
818;     # definitions:
819;     CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1
820;     CFLAGS = +O3
821;     LDFLAGS = -L /usr/lib -Wl,-aarchive
822;
823;     # general build rule for ".s" files:
824;     .s.o:
825;             $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN
826;
827;     # Now any bind step that calls for pa20.o will assemble pa20.s
828;
829; End of digression, back to arithmetic:
830;
831; The way we multiply two huge numbers is, of course, to multiply
832; the "ABCD" vector by each of the "WXYZ" doublewords, adding
833; the result vectors with increasing offsets, the way we learned
834; in school, back before we all used calculators:
835;
836;            A B C D
837;          * W X Y Z
838;         __________
839;          P Q R S T
840;        E F G H I
841;      M N O P Q
842;  + R S T U V
843;    _______________
844;    F I N A L S U M
845;
846; So we call maxpy_PA20_big (in my case; my package is
847; big-wordian) repeatedly, giving the W, X, Y, and Z arguments
848; in turn as the "scalar", and giving the "ABCD" vector each
849; time.  We direct it to add its result into an area of memory
850; that we have cleared at the start.  We skew the exact
851; location into that area with each call.
852;
853; The prototype for the function is
854;
855; extern void maxpy_PA20_big(
856;    int length,        /* Number of doublewords in the multiplicand vector. */
857;    const long long int *scalaraddr,    /* Address to fetch the scalar. */
858;    const long long int *multiplicand,  /* The multiplicand vector. */
859;    long long int *result);             /* Where to accumulate the result. */
860;
861; (You should place a copy of this prototype in an include file
862; or in your C file.)
863;
864; Now, IN ALL CASES, the given address for the multiplicand or
865; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.
866; That word is, of course, the word at which the routine
867; starts processing.  "maxpy_PA20_little" then increases the
868; addresses as it computes.  "maxpy_PA20_big" decreases them.
869;
870; In our example above, "length" would be 4 in each case.
871; "multiplicand" would be the "ABCD" vector.  Specifically,
872; the address of the element "D".  "scalaraddr" would be the
873; address of "W", "X", "Y", or "Z" on the four calls that we
874; would make.  (The order doesn't matter, of course.)
875; "result" would be the appropriate address in the result
876; area.  When multiplying by "Z", that would be the least
877; significant word.  When multiplying by "Y", it would be the
878; next higher word (8 bytes higher if little-wordian; 8 bytes
879; lower if big-wordian), and so on.  The size of the result
880; area must be the the sum of the sizes of the multiplicand
881; and multiplier vectors, and must be initialized to zero
882; before we start.
883;
884; Whenever the routine adds its partial product into the result
885; vector, it follows carry chains as far as they need to go.
886;
887; Here is the super-precision multiply routine that I use for
888; my package.  The package is big-wordian.  I have taken out
889; handling of exponents (it's a floating point package):
890;
891; static void mul_PA20(
892;   int size,
893;   const long long int *arg1,
894;   const long long int *arg2,
895;   long long int *result)
896; {
897;    int i;
898;
899;    for (i=0 ; i<2*size ; i++) result[i] = 0ULL;
900;
901;    for (i=0 ; i<size ; i++) {
902;       maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);
903;    }
904; }
905