1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4/*
5 *
6 *  This PA-RISC 2.0 function computes the product of two unsigned integers,
7 *  and adds the result to a previously computed integer.  The multiplicand
8 *  is a 512-bit (64-byte, eight doubleword) unsigned integer, stored in
9 *  memory in little-double-wordian order.  The multiplier is an unsigned
10 *  64-bit integer.  The previously computed integer to which the product is
11 *  added is located in the result ("res") area, and is assumed to be a
12 *  576-bit (72-byte, nine doubleword) unsigned integer, stored in memory
13 *  in little-double-wordian order.  This value normally will be the result
14 *  of a previously computed nine doubleword result.  It is not necessary
15 *  to pad the multiplicand with an additional 64-bit zero doubleword.
16 *
17 *  Multiplicand, multiplier, and addend ideally should be aligned at
18 *  16-byte boundaries for best performance.  The code will function
19 *  correctly for alignment at eight-byte boundaries which are not 16-byte
20 *  boundaries, but the execution may be slightly slower due to even/odd
21 *  bank conflicts on PA-RISC 8000 processors.
22 *
23 *  This function is designed to accept the same calling sequence as Bill
24 *  Ackerman's "maxpy_little" function.  The carry from the ninth doubleword
25 *  of the result is added to the tenth doubleword of the result, as is done by
26 *  Bill Ackerman's function.  The final carry also is returned as an
27 *  integer, which may be ignored.  The function prototype may be either
28 *  of the following:
29 *
30 *      void multacc512( int l, chunk* m, const chunk* a, chunk* res );
31 *          or
32 *      int multacc512( int l, chunk* m, const chunk* a, chunk* res );
33 *
34 *  where:  "l" originally denoted vector lengths.  This parameter is
35 *      ignored.  This function always assumes a multiplicand length of
36 *      512 bits (eight doublewords), and addend and result lengths of
37 *      576 bits (nine doublewords).
38 *
39 *      "m" is a pointer to the doubleword multiplier, ideally aligned
40 *      on a 16-byte boundary.
41 *
42 *      "a" is a pointer to the eight-doubleword multiplicand, stored
43 *      in little-double-wordian order, and ideally aligned on a 16-byte
44 *      boundary.
45 *
46 *      "res" is a pointer to the nine doubleword addend, and to the
47 *      nine-doubleword product computed by this function.  The result
48 *      also is stored in little-double-wordian order, and ideally is
49 *      aligned on a 16-byte boundary. It is expected that the alignment
50 *      of the "res" area may alternate between even/odd doubleword
51 *      boundaries for successive calls for 512-bit x 512-bit
52 *      multiplications.
53 *
54 *  The code for this function has been scheduled to use the parallelism
55 *  of the PA-RISC 8000 series microprocessors as well as the author was
56 *  able.  Comments and/or suggestions for improvement are welcomed.
57 *
58 *  The code is "64-bit safe".  This means it may be called in either
59 *  the 32ILP context or the 64LP context.  All 64-bits of registers are
60 *  saved and restored.
61 *
62 *  This code is self-contained.  It requires no other header files in order
63 *  to compile and to be linkable on a PA-RISC 2.0 machine.  Symbolic
64 *  definitions for registers and stack offsets are included within this
65 *  one source file.
66 *
67 *  This is a leaf routine.  As such, minimal use is made of the stack area.
68 *  Of the 192 bytes allocated, 64 bytes are used for saving/restoring eight
69 *  general registers, and 128 bytes are used to move intermediate products
70 *  from the floating-point registers to the general registers.  Stack
71 *  protocols assure proper alignment of these areas.
72 *
73 */
74
75
76/*  ====================================================================*/
77/*      symbolic definitions for PA-RISC registers      */
78/*      in the MIPS style, avoids lots of case shifts       */
79/*      assignments (except t4) preserve register number parity */
80/*  ====================================================================*/
81
82#define zero    %r0         /* permanent zero */
83#define t5      %r1         /* temp register, altered by addil */
84
85#define rp      %r2         /* return pointer */
86
87#define s1      %r3         /* callee saves register*/
88#define s0      %r4         /* callee saves register*/
89#define s3      %r5         /* callee saves register*/
90#define s2      %r6         /* callee saves register*/
91#define s5      %r7         /* callee saves register*/
92#define s4      %r8         /* callee saves register*/
93#define s7      %r9         /* callee saves register*/
94#define s6      %r10        /* callee saves register*/
95
96#define t1      %r19        /* caller saves register*/
97#define t0      %r20        /* caller saves register*/
98#define t3      %r21        /* caller saves register*/
99#define t2      %r22        /* caller saves register*/
100
101#define a3      %r23        /* fourth argument register, high word */
102#define a2      %r24        /* third argument register, low word*/
103#define a1      %r25        /* second argument register, high word*/
104#define a0      %r26        /* first argument register, low word*/
105
106#define v0      %r28        /* high order return value*/
107#define v1      %r29        /* low order return value*/
108
109#define sp      %r30        /* stack pointer*/
110#define t4      %r31        /* temporary register   */
111
112#define fa0     %fr4        /* first argument register*/
113#define fa1     %fr5        /* second argument register*/
114#define fa2     %fr6        /* third argument register*/
115#define fa3     %fr7        /* fourth argument register*/
116
117#define fa0r    %fr4R       /* first argument register*/
118#define fa1r    %fr5R       /* second argument register*/
119#define fa2r    %fr6R       /* third argument register*/
120#define fa3r    %fr7R       /* fourth argument register*/
121
122#define ft0     %fr8        /* caller saves register*/
123#define ft1     %fr9        /* caller saves register*/
124#define ft2     %fr10       /* caller saves register*/
125#define ft3     %fr11       /* caller saves register*/
126
127#define ft0r    %fr8R       /* caller saves register*/
128#define ft1r    %fr9R       /* caller saves register*/
129#define ft2r    %fr10R      /* caller saves register*/
130#define ft3r    %fr11R      /* caller saves register*/
131
132#define ft4     %fr22       /* caller saves register*/
133#define ft5     %fr23       /* caller saves register*/
134#define ft6     %fr24       /* caller saves register*/
135#define ft7     %fr25       /* caller saves register*/
136#define ft8     %fr26       /* caller saves register*/
137#define ft9     %fr27       /* caller saves register*/
138#define ft10    %fr28       /* caller saves register*/
139#define ft11    %fr29       /* caller saves register*/
140#define ft12    %fr30       /* caller saves register*/
141#define ft13    %fr31       /* caller saves register*/
142
143#define ft4r    %fr22R      /* caller saves register*/
144#define ft5r    %fr23R      /* caller saves register*/
145#define ft6r    %fr24R      /* caller saves register*/
146#define ft7r    %fr25R      /* caller saves register*/
147#define ft8r    %fr26R      /* caller saves register*/
148#define ft9r    %fr27R      /* caller saves register*/
149#define ft10r   %fr28R      /* caller saves register*/
150#define ft11r   %fr29R      /* caller saves register*/
151#define ft12r   %fr30R      /* caller saves register*/
152#define ft13r   %fr31R      /* caller saves register*/
153
154
155
156/*  ================================================================== */
157/*      functional definitions for PA-RISC registers           */
158/*  ================================================================== */
159
160/*              general registers           */
161
162#define T1      a0          /* temp, (length parameter ignored)             */
163
164#define pM      a1          /* -> 64-bit multiplier                         */
165#define T2      a1          /* temp, (after fetching multiplier)            */
166
167#define pA      a2          /* -> multiplicand vector (8 64-bit words)      */
168#define T3      a2          /* temp, (after fetching multiplicand)          */
169
170#define pR      a3          /* -> addend vector (8 64-bit doublewords,
171                                  result vector (9 64-bit words)            */
172
173#define S0      s0          /* callee saves summand registers               */
174#define S1      s1
175#define S2      s2
176#define S3      s3
177#define S4      s4
178#define S5      s5
179#define S6      s6
180#define S7      s7
181
182#define S8      v0          /* caller saves summand registers               */
183#define S9      v1
184#define S10     t0
185#define S11     t1
186#define S12     t2
187#define S13     t3
188#define S14     t4
189#define S15     t5
190
191
192
193/*              floating-point registers                                    */
194
195#define M       fa0         /* multiplier double word                       */
196#define MR      fa0r        /* low order half of multiplier double word     */
197#define ML      fa0         /* high order half of multiplier double word    */
198
199#define A0      fa2         /* multiplicand double word 0                   */
200#define A0R     fa2r        /* low order half of multiplicand double word   */
201#define A0L     fa2         /* high order half of multiplicand double word  */
202
203#define A1      fa3         /* multiplicand double word 1                   */
204#define A1R     fa3r        /* low order half of multiplicand double word   */
205#define A1L     fa3         /* high order half of multiplicand double word  */
206
207#define A2      ft0         /* multiplicand double word 2                   */
208#define A2R     ft0r        /* low order half of multiplicand double word   */
209#define A2L     ft0         /* high order half of multiplicand double word  */
210
211#define A3      ft1         /* multiplicand double word 3                   */
212#define A3R     ft1r        /* low order half of multiplicand double word   */
213#define A3L     ft1         /* high order half of multiplicand double word  */
214
215#define A4      ft2         /* multiplicand double word 4                   */
216#define A4R     ft2r        /* low order half of multiplicand double word   */
217#define A4L     ft2         /* high order half of multiplicand double word  */
218
219#define A5      ft3         /* multiplicand double word 5                   */
220#define A5R     ft3r        /* low order half of multiplicand double word   */
221#define A5L     ft3         /* high order half of multiplicand double word  */
222
223#define A6      ft4         /* multiplicand double word 6                   */
224#define A6R     ft4r        /* low order half of multiplicand double word   */
225#define A6L     ft4         /* high order half of multiplicand double word  */
226
227#define A7      ft5         /* multiplicand double word 7                   */
228#define A7R     ft5r        /* low order half of multiplicand double word   */
229#define A7L     ft5         /* high order half of multiplicand double word  */
230
231#define P0      ft6         /* product word 0                               */
232#define P1      ft7         /* product word 0                               */
233#define P2      ft8         /* product word 0                               */
234#define P3      ft9         /* product word 0                               */
235#define P4      ft10        /* product word 0                               */
236#define P5      ft11        /* product word 0                               */
237#define P6      ft12        /* product word 0                               */
238#define P7      ft13        /* product word 0                               */
239
240
241
242
243/*  ======================================================================  */
244/*      symbolic definitions for HP-UX stack offsets                        */
245/*      symbolic definitions for memory NOPs                                */
246/*  ======================================================================  */
247
248#define ST_SZ       192         /* stack area total size                    */
249
250#define SV0         -192(sp)    /* general register save area               */
251#define SV1         -184(sp)
252#define SV2         -176(sp)
253#define SV3         -168(sp)
254#define SV4         -160(sp)
255#define SV5         -152(sp)
256#define SV6         -144(sp)
257#define SV7         -136(sp)
258
259#define XF0         -128(sp)    /* data transfer area                       */
260#define XF1         -120(sp)    /* for floating-pt to integer regs          */
261#define XF2         -112(sp)
262#define XF3         -104(sp)
263#define XF4         -96(sp)
264#define XF5         -88(sp)
265#define XF6         -80(sp)
266#define XF7         -72(sp)
267#define XF8         -64(sp)
268#define XF9         -56(sp)
269#define XF10        -48(sp)
270#define XF11        -40(sp)
271#define XF12        -32(sp)
272#define XF13        -24(sp)
273#define XF14        -16(sp)
274#define XF15        -8(sp)
275
276#define mnop    proberi (sp),3,zero     /* memory NOP                       */
277
278
279
280
281/*  ======================================================================  */
282/*      assembler formalities                                               */
283/*  ======================================================================  */
284
285#ifdef __LP64__
286                .level  2.0W
287#else
288                .level  2.0
289#endif
290                .space    $TEXT$
291                .subspa   $CODE$
292                .align    16
293
294/*  ======================================================================  */
295/*      here to compute 64-bit x 512-bit product + 576-bit addend           */
296/*  ======================================================================  */
297
298multacc512
299        .PROC
300        .CALLINFO
301        .ENTRY
302    fldd    0(pM),M                 ; multiplier double word
303    ldo     ST_SZ(sp),sp            ; push stack
304
305    fldd    0(pA),A0                ; multiplicand double word 0
306    std     S1,SV1                  ; save s1
307
308    fldd    16(pA),A2               ; multiplicand double word 2
309    std     S3,SV3                  ; save s3
310
311    fldd    32(pA),A4               ; multiplicand double word 4
312    std     S5,SV5                  ; save s5
313
314    fldd    48(pA),A6               ; multiplicand double word 6
315    std     S7,SV7                  ; save s7
316
317
318    std     S0,SV0                  ; save s0
319    fldd    8(pA),A1                ; multiplicand double word 1
320    xmpyu   MR,A0L,P0               ; A0 cross 32-bit word products
321    xmpyu   ML,A0R,P2
322
323    std     S2,SV2                  ; save s2
324    fldd    24(pA),A3               ; multiplicand double word 3
325    xmpyu   MR,A2L,P4               ; A2 cross 32-bit word products
326    xmpyu   ML,A2R,P6
327
328    std     S4,SV4                  ; save s4
329    fldd    40(pA),A5               ; multiplicand double word 5
330
331    std     S6,SV6                  ; save s6
332    fldd    56(pA),A7               ; multiplicand double word 7
333
334
335    fstd    P0,XF0                  ; MR * A0L
336    xmpyu   MR,A0R,P0               ; A0 right 32-bit word product
337    xmpyu   MR,A1L,P1               ; A1 cross 32-bit word product
338
339    fstd    P2,XF2                  ; ML * A0R
340    xmpyu   ML,A0L,P2               ; A0 left 32-bit word product
341    xmpyu   ML,A1R,P3               ; A1 cross 32-bit word product
342
343    fstd    P4,XF4                  ; MR * A2L
344    xmpyu   MR,A2R,P4               ; A2 right 32-bit word product
345    xmpyu   MR,A3L,P5               ; A3 cross 32-bit word product
346
347    fstd    P6,XF6                  ; ML * A2R
348    xmpyu   ML,A2L,P6               ; A2 parallel 32-bit word product
349    xmpyu   ML,A3R,P7               ; A3 cross 32-bit word product
350
351
352    ldd     XF0,S0                  ; MR * A0L
353    fstd    P1,XF1                  ; MR * A1L
354
355    ldd     XF2,S2                  ; ML * A0R
356    fstd    P3,XF3                  ; ML * A1R
357
358    ldd     XF4,S4                  ; MR * A2L
359    fstd    P5,XF5                  ; MR * A3L
360    xmpyu   MR,A1R,P1               ; A1 parallel 32-bit word products
361    xmpyu   ML,A1L,P3
362
363    ldd     XF6,S6                  ; ML * A2R
364    fstd    P7,XF7                  ; ML * A3R
365    xmpyu   MR,A3R,P5               ; A3 parallel 32-bit word products
366    xmpyu   ML,A3L,P7
367
368
369    fstd    P0,XF0                  ; MR * A0R
370    ldd     XF1,S1                  ; MR * A1L
371    nop
372    add     S0,S2,T1                ; A0 cross product sum
373
374    fstd    P2,XF2                  ; ML * A0L
375    ldd     XF3,S3                  ; ML * A1R
376    add,dc  zero,zero,S0            ; A0 cross product sum carry
377    depd,z  T1,31,32,S2             ; A0 cross product sum << 32
378
379    fstd    P4,XF4                  ; MR * A2R
380    ldd     XF5,S5                  ; MR * A3L
381    shrpd   S0,T1,32,S0             ; A0 carry | cross product sum >> 32
382    add     S4,S6,T3                ; A2 cross product sum
383
384    fstd    P6,XF6                  ; ML * A2L
385    ldd     XF7,S7                  ; ML * A3R
386    add,dc  zero,zero,S4            ; A2 cross product sum carry
387    depd,z  T3,31,32,S6             ; A2 cross product sum << 32
388
389
390    ldd     XF0,S8                  ; MR * A0R
391    fstd    P1,XF1                  ; MR * A1R
392    xmpyu   MR,A4L,P0               ; A4 cross 32-bit word product
393    xmpyu   MR,A5L,P1               ; A5 cross 32-bit word product
394
395    ldd     XF2,S10                 ; ML * A0L
396    fstd    P3,XF3                  ; ML * A1L
397    xmpyu   ML,A4R,P2               ; A4 cross 32-bit word product
398    xmpyu   ML,A5R,P3               ; A5 cross 32-bit word product
399
400    ldd     XF4,S12                 ; MR * A2R
401    fstd    P5,XF5                  ; MR * A3L
402    xmpyu   MR,A6L,P4               ; A6 cross 32-bit word product
403    xmpyu   MR,A7L,P5               ; A7 cross 32-bit word product
404
405    ldd     XF6,S14                 ; ML * A2L
406    fstd    P7,XF7                  ; ML * A3L
407    xmpyu   ML,A6R,P6               ; A6 cross 32-bit word product
408    xmpyu   ML,A7R,P7               ; A7 cross 32-bit word product
409
410
411    fstd    P0,XF0                  ; MR * A4L
412    ldd     XF1,S9                  ; MR * A1R
413    shrpd   S4,T3,32,S4             ; A2 carry | cross product sum >> 32
414    add     S1,S3,T1                ; A1 cross product sum
415
416    fstd    P2,XF2                  ; ML * A4R
417    ldd     XF3,S11                 ; ML * A1L
418    add,dc  zero,zero,S1            ; A1 cross product sum carry
419    depd,z  T1,31,32,S3             ; A1 cross product sum << 32
420
421    fstd    P4,XF4                  ; MR * A6L
422    ldd     XF5,S13                 ; MR * A3R
423    shrpd   S1,T1,32,S1             ; A1 carry | cross product sum >> 32
424    add     S5,S7,T3                ; A3 cross product sum
425
426    fstd    P6,XF6                  ; ML * A6R
427    ldd     XF7,S15                 ; ML * A3L
428    add,dc  zero,zero,S5            ; A3 cross product sum carry
429    depd,z  T3,31,32,S7             ; A3 cross product sum << 32
430
431
432    shrpd   S5,T3,32,S5             ; A3 carry | cross product sum >> 32
433    add     S2,S8,S8                ; M * A0 right doubleword, P0 doubleword
434
435    add,dc  S0,S10,S10              ; M * A0 left doubleword
436    add     S3,S9,S9                ; M * A1 right doubleword
437
438    add,dc  S1,S11,S11              ; M * A1 left doubleword
439    add     S6,S12,S12              ; M * A2 right doubleword
440
441
442    ldd     24(pR),S3               ; Addend word 3
443    fstd    P1,XF1                  ; MR * A5L
444    add,dc  S4,S14,S14              ; M * A2 left doubleword
445    xmpyu   MR,A5R,P1               ; A5 right 32-bit word product
446
447    ldd     8(pR),S1                ; Addend word 1
448    fstd    P3,XF3                  ; ML * A5R
449    add     S7,S13,S13              ; M * A3 right doubleword
450    xmpyu   ML,A5L,P3               ; A5 left 32-bit word product
451
452    ldd     0(pR),S7                ; Addend word 0
453    fstd    P5,XF5                  ; MR * A7L
454    add,dc  S5,S15,S15              ; M * A3 left doubleword
455    xmpyu   MR,A7R,P5               ; A7 right 32-bit word product
456
457    ldd     16(pR),S5               ; Addend word 2
458    fstd    P7,XF7                  ; ML * A7R
459    add     S10,S9,S9               ; P1 doubleword
460    xmpyu   ML,A7L,P7               ; A7 left 32-bit word products
461
462
463    ldd     XF0,S0                  ; MR * A4L
464    fstd    P1,XF9                  ; MR * A5R
465    add,dc  S11,S12,S12             ; P2 doubleword
466    xmpyu   MR,A4R,P0               ; A4 right 32-bit word product
467
468    ldd     XF2,S2                  ; ML * A4R
469    fstd    P3,XF11                 ; ML * A5L
470    add,dc  S14,S13,S13             ; P3 doubleword
471    xmpyu   ML,A4L,P2               ; A4 left 32-bit word product
472
473    ldd     XF6,S6                  ; ML * A6R
474    fstd    P5,XF13                 ; MR * A7R
475    add,dc  zero,S15,T2             ; P4 partial doubleword
476    xmpyu   MR,A6R,P4               ; A6 right 32-bit word product
477
478    ldd     XF4,S4                  ; MR * A6L
479    fstd    P7,XF15                 ; ML * A7L
480    add     S7,S8,S8                ; R0 + P0, new R0 doubleword
481    xmpyu   ML,A6L,P6               ; A6 left 32-bit word product
482
483
484    fstd    P0,XF0                  ; MR * A4R
485    ldd     XF7,S7                  ; ML * A7R
486    add,dc  S1,S9,S9                ; c + R1 + P1, new R1 doubleword
487
488    fstd    P2,XF2                  ; ML * A4L
489    ldd     XF1,S1                  ; MR * A5L
490    add,dc  S5,S12,S12              ; c + R2 + P2, new R2 doubleword
491
492    fstd    P4,XF4                  ; MR * A6R
493    ldd     XF5,S5                  ; MR * A7L
494    add,dc  S3,S13,S13              ; c + R3 + P3, new R3 doubleword
495
496    fstd    P6,XF6                  ; ML * A6L
497    ldd     XF3,S3                  ; ML * A5R
498    add,dc  zero,T2,T2              ; c + partial P4
499    add     S0,S2,T1                ; A4 cross product sum
500
501
502    std     S8,0(pR)                ; save R0
503    add,dc  zero,zero,S0            ; A4 cross product sum carry
504    depd,z  T1,31,32,S2             ; A4 cross product sum << 32
505
506    std     S9,8(pR)                ; save R1
507    shrpd   S0,T1,32,S0             ; A4 carry | cross product sum >> 32
508    add     S4,S6,T3                ; A6 cross product sum
509
510    std     S12,16(pR)              ; save R2
511    add,dc  zero,zero,S4            ; A6 cross product sum carry
512    depd,z  T3,31,32,S6             ; A6 cross product sum << 32
513
514
515    std     S13,24(pR)              ; save R3
516    shrpd   S4,T3,32,S4             ; A6 carry | cross product sum >> 32
517    add     S1,S3,T1                ; A5 cross product sum
518
519    ldd     XF0,S8                  ; MR * A4R
520    add,dc  zero,zero,S1            ; A5 cross product sum carry
521    depd,z  T1,31,32,S3             ; A5 cross product sum << 32
522
523    ldd     XF2,S10                 ; ML * A4L
524    ldd     XF9,S9                  ; MR * A5R
525    shrpd   S1,T1,32,S1             ; A5 carry | cross product sum >> 32
526    add     S5,S7,T3                ; A7 cross product sum
527
528    ldd     XF4,S12                 ; MR * A6R
529    ldd     XF11,S11                ; ML * A5L
530    add,dc  zero,zero,S5            ; A7 cross product sum carry
531    depd,z  T3,31,32,S7             ; A7 cross product sum << 32
532
533    ldd     XF6,S14                 ; ML * A6L
534    ldd     XF13,S13                ; MR * A7R
535    shrpd   S5,T3,32,S5             ; A7 carry | cross product sum >> 32
536    add     S2,S8,S8                ; M * A4 right doubleword
537
538
539    ldd     XF15,S15                ; ML * A7L
540    add,dc  S0,S10,S10              ; M * A4 left doubleword
541    add     S3,S9,S9                ; M * A5 right doubleword
542
543    add,dc  S1,S11,S11              ; M * A5 left doubleword
544    add     S6,S12,S12              ; M * A6 right doubleword
545
546    ldd     32(pR),S0               ; Addend word 4
547    ldd     40(pR),S1               ; Addend word 5
548    add,dc  S4,S14,S14              ; M * A6 left doubleword
549    add     S7,S13,S13              ; M * A7 right doubleword
550
551    ldd     48(pR),S2               ; Addend word 6
552    ldd     56(pR),S3               ; Addend word 7
553    add,dc  S5,S15,S15              ; M * A7 left doubleword
554    add     S8,T2,S8                ; P4 doubleword
555
556    ldd     64(pR),S4               ; Addend word 8
557    ldd     SV5,s5                  ; restore s5
558    add,dc  S10,S9,S9               ; P5 doubleword
559    add,dc  S11,S12,S12             ; P6 doubleword
560
561
562    ldd     SV6,s6                  ; restore s6
563    ldd     SV7,s7                  ; restore s7
564    add,dc  S14,S13,S13             ; P7 doubleword
565    add,dc  zero,S15,S15            ; P8 doubleword
566
567    add     S0,S8,S8                ; new R4 doubleword
568
569    ldd     SV0,s0                  ; restore s0
570    std     S8,32(pR)               ; save R4
571    add,dc  S1,S9,S9                ; new R5 doubleword
572
573    ldd     SV1,s1                  ; restore s1
574    std     S9,40(pR)               ; save R5
575    add,dc  S2,S12,S12              ; new R6 doubleword
576
577    ldd     SV2,s2                  ; restore s2
578    std     S12,48(pR)              ; save R6
579    add,dc  S3,S13,S13              ; new R7 doubleword
580
581    ldd     SV3,s3                  ; restore s3
582    std     S13,56(pR)              ; save R7
583    add,dc  S4,S15,S15              ; new R8 doubleword
584
585    ldd     SV4,s4                  ; restore s4
586    std     S15,64(pR)              ; save result[8]
587    add,dc  zero,zero,v0            ; return carry from R8
588
589    CMPIB,*= 0,v0,$L0               ; if no overflow, exit
590    LDO     8(pR),pR
591
592$FINAL1                             ; Final carry propagation
593    LDD     64(pR),v0
594    LDO     8(pR),pR
595    ADDI    1,v0,v0
596    CMPIB,*= 0,v0,$FINAL1           ; Keep looping if there is a carry.
597    STD     v0,56(pR)
598$L0
599    bv      zero(rp)                ; -> caller
600    ldo     -ST_SZ(sp),sp           ; pop stack
601
602/*  ======================================================================  */
603/*      end of module                                                       */
604/*  ======================================================================  */
605
606
607        bve (rp)
608        .EXIT
609        nop
610                .PROCEND
611                .SPACE         $TEXT$
612                .SUBSPA        $CODE$
613                .EXPORT        multacc512,ENTRY
614
615        .end
616