dnl ******************************************************************************
dnl   Copyright 2009 Paul Zimmermann and Alexander Kruppa.
dnl
dnl   This file is part of the ECM Library.
dnl
dnl   The ECM Library is free software; you can redistribute it and/or modify
dnl   it under the terms of the GNU Lesser General Public License as published by
dnl   the Free Software Foundation; either version 3 of the License, or (at your
dnl   option) any later version.
dnl
dnl   The ECM Library is distributed in the hope that it will be useful, but
dnl   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl   License for more details.
dnl
dnl   You should have received a copy of the GNU Lesser General Public License
dnl   along with the ECM Library; see the file COPYING.LIB.  If not, write to
dnl   the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
dnl   MA 02110-1301, USA.
dnl ******************************************************************************

dnl  C introduces a comment.  It expands to a newline followed by dnl, so m4
dnl  throws away the remainder of the source line and the comment text never
dnl  reaches the assembler.  It can be used at the start of a line or after
dnl  an instruction.
define(C, `
dnl')

C mp_limb_t mulredc8(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
C                 const mp_limb_t *m, mp_limb_t inv_m);
C
C arguments:
C r3 = ptr to result z least significant limb
C r4 = ptr to input x least significant limb
C r5 = ptr to input y least significant limb
C r6 = ptr to modulus m least significant limb
C r7 = -1/m mod 2^64
C
C final carry returned in r3



include(`config.m4')

C GLOBL and GSYM_PREFIX are expected to come from config.m4.
C Two symbols are made global: mulredc8, which labels the descriptor below,
C and .mulredc8, which labels the code itself.

	GLOBL GSYM_PREFIX`'mulredc8
	GLOBL .GSYM_PREFIX`'mulredc8

C Descriptor placed in the .opd section: three 8-byte words (24 bytes)
C holding the address of the code, the TOC base and a zero third word.

	.section ".opd", "aw"
	.align	3
GSYM_PREFIX`'mulredc8:
	.quad	.GSYM_PREFIX`'mulredc8, .TOC.@tocbase, 0
	.size	GSYM_PREFIX`'mulredc8, 24


C Implements multiplication and REDC for two input numbers of 8 words

C The algorithm:
C   (Notation: a:b:c == a * 2^128 + b * 2^64 + c)
C
C /* first pass, i = 0 */
C T1:T0 = x[0]*y[0] ;
C u = (T0*invm) % 2^64 ;
C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */
C for (j = 1; j < len; j++)
C   {
C     cy:T1:T0 = x[0]*y[j] + m[j]*u + cy:T1 ;
C        /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */
C     tmp[j-1] = T0;
C   }
C tmp[len-1] = T1 ;
C tmp[len] = cy ; /* cy <= 1 (see note 2) */
C for (i = 1; i < len; i++)
C   {
C     cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ;
C     u = (T0*invm) % 2^64 ;
C     cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */
C     for (j = 1; j < len; j++)
C       {
C         cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ;
C         /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3
C            for j = (len-1), result cy:T1 <= 2*2^64 - 1  (see note 4) */
C         tmp[j-1] = T0;
C       }
C     tmp[len-1] = T1 ;
C     tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */
C   }
C z[0 ... len-1] = tmp[0 ... len-1] ;
C return (tmp[len]) ;
C
C notes:
C
C 1:  m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 = 2*2^128 - 4*2^64 + 2,
C     so cy:T1 <= 2*2^64 - 4.
C 2:  For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4
C                  = 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2),
C     so cy:T1 <= 2*2^64 - 3. For j > 1,
C     x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1),
C     so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j.
C 3:  m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1,
C     so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4)
C 4:  For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1
C                  <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64
C                   = 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2),
C     so cy:T1 <= 3*2^64 - 3. For j > 1,
C     x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1),
C     so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1.
C     For j = len - 1, we know from note 2 that tmp[len] <= 1 for i = 0.
C     Assume this is true for index i-1, then
C                x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1
C                  <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64
C                   = 2*2^128 - 1 = 1:(2^64-1):(2^64-1),
C     so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction.
C
C Register vars: T0 = r17, T1 = r14, CY = r10, XI = r12, U = r11
C                YP = r5, MP = r6, TP = r1 (stack ptr)
C                r15 = tmp[8], r16 = constant 0 (operand for addze)
C                r0, r8, r9 = scratch (loaded limb, product low/high)
C
C r13 is deliberately left untouched: the 64-bit PowerPC ELF ABI reserves it
C as the thread pointer, so it must stay valid even inside a leaf routine
C (an asynchronous signal handler may use it).  r14-r17 are nonvolatile and
C are saved on entry and restored before returning.

C local variables: tmp[0 ... 8] array, having 8+1 8-byte words
C The tmp array needs 8+1 entries, but tmp[8] is stored in
C r15, so only 8 entries are used in the stack.
C
C Stack use: 4*8 bytes for the saved registers plus 64 bytes for tmp[0..7],
C i.e. 96 bytes below the stack pointer on entry.  During the main
C computation r1 points at tmp[0].  No back chain word is written and no
C call is made from this routine.  r1 is back at its entry value at blr.


	TEXT
	.align	5	C powerPC 32 byte alignment
.GSYM_PREFIX`'mulredc8:

C ########################################################################
C # i = 0 pass
C #########################################################################

C Pass for j = 0. We need to fetch x[i] from memory and compute the new u
C The register saves are interleaved with the first multiplications.

	ld      r12, 0(r4)		C XI = x[0]
	ld      r0, 0(r5)		C y[0]
	stdu    r17, -8(r1)		C save r17
	mulld   r8, r0, r12		C x[0]*y[0] low half
	stdu    r14, -8(r1)		C save r14
	mulhdu  r9, r0, r12		C x[0]*y[0] high half
	ld      r0, 0(r6)		C m[0]
	mulld   r11, r7, r8		C U = T0*invm mod 2^64
	stdu    r15, -8(r1)		C save r15
	mulld   r17, r0, r11		C T0 = U*m[0] low
	stdu    r16, -8(r1)		C save r16
	li      r16, 0			C set r16 to zero for carry propagation
	subi    r1, r1, 64		C set tmp stack space
	mulhdu  r14, r0, r11		C T1 = U*m[0] high
	ld      r0, 8(r5)		C y[1]
	addc    r8, r8, r17		C low word becomes 0, only the carry is kept
	adde    r17, r9, r14		C T0 = initial tmp(0)
	addze   r10, r16		C carry to CY
	C CY:T1:T0 <= 2*(2^64-1)^2 = 2*2^128 - 4*2^64 + 2, hence
	C CY:T1 <= 2*2^64 - 4

C Pass for j = 1

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 8(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 16(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 0(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 2

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 16(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 24(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 8(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 3

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 24(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 32(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 16(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 4

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 32(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 40(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 24(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 5

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 40(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 48(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 32(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 6

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 48(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 56(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 40(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 7. Don't fetch new data from y[j+1].

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 56(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	adde    r17, r9, r14		C add high word with carry to T1
	std     r8, 48(r1)		C store tmp[len-2]
	addze   r15, r16		C put carry in r15 (tmp[len] <= 1)
	std     r17, 56(r1)		C store tmp[len-1]


C #########################################################################
C # i > 0 passes
C #########################################################################


	li      r9, 7			C outer loop count
	mtctr   r9

1:

C Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory
C and compute the new u

	ldu     r12, 8(r4)		C x[i], r4 advances to the next limb of x
	ld      r0, 0(r5)		C y[0]
	ld      r17, 0(r1)		C tmp[0]
	mulld   r8, r0, r12		C x[i]*y[0] low half
	ld      r14, 8(r1)		C tmp[1]
	mulhdu  r9, r0, r12		C x[i]*y[0] high half
	addc    r17, r8, r17		C T0
	ld      r0, 0(r6)		C m[0]
	mulld   r11, r7, r17		C U = T0*invm mod 2^64
	adde    r14, r9, r14		C T1
	mulld   r8, r0, r11		C U*m[0] low
	addze   r10, r16		C CY
	mulhdu  r9, r0, r11		C U*m[0] high
	ld      r0, 8(r5)		C y[1]
	addc    r8, r8, r17		C result = 0, only the carry is kept
	adde    r17, r9, r14		C T0, carry pending
	C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1,
	C so cy:T1 <= 3*2^64 - 4

C Pass for j = 1

	ld      r14, 16(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 8(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 16(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 0(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 2

	ld      r14, 24(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 16(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 24(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 8(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 3

	ld      r14, 32(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 24(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 32(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 16(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 4

	ld      r14, 40(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 32(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 40(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 24(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 5

	ld      r14, 48(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 40(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 48(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 32(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 6

	ld      r14, 56(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 48(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 56(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 40(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 7. Don't fetch new data from y[j+1].

	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r15, r10		C T1 = tmp[len] + CY + pending carry
	C since tmp[len] <= 1, T1 <= 3 and carry is zero
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 56(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r16		C CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 48(r1)		C store tmp[len-2]
	addze   r15, r10		C store tmp[len] <= 1
	std     r17, 56(r1)		C store tmp[len-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64
	C          <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1)

	bdnz 1b

C Copy result from tmp memory to z
C The update forms walk r1 up through tmp[] and r3 through z[], two limbs
C at a time.  After the last ldu below r1 points at tmp[7].

	ld      r8, 0(r1)
	ldu     r9, 8(r1)
	std     r8, 0(r3)
	stdu    r9, 8(r3)
	ldu     r8, 8(r1)
	ldu     r9, 8(r1)
	stdu    r8, 8(r3)
	stdu    r9, 8(r3)
	ldu     r8, 8(r1)
	ldu     r9, 8(r1)
	stdu    r8, 8(r3)
	stdu    r9, 8(r3)
	ldu     r8, 8(r1)
	ldu     r9, 8(r1)
	stdu    r8, 8(r3)
	stdu    r9, 8(r3)

C Set the return value while r15 still holds tmp[len], then reload the
C saved registers in the reverse order of the saves made on entry.

	mr      r3, r15         C return tmp(len)
	ldu     r16, 8(r1)
	ldu     r15, 8(r1)
	ldu     r14, 8(r1)
	ldu     r17, 8(r1)
	addi    r1, r1, 8		C r1 back to its value on entry
	blr

	.size	.GSYM_PREFIX`'mulredc8, .-.GSYM_PREFIX`'mulredc8

