dnl ******************************************************************************
dnl   Copyright 2009 Paul Zimmermann and Alexander Kruppa.
dnl
dnl   This file is part of the ECM Library.
dnl
dnl   The ECM Library is free software; you can redistribute it and/or modify
dnl   it under the terms of the GNU Lesser General Public License as published by
dnl   the Free Software Foundation; either version 3 of the License, or (at your
dnl   option) any later version.
dnl
dnl   The ECM Library is distributed in the hope that it will be useful, but
dnl   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl   License for more details.
dnl
dnl   You should have received a copy of the GNU Lesser General Public License
dnl   along with the ECM Library; see the file COPYING.LIB.  If not, write to
dnl   the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
dnl   MA 02110-1301, USA.
dnl ******************************************************************************

dnl   Comment marker for the rest of this file: the macro below expands to a
dnl   newline followed by dnl, so m4 discards everything after it on the line.
define(C, `
dnl')

C mp_limb_t mulredc15(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
C                 const mp_limb_t *m, mp_limb_t inv_m);
C
C arguments:
C r3 = ptr to result z least significant limb
C r4 = ptr to input x least significant limb
C r5 = ptr to input y least significant limb
C r6 = ptr to modulus m least significant limb
C r7 = -1/m mod 2^64
C
C final carry returned in r3



include(`config.m4')

C Two symbols are exported: the plain name labels the function descriptor
C below, the dot-prefixed name labels the first instruction of the code.
	GLOBL GSYM_PREFIX`'mulredc15
	GLOBL .GSYM_PREFIX`'mulredc15

C Function descriptor, placed in the .opd section and aligned to 8 bytes.
C It consists of three doublewords (24 bytes): the address of the code entry
C point, the TOC base, and a zero doubleword.
	.section ".opd", "aw"
	.align	3
GSYM_PREFIX`'mulredc15:
	.quad	.GSYM_PREFIX`'mulredc15, .TOC.@tocbase, 0
	.size	GSYM_PREFIX`'mulredc15, 24


C Implements multiplication and REDC for two input numbers of 15 words
C
C The algorithm:
C   (Notation: a:b:c == a * 2^128 + b * 2^64 + c)
C
C /* first pass, i = 0 */
C T1:T0 = x[i]*y[0] ;
C u = (T0*invm) % 2^64 ;
C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */
C for (j = 1; j < len; j++)
C   {
C     cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ;
C        /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */
C     tmp[j-1] = T0;
C   }
C tmp[len-1] = T1 ;
C tmp[len] = cy ; /* cy <= 1 (see note 2) */
C for (i = 1; i < len; i++)
C   {
C     cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ;
C     u = (T0*invm) % 2^64 ;
C     cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */
C     for (j = 1; j < len; j++)
C       {
C         cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ;
C         /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3
C            for j = (len-1), result cy:T1 <= 2*2^64 - 1  (see note 4) */
C         tmp[j-1] = T0;
C       }
C     tmp[len-1] = T1 ;
C     tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */
C   }
C z[0 ... len-1] = tmp[0 ... len-1] ;
C return (tmp[len]) ;
C
C notes:
C
C 1:  m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 = 2*2^128 - 4*2^64 + 2,
C     so cy:T1 <= 2*2^64 - 4.
C 2:  For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4
C                 = 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2),
C     so cy:T1 <= 2*2^64 - 3. For j > 1,
C     x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1),
C     so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j.
C 3:  m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1,
C     so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4)
C 4:  For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1
C                  <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64
C                  = 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2),
C     so cy:T1 <= 3*2^64 - 3. For j > 1,
C     x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1),
C     so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1.
C     For j = len - 1, we know from note 2 that tmp[len] <= 1 for i = 0.
C     Assume this is true for index i-1. Then
C                x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1
C                  <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64
C                  = 2*2^128 - 1 = 1:(2^64-1):(2^64-1),
C     so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction.
C
C Register vars: T0 = r17, T1 = r14, CY = r10, XI = r12, U = r11
C                YP = r5, MP = r6, TP = r1 (stack ptr)
C                r0, r8, r9 = scratch (loaded limbs and partial products)
C                r15 = tmp[15], r16 = constant 0 (used with addze)
C
C r13 is deliberately left untouched: the 64-bit PowerPC ELF ABI reserves it
C as the thread pointer, so it must not hold a temporary even if it is
C restored before returning. r14-r17 are non-volatile and are saved on entry
C and restored on exit.
C

C local variables: tmp[0 ... 15] array, having 15+1 8-byte words
C The tmp array needs 15+1 entries, but tmp[15] is stored in
C r15, so only 15 entries are used in the stack.
C
C Stack layout after the prologue (r1 lowered by 4*8 + 120 = 152 bytes):
C     0(r1) ... 112(r1)   tmp[0] ... tmp[14]
C   120(r1)               saved r16
C   128(r1)               saved r15
C   136(r1)               saved r14
C   144(r1)               saved r17
C No back chain word is written and the routine calls no other function.


	TEXT
	.align	5	C powerPC 32 byte alignment
.GSYM_PREFIX`'mulredc15:

C ########################################################################
C # i = 0 pass
C #########################################################################

C Pass for j = 0. We need to fetch x[i] from memory and compute the new u.
C The register saves are interleaved with the first multiplications.

	ld      r12, 0(r4)		C XI = x[0]
	ld      r0, 0(r5)		C y[0]
	stdu    r17, -8(r1)		C save r17
	mulld   r8, r0, r12		C x[0]*y[0] low half
	stdu    r14, -8(r1)		C save r14
	mulhdu  r9, r0, r12		C x[0]*y[0] high half
	ld      r0, 0(r6)		C m[0]
	mulld   r11, r7, r8		C U = T0*invm mod 2^64
	stdu    r15, -8(r1)		C save r15
	mulld   r17, r0, r11		C T0 = U*m[0] low
	stdu    r16, -8(r1)		C save r16
	li      r16, 0			C set r16 to zero for carry propagation
	subi    r1, r1, 120		C set tmp stack space
	mulhdu  r14, r0, r11		C T1 = U*m[0] high
	ld      r0, 8(r5)		C y[1]
	addc    r8, r8, r17		C low words summed only for the carry out
	adde    r17, r9, r14		C T0 = initial tmp(0)
	addze   r10, r16		C carry to CY
	C CY:T1:T0 <= 2*(2^64-1)^2 = 2*2^128 - 4*2^64 + 2, hence
	C CY:T1 <= 2*2^64 - 4

C Pass for j = 1

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 8(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 16(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 0(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3

C Pass for j = 2 (same instruction pattern and bounds as j = 1)

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 16(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 24(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 8(r1)		C store tmp[j-1]

C Pass for j = 3 (same instruction pattern and bounds as j = 1)

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 24(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 32(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 16(r1)		C store tmp[j-1]

C Pass for j = 4 (same instruction pattern and bounds as j = 1)

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 32(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 40(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 24(r1)		C store tmp[j-1]

C Pass for j = 5 (same instruction pattern and bounds as j = 1)

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 40(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 48(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 32(r1)		C store tmp[j-1]

C Pass for j = 6 (same instruction pattern and bounds as j = 1)

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 48(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 56(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 40(r1)		C store tmp[j-1]

C Pass for j = 7 (same instruction pattern and bounds as j = 1)

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 56(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 64(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 48(r1)		C store tmp[j-1]

C Pass for j = 8 (same instruction pattern and bounds as j = 1)

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 64(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 72(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 56(r1)		C store tmp[j-1]

C Pass for j = 9 (same instruction pattern and bounds as j = 1)

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 72(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 80(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 64(r1)		C store tmp[j-1]

C Pass for j = 10 (same instruction pattern and bounds as j = 1)

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 80(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 88(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 72(r1)		C store tmp[j-1]

C Pass for j = 11 (same instruction pattern and bounds as j = 1)

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 88(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 96(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 80(r1)		C store tmp[j-1]

C Pass for j = 12 (same instruction pattern and bounds as j = 1)

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 96(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 104(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 88(r1)		C store tmp[j-1]

C Pass for j = 13 (same instruction pattern and bounds as j = 1)

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 104(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 112(r5)		C y[j+1]
	adde    r17, r9, r14		C add high word with carry to T1
	addze   r10, r16		C carry to CY
	std     r8, 96(r1)		C store tmp[j-1]

C Pass for j = 14. Don't fetch new data from y[j+1].

	mulld   r8, r0, r12		C x[i]*y[j] low half
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 112(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	adde    r14, r9, r10		C add high word with carry + CY to T1
	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!

	mulld   r8, r0, r11		C U*m[j] low
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	adde    r17, r9, r14		C add high word with carry to T1
	std     r8, 104(r1)		C store tmp[len-2]
	addze   r15, r16		C put carry in r15 (tmp[len] <= 1)
	std     r17, 112(r1)		C store tmp[len-1]


C #########################################################################
C # i > 0 passes
C #########################################################################


	li      r9, 14			C outer loop count
	mtctr   r9

1:

C Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory
C and compute the new u

	ldu     r12, 8(r4)		C x[i], r4 advances to the next limb
	ld      r0, 0(r5)		C y[0]
	ld      r17, 0(r1)		C tmp[0]
	mulld   r8, r0, r12		C x[i]*y[0] low half
	ld      r14, 8(r1)		C tmp[1]
	mulhdu  r9, r0, r12		C x[i]*y[0] high half
	addc    r17, r8, r17		C T0
	ld      r0, 0(r6)		C m[0]
	mulld   r11, r7, r17		C U = T0*invm mod 2^64
	adde    r14, r9, r14		C T1
	mulld   r8, r0, r11		C U*m[0] low
	addze   r10, r16		C CY
	mulhdu  r9, r0, r11		C U*m[0] high
	ld      r0, 8(r5)		C y[1]
	addc    r8, r8, r17		C result = 0, only the carry out is used
	adde    r17, r9, r14		C T0, carry pending
	C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1,
	C so cy:T1 <= 3*2^64 - 4

C Pass for j = 1

	ld      r14, 16(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 8(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 16(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 0(r1)		C store tmp[j-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3

C Pass for j = 2 (same instruction pattern and bounds as j = 1)

	ld      r14, 24(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 16(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 24(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 8(r1)		C store tmp[j-1]

C Pass for j = 3 (same instruction pattern and bounds as j = 1)

	ld      r14, 32(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 24(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 32(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 16(r1)		C store tmp[j-1]

C Pass for j = 4 (same instruction pattern and bounds as j = 1)

	ld      r14, 40(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 32(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 40(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 24(r1)		C store tmp[j-1]

C Pass for j = 5 (same instruction pattern and bounds as j = 1)

	ld      r14, 48(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 40(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 48(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 32(r1)		C store tmp[j-1]

C Pass for j = 6 (same instruction pattern and bounds as j = 1)

	ld      r14, 56(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 48(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 56(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 40(r1)		C store tmp[j-1]

C Pass for j = 7 (same instruction pattern and bounds as j = 1)

	ld      r14, 64(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 56(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 64(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 48(r1)		C store tmp[j-1]

C Pass for j = 8 (same instruction pattern and bounds as j = 1)

	ld      r14, 72(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 64(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 72(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 56(r1)		C store tmp[j-1]

C Pass for j = 9 (same instruction pattern and bounds as j = 1)

	ld      r14, 80(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 72(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 80(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 64(r1)		C store tmp[j-1]

C Pass for j = 10 (same instruction pattern and bounds as j = 1)

	ld      r14, 88(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 80(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 88(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 72(r1)		C store tmp[j-1]

C Pass for j = 11 (same instruction pattern and bounds as j = 1)

	ld      r14, 96(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 88(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 96(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 80(r1)		C store tmp[j-1]

C Pass for j = 12 (same instruction pattern and bounds as j = 1)

	ld      r14, 104(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 96(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 104(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 88(r1)		C store tmp[j-1]

C Pass for j = 13 (same instruction pattern and bounds as j = 1)

	ld      r14, 112(r1)		C tmp[j+1]
	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
	addze   r10, r16		C carry to CY
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 104(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r10		C add carry to CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	ld      r0, 112(r5)		C y[j+1]
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 96(r1)		C store tmp[j-1]

C Pass for j = 14. Don't fetch new data from y[j+1].

	mulld   r8, r0, r12		C x[i]*y[j] low half
	adde    r14, r15, r10		C T1 = tmp[len] + CY + pending carry
	C since tmp[len] <= 1, T1 <= 3 and carry is zero
	mulhdu  r9, r0, r12		C x[i]*y[j] high half
	ld      r0, 112(r6)		C m[j]
	addc    r17, r8, r17		C add low word to T0
	mulld   r8, r0, r11		C U*m[j] low
	adde    r14, r9, r14		C add high to T1
	addze   r10, r16		C CY
	mulhdu  r9, r0, r11		C U*m[j] high
	addc    r8, r8, r17		C add T0 and low word
	adde    r17, r9, r14		C T1, carry pending
	std     r8, 104(r1)		C store tmp[len-2]
	addze   r15, r10		C store tmp[len] <= 1
	std     r17, 112(r1)		C store tmp[len-1]
	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64
	C          <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1)

	bdnz 1b

C Copy result from tmp memory to z.
C The update-form loads and stores step r1 and r3 by 8 bytes per limb, so
C after the last limb r1 points at tmp[14], 8 bytes below the saved r16.

	ld      r8, 0(r1)
	ldu     r9, 8(r1)
	std     r8, 0(r3)
	stdu    r9, 8(r3)
	ldu     r8, 8(r1)
	ldu     r9, 8(r1)
	stdu    r8, 8(r3)
	stdu    r9, 8(r3)
	ldu     r8, 8(r1)
	ldu     r9, 8(r1)
	stdu    r8, 8(r3)
	stdu    r9, 8(r3)
	ldu     r8, 8(r1)
	ldu     r9, 8(r1)
	stdu    r8, 8(r3)
	stdu    r9, 8(r3)
	ldu     r8, 8(r1)
	ldu     r9, 8(r1)
	stdu    r8, 8(r3)
	stdu    r9, 8(r3)
	ldu     r8, 8(r1)
	ldu     r9, 8(r1)
	stdu    r8, 8(r3)
	stdu    r9, 8(r3)
	ldu     r8, 8(r1)
	ldu     r9, 8(r1)
	stdu    r8, 8(r3)
	stdu    r9, 8(r3)
	ldu     r8, 8(r1)
	stdu    r8, 8(r3)

C Return value and register restore, in reverse order of the saves

	mr      r3, r15         C return tmp(len)
	ldu     r16, 8(r1)		C restore r16
	ldu     r15, 8(r1)		C restore r15
	ldu     r14, 8(r1)		C restore r14
	ldu     r17, 8(r1)		C restore r17
	addi    r1, r1, 8		C r1 is back at its value on entry
	blr

	.size	.GSYM_PREFIX`'mulredc15, .-.GSYM_PREFIX`'mulredc15
