1dnl ******************************************************************************
2dnl   Copyright 2009 Paul Zimmermann and Alexander Kruppa.
3dnl
4dnl   This file is part of the ECM Library.
5dnl
6dnl   The ECM Library is free software; you can redistribute it and/or modify
7dnl   it under the terms of the GNU Lesser General Public License as published by
8dnl   the Free Software Foundation; either version 3 of the License, or (at your
9dnl   option) any later version.
10dnl
11dnl   The ECM Library is distributed in the hope that it will be useful, but
12dnl   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13dnl   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
14dnl   License for more details.
15dnl
16dnl   You should have received a copy of the GNU Lesser General Public License
17dnl   along with the ECM Library; see the file COPYING.LIB.  If not, write to
18dnl   the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
19dnl   MA 02110-1301, USA.
20dnl ******************************************************************************
21
dnl C is the comment marker used in the rest of this file: it expands to a
dnl newline followed by dnl, so m4 discards the remainder of the line.
22define(C, `
23dnl')
24
25C mp_limb_t mulredc14(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
26C                 const mp_limb_t *m, mp_limb_t inv_m);
27C
28C arguments:
29C r3 = ptr to result z least significant limb
30C r4 = ptr to input x least significant limb
31C r5 = ptr to input y least significant limb
32C r6 = ptr to modulus m least significant limb
33C r7 = -1/m mod 2^64
34C
35C final carry returned in r3
36
37
38
39include(`config.m4')
40
C Export both the descriptor symbol and the dot-prefixed code entry symbol.
41	GLOBL GSYM_PREFIX`'mulredc14
42	GLOBL .GSYM_PREFIX`'mulredc14
43
C Function descriptor, placed in the .opd section: three 8-byte words holding
C the address of the code entry point, the TOC base, and 0. That is 24 bytes
C in total, which matches the .size directive below.
44	.section ".opd", "aw"
45	.align	3
46GSYM_PREFIX`'mulredc14:
47	.quad	.GSYM_PREFIX`'mulredc14, .TOC.@tocbase, 0
48	.size	GSYM_PREFIX`'mulredc14, 24
49
50
51C Implements multiplication and REDC for two input numbers of 14 words
52
53C The algorithm:
54C   (Notation: a:b:c == a * 2^128 + b * 2^64 + c)
55C
56C T1:T0 = x[i]*y[0] ;
57C u = (T0*invm) % 2^64 ;
58C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */
59C for (j = 1; j < len; j++)
60C   {
61C     cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ;
62C        /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */
63C     tmp[j-1] = T0;
64C   }
65C tmp[len-1] = T1 ;
66C tmp[len] = cy ; /* cy <= 1 (see note 2) */
67C for (i = 1; i < len; i++)
68C   {
69C     cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ;
70C     u = (T0*invm) % 2^64 ;
71C     cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */
72C     for (j = 1; j < len; j++)
73C       {
74C         cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ;
75C         /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3
76C            for j = (len-1), result cy:T1 <= 2*2^64 - 1  (see note 4) */
77C         tmp[j-1] = T0;
78C       }
79C     tmp[len-1] = T1 ;
80C     tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */
81C   }
82C z[0 ... len-1] = tmp[0 ... len-1] ;
83C return (tmp[len]) ;
84C
85C notes:
86C
87C 1:  m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2,
88C     so cy:T1 <= 2*2^64 - 4.
89C 2:  For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4
90C                 <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2),
91C     so cy:T1 <= 2*2^64 - 3. For j > 1,
92C     x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1),
93C     so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j.
94C 3:  m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1,
95C     so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4)
96C 4:  For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1
97C                  <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64
98C                  <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2),
99C     so cy:T1 <= 3*2^64 - 3. For j > 1,
100C     x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1),
101C     so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1.
102C     For j = len - 1, we know from note 2 that tmp[len] <= 1 for i = 0.
103C     Assume this is true for index i-1. Then
104C                x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1
105C                  <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64
106C                  <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1),
107C     so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction.
108C
109C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11
110C                YP = r5, MP = r6, TP = r1 (stack ptr)
111C
112
113C local variables: tmp[0 ... 14] array, having 14+1 8-byte words
114C The tmp array needs 14+1 entries, but tmp[14] is stored in
115C r15, so only 14 entries are kept on the stack.
116
117
118	TEXT
119	.align	5	C powerPC 32 byte alignment
120.GSYM_PREFIX`'mulredc14:
C Code entry point of mulredc14: fully unrolled 14-word multiply and REDC.
C
C Stack and register usage, as established by the code below:
C   - r13..r16 are pushed with stdu at -8, -16, -24 and -32 from the entry
C     value of r1; r1 is then lowered by a further 112 bytes (14 words), so
C     tmp[0..13] lives at 0(r1)..104(r1) and the saved registers sit above it.
C   - r16 is set to 0 once and serves as the addend of addze, which moves the
C     carry bit into a register.
C   - r15 holds tmp[14], the top carry word, and becomes the return value.
C   - r0 holds the y[j] or m[j] word just loaded; r8 and r9 hold the low and
C     high halves of the current product.
C   - r4 is advanced by ldu once per outer iteration; r5, r6 and r7 are only
C     read.
C   - The add-with-carry chains are interleaved with mulld/mulhdu/ld/std, so
C     they rely on those instructions leaving the carry bit untouched.
C NOTE(review): r1 is lowered without writing a back chain word; presumably
C acceptable because this routine calls nothing -- confirm against the ABI.
121
122C ########################################################################
123C # i = 0 pass
124C #########################################################################
125
126C Pass for j = 0. We need to fetch x[i] from memory and compute the new u
127
128	ld      r12, 0(r4)		C XI = x[0]
129	ld      r0, 0(r5)		C y[0]
130	stdu    r13, -8(r1)		C save r13
131	mulld   r8, r0, r12		C x[0]*y[0] low half
132	stdu    r14, -8(r1)		C save r14
133	mulhdu  r9, r0, r12		C x[0]*y[0] high half
134	ld      r0, 0(r6)		C m[0]
135	mulld   r11, r7, r8		C U = T0*invm mod 2^64
136	stdu    r15, -8(r1)		C save r15
137	mulld   r13, r0, r11		C T0 = U*m[0] low
138	stdu    r16, -8(r1)		C save r16
139	li      r16, 0			C set r16 to zero for carry propagation
140	subi    r1, r1, 112		C set tmp stack space
141	mulhdu  r14, r0, r11		C T1 = U*m[0] high
142	ld      r0, 8(r5)		C y[1]
143	addc    r8, r8, r13		C low word is 0 by choice of U; keep the carry
144	adde    r13, r9, r14		C T0 = initial tmp(0)
145	addze   r10, r16		C carry to CY
146	C CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence
147	C CY:T1 <= 2*2^64 - 4
148
149C Pass for j = 1
150
151	mulld   r8, r0, r12		C x[i]*y[j] low half
152	mulhdu  r9, r0, r12		C x[i]*y[j] high half
153	ld      r0, 8(r6)		C m[j]
154	addc    r13, r8, r13		C add low word to T0
155	adde    r14, r9, r10		C add high word with carry + CY to T1
156	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
157
158	mulld   r8, r0, r11		C U*m[j] low
159	mulhdu  r9, r0, r11		C U*m[j] high
160	addc    r8, r8, r13		C add T0 and low word
161	ld      r0, 16(r5)		C y[j+1]
162	adde    r13, r9, r14		C add high word with carry to T1
163	addze   r10, r16		C carry to CY
164	std     r8, 0(r1)		C store tmp[j-1]
165	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
166	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
167
168C Pass for j = 2
169
170	mulld   r8, r0, r12		C x[i]*y[j] low half
171	mulhdu  r9, r0, r12		C x[i]*y[j] high half
172	ld      r0, 16(r6)		C m[j]
173	addc    r13, r8, r13		C add low word to T0
174	adde    r14, r9, r10		C add high word with carry + CY to T1
175	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
176
177	mulld   r8, r0, r11		C U*m[j] low
178	mulhdu  r9, r0, r11		C U*m[j] high
179	addc    r8, r8, r13		C add T0 and low word
180	ld      r0, 24(r5)		C y[j+1]
181	adde    r13, r9, r14		C add high word with carry to T1
182	addze   r10, r16		C carry to CY
183	std     r8, 8(r1)		C store tmp[j-1]
184	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
185	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
186
187C Pass for j = 3
188
189	mulld   r8, r0, r12		C x[i]*y[j] low half
190	mulhdu  r9, r0, r12		C x[i]*y[j] high half
191	ld      r0, 24(r6)		C m[j]
192	addc    r13, r8, r13		C add low word to T0
193	adde    r14, r9, r10		C add high word with carry + CY to T1
194	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
195
196	mulld   r8, r0, r11		C U*m[j] low
197	mulhdu  r9, r0, r11		C U*m[j] high
198	addc    r8, r8, r13		C add T0 and low word
199	ld      r0, 32(r5)		C y[j+1]
200	adde    r13, r9, r14		C add high word with carry to T1
201	addze   r10, r16		C carry to CY
202	std     r8, 16(r1)		C store tmp[j-1]
203	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
204	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
205
206C Pass for j = 4
207
208	mulld   r8, r0, r12		C x[i]*y[j] low half
209	mulhdu  r9, r0, r12		C x[i]*y[j] high half
210	ld      r0, 32(r6)		C m[j]
211	addc    r13, r8, r13		C add low word to T0
212	adde    r14, r9, r10		C add high word with carry + CY to T1
213	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
214
215	mulld   r8, r0, r11		C U*m[j] low
216	mulhdu  r9, r0, r11		C U*m[j] high
217	addc    r8, r8, r13		C add T0 and low word
218	ld      r0, 40(r5)		C y[j+1]
219	adde    r13, r9, r14		C add high word with carry to T1
220	addze   r10, r16		C carry to CY
221	std     r8, 24(r1)		C store tmp[j-1]
222	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
223	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
224
225C Pass for j = 5
226
227	mulld   r8, r0, r12		C x[i]*y[j] low half
228	mulhdu  r9, r0, r12		C x[i]*y[j] high half
229	ld      r0, 40(r6)		C m[j]
230	addc    r13, r8, r13		C add low word to T0
231	adde    r14, r9, r10		C add high word with carry + CY to T1
232	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
233
234	mulld   r8, r0, r11		C U*m[j] low
235	mulhdu  r9, r0, r11		C U*m[j] high
236	addc    r8, r8, r13		C add T0 and low word
237	ld      r0, 48(r5)		C y[j+1]
238	adde    r13, r9, r14		C add high word with carry to T1
239	addze   r10, r16		C carry to CY
240	std     r8, 32(r1)		C store tmp[j-1]
241	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
242	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
243
244C Pass for j = 6
245
246	mulld   r8, r0, r12		C x[i]*y[j] low half
247	mulhdu  r9, r0, r12		C x[i]*y[j] high half
248	ld      r0, 48(r6)		C m[j]
249	addc    r13, r8, r13		C add low word to T0
250	adde    r14, r9, r10		C add high word with carry + CY to T1
251	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
252
253	mulld   r8, r0, r11		C U*m[j] low
254	mulhdu  r9, r0, r11		C U*m[j] high
255	addc    r8, r8, r13		C add T0 and low word
256	ld      r0, 56(r5)		C y[j+1]
257	adde    r13, r9, r14		C add high word with carry to T1
258	addze   r10, r16		C carry to CY
259	std     r8, 40(r1)		C store tmp[j-1]
260	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
261	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
262
263C Pass for j = 7
264
265	mulld   r8, r0, r12		C x[i]*y[j] low half
266	mulhdu  r9, r0, r12		C x[i]*y[j] high half
267	ld      r0, 56(r6)		C m[j]
268	addc    r13, r8, r13		C add low word to T0
269	adde    r14, r9, r10		C add high word with carry + CY to T1
270	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
271
272	mulld   r8, r0, r11		C U*m[j] low
273	mulhdu  r9, r0, r11		C U*m[j] high
274	addc    r8, r8, r13		C add T0 and low word
275	ld      r0, 64(r5)		C y[j+1]
276	adde    r13, r9, r14		C add high word with carry to T1
277	addze   r10, r16		C carry to CY
278	std     r8, 48(r1)		C store tmp[j-1]
279	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
280	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
281
282C Pass for j = 8
283
284	mulld   r8, r0, r12		C x[i]*y[j] low half
285	mulhdu  r9, r0, r12		C x[i]*y[j] high half
286	ld      r0, 64(r6)		C m[j]
287	addc    r13, r8, r13		C add low word to T0
288	adde    r14, r9, r10		C add high word with carry + CY to T1
289	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
290
291	mulld   r8, r0, r11		C U*m[j] low
292	mulhdu  r9, r0, r11		C U*m[j] high
293	addc    r8, r8, r13		C add T0 and low word
294	ld      r0, 72(r5)		C y[j+1]
295	adde    r13, r9, r14		C add high word with carry to T1
296	addze   r10, r16		C carry to CY
297	std     r8, 56(r1)		C store tmp[j-1]
298	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
299	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
300
301C Pass for j = 9
302
303	mulld   r8, r0, r12		C x[i]*y[j] low half
304	mulhdu  r9, r0, r12		C x[i]*y[j] high half
305	ld      r0, 72(r6)		C m[j]
306	addc    r13, r8, r13		C add low word to T0
307	adde    r14, r9, r10		C add high word with carry + CY to T1
308	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
309
310	mulld   r8, r0, r11		C U*m[j] low
311	mulhdu  r9, r0, r11		C U*m[j] high
312	addc    r8, r8, r13		C add T0 and low word
313	ld      r0, 80(r5)		C y[j+1]
314	adde    r13, r9, r14		C add high word with carry to T1
315	addze   r10, r16		C carry to CY
316	std     r8, 64(r1)		C store tmp[j-1]
317	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
318	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
319
320C Pass for j = 10
321
322	mulld   r8, r0, r12		C x[i]*y[j] low half
323	mulhdu  r9, r0, r12		C x[i]*y[j] high half
324	ld      r0, 80(r6)		C m[j]
325	addc    r13, r8, r13		C add low word to T0
326	adde    r14, r9, r10		C add high word with carry + CY to T1
327	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
328
329	mulld   r8, r0, r11		C U*m[j] low
330	mulhdu  r9, r0, r11		C U*m[j] high
331	addc    r8, r8, r13		C add T0 and low word
332	ld      r0, 88(r5)		C y[j+1]
333	adde    r13, r9, r14		C add high word with carry to T1
334	addze   r10, r16		C carry to CY
335	std     r8, 72(r1)		C store tmp[j-1]
336	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
337	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
338
339C Pass for j = 11
340
341	mulld   r8, r0, r12		C x[i]*y[j] low half
342	mulhdu  r9, r0, r12		C x[i]*y[j] high half
343	ld      r0, 88(r6)		C m[j]
344	addc    r13, r8, r13		C add low word to T0
345	adde    r14, r9, r10		C add high word with carry + CY to T1
346	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
347
348	mulld   r8, r0, r11		C U*m[j] low
349	mulhdu  r9, r0, r11		C U*m[j] high
350	addc    r8, r8, r13		C add T0 and low word
351	ld      r0, 96(r5)		C y[j+1]
352	adde    r13, r9, r14		C add high word with carry to T1
353	addze   r10, r16		C carry to CY
354	std     r8, 80(r1)		C store tmp[j-1]
355	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
356	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
357
358C Pass for j = 12
359
360	mulld   r8, r0, r12		C x[i]*y[j] low half
361	mulhdu  r9, r0, r12		C x[i]*y[j] high half
362	ld      r0, 96(r6)		C m[j]
363	addc    r13, r8, r13		C add low word to T0
364	adde    r14, r9, r10		C add high word with carry + CY to T1
365	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
366
367	mulld   r8, r0, r11		C U*m[j] low
368	mulhdu  r9, r0, r11		C U*m[j] high
369	addc    r8, r8, r13		C add T0 and low word
370	ld      r0, 104(r5)		C y[j+1]
371	adde    r13, r9, r14		C add high word with carry to T1
372	addze   r10, r16		C carry to CY
373	std     r8, 88(r1)		C store tmp[j-1]
374	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
375	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
376
377C Pass for j = 13. Don't fetch new data from y[j+1].
378
379	mulld   r8, r0, r12		C x[i]*y[j] low half
380	mulhdu  r9, r0, r12		C x[i]*y[j] high half
381	ld      r0, 104(r6)		C m[j]
382	addc    r13, r8, r13		C add low word to T0
383	adde    r14, r9, r10		C add high word with carry + CY to T1
384	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
385
386	mulld   r8, r0, r11		C U*m[j] low
387	mulhdu  r9, r0, r11		C U*m[j] high
388	addc    r8, r8, r13		C add T0 and low word
389	adde    r13, r9, r14		C add high word with carry to T1
390	std     r8, 96(r1)		C store tmp[len-2]
391	addze   r15, r16		C put carry in r15 (tmp[len] <= 1)
392	std     r13, 104(r1)		C store tmp[len-1]
393
394
395C #########################################################################
396C # i > 0 passes
397C #########################################################################
398
399
400	li      r9, 13			C outer loop count: the body below runs for i = 1 .. 13
401	mtctr   r9
402
4031:
404
405C Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory
406C and compute the new u
407
408	ldu     r12, 8(r4)		C x[i]
409	ld      r0, 0(r5)		C y[0]
410	ld      r13, 0(r1)		C tmp[0]
411	mulld   r8, r0, r12		C x[i]*y[0] low half
412	ld      r14, 8(r1)		C tmp[1]
413	mulhdu  r9, r0, r12		C x[i]*y[0] high half
414	addc    r13, r8, r13		C T0
415	ld      r0, 0(r6)		C m[0]
416	mulld   r11, r7, r13		C U = T0*invm mod 2^64
417	adde    r14, r9, r14		C T1
418	mulld   r8, r0, r11		C U*m[0] low
419	addze   r10, r16		C CY
420	mulhdu  r9, r0, r11		C U*m[0] high
421	ld      r0, 8(r5)		C y[1]
422	addc    r8, r8, r13		C result = 0 (by choice of U); only the carry is kept
423	adde    r13, r9, r14		C T0, carry pending
424	C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1,
425	C so cy:T1 <= 3*2^64 - 4
426
427C Pass for j = 1
428
429	ld      r14, 16(r1)		C tmp[j+1]
430	mulld   r8, r0, r12		C x[i]*y[j] low half
431	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
432	addze   r10, r16		C carry to CY
433	mulhdu  r9, r0, r12		C x[i]*y[j] high half
434	ld      r0, 8(r6)		C m[j]
435	addc    r13, r8, r13		C add low word to T0
436	mulld   r8, r0, r11		C U*m[j] low
437	adde    r14, r9, r14		C add high to T1
438	addze   r10, r10		C add carry to CY
439	mulhdu  r9, r0, r11		C U*m[j] high
440	addc    r8, r8, r13		C add T0 and low word
441	ld      r0, 16(r5)		C y[j+1]
442	adde    r13, r9, r14		C T1, carry pending
443	std     r8, 0(r1)		C store tmp[j-1]
444	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
445	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
446
447C Pass for j = 2
448
449	ld      r14, 24(r1)		C tmp[j+1]
450	mulld   r8, r0, r12		C x[i]*y[j] low half
451	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
452	addze   r10, r16		C carry to CY
453	mulhdu  r9, r0, r12		C x[i]*y[j] high half
454	ld      r0, 16(r6)		C m[j]
455	addc    r13, r8, r13		C add low word to T0
456	mulld   r8, r0, r11		C U*m[j] low
457	adde    r14, r9, r14		C add high to T1
458	addze   r10, r10		C add carry to CY
459	mulhdu  r9, r0, r11		C U*m[j] high
460	addc    r8, r8, r13		C add T0 and low word
461	ld      r0, 24(r5)		C y[j+1]
462	adde    r13, r9, r14		C T1, carry pending
463	std     r8, 8(r1)		C store tmp[j-1]
464	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
465	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
466
467C Pass for j = 3
468
469	ld      r14, 32(r1)		C tmp[j+1]
470	mulld   r8, r0, r12		C x[i]*y[j] low half
471	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
472	addze   r10, r16		C carry to CY
473	mulhdu  r9, r0, r12		C x[i]*y[j] high half
474	ld      r0, 24(r6)		C m[j]
475	addc    r13, r8, r13		C add low word to T0
476	mulld   r8, r0, r11		C U*m[j] low
477	adde    r14, r9, r14		C add high to T1
478	addze   r10, r10		C add carry to CY
479	mulhdu  r9, r0, r11		C U*m[j] high
480	addc    r8, r8, r13		C add T0 and low word
481	ld      r0, 32(r5)		C y[j+1]
482	adde    r13, r9, r14		C T1, carry pending
483	std     r8, 16(r1)		C store tmp[j-1]
484	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
485	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
486
487C Pass for j = 4
488
489	ld      r14, 40(r1)		C tmp[j+1]
490	mulld   r8, r0, r12		C x[i]*y[j] low half
491	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
492	addze   r10, r16		C carry to CY
493	mulhdu  r9, r0, r12		C x[i]*y[j] high half
494	ld      r0, 32(r6)		C m[j]
495	addc    r13, r8, r13		C add low word to T0
496	mulld   r8, r0, r11		C U*m[j] low
497	adde    r14, r9, r14		C add high to T1
498	addze   r10, r10		C add carry to CY
499	mulhdu  r9, r0, r11		C U*m[j] high
500	addc    r8, r8, r13		C add T0 and low word
501	ld      r0, 40(r5)		C y[j+1]
502	adde    r13, r9, r14		C T1, carry pending
503	std     r8, 24(r1)		C store tmp[j-1]
504	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
505	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
506
507C Pass for j = 5
508
509	ld      r14, 48(r1)		C tmp[j+1]
510	mulld   r8, r0, r12		C x[i]*y[j] low half
511	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
512	addze   r10, r16		C carry to CY
513	mulhdu  r9, r0, r12		C x[i]*y[j] high half
514	ld      r0, 40(r6)		C m[j]
515	addc    r13, r8, r13		C add low word to T0
516	mulld   r8, r0, r11		C U*m[j] low
517	adde    r14, r9, r14		C add high to T1
518	addze   r10, r10		C add carry to CY
519	mulhdu  r9, r0, r11		C U*m[j] high
520	addc    r8, r8, r13		C add T0 and low word
521	ld      r0, 48(r5)		C y[j+1]
522	adde    r13, r9, r14		C T1, carry pending
523	std     r8, 32(r1)		C store tmp[j-1]
524	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
525	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
526
527C Pass for j = 6
528
529	ld      r14, 56(r1)		C tmp[j+1]
530	mulld   r8, r0, r12		C x[i]*y[j] low half
531	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
532	addze   r10, r16		C carry to CY
533	mulhdu  r9, r0, r12		C x[i]*y[j] high half
534	ld      r0, 48(r6)		C m[j]
535	addc    r13, r8, r13		C add low word to T0
536	mulld   r8, r0, r11		C U*m[j] low
537	adde    r14, r9, r14		C add high to T1
538	addze   r10, r10		C add carry to CY
539	mulhdu  r9, r0, r11		C U*m[j] high
540	addc    r8, r8, r13		C add T0 and low word
541	ld      r0, 56(r5)		C y[j+1]
542	adde    r13, r9, r14		C T1, carry pending
543	std     r8, 40(r1)		C store tmp[j-1]
544	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
545	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
546
547C Pass for j = 7
548
549	ld      r14, 64(r1)		C tmp[j+1]
550	mulld   r8, r0, r12		C x[i]*y[j] low half
551	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
552	addze   r10, r16		C carry to CY
553	mulhdu  r9, r0, r12		C x[i]*y[j] high half
554	ld      r0, 56(r6)		C m[j]
555	addc    r13, r8, r13		C add low word to T0
556	mulld   r8, r0, r11		C U*m[j] low
557	adde    r14, r9, r14		C add high to T1
558	addze   r10, r10		C add carry to CY
559	mulhdu  r9, r0, r11		C U*m[j] high
560	addc    r8, r8, r13		C add T0 and low word
561	ld      r0, 64(r5)		C y[j+1]
562	adde    r13, r9, r14		C T1, carry pending
563	std     r8, 48(r1)		C store tmp[j-1]
564	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
565	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
566
567C Pass for j = 8
568
569	ld      r14, 72(r1)		C tmp[j+1]
570	mulld   r8, r0, r12		C x[i]*y[j] low half
571	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
572	addze   r10, r16		C carry to CY
573	mulhdu  r9, r0, r12		C x[i]*y[j] high half
574	ld      r0, 64(r6)		C m[j]
575	addc    r13, r8, r13		C add low word to T0
576	mulld   r8, r0, r11		C U*m[j] low
577	adde    r14, r9, r14		C add high to T1
578	addze   r10, r10		C add carry to CY
579	mulhdu  r9, r0, r11		C U*m[j] high
580	addc    r8, r8, r13		C add T0 and low word
581	ld      r0, 72(r5)		C y[j+1]
582	adde    r13, r9, r14		C T1, carry pending
583	std     r8, 56(r1)		C store tmp[j-1]
584	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
585	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
586
587C Pass for j = 9
588
589	ld      r14, 80(r1)		C tmp[j+1]
590	mulld   r8, r0, r12		C x[i]*y[j] low half
591	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
592	addze   r10, r16		C carry to CY
593	mulhdu  r9, r0, r12		C x[i]*y[j] high half
594	ld      r0, 72(r6)		C m[j]
595	addc    r13, r8, r13		C add low word to T0
596	mulld   r8, r0, r11		C U*m[j] low
597	adde    r14, r9, r14		C add high to T1
598	addze   r10, r10		C add carry to CY
599	mulhdu  r9, r0, r11		C U*m[j] high
600	addc    r8, r8, r13		C add T0 and low word
601	ld      r0, 80(r5)		C y[j+1]
602	adde    r13, r9, r14		C T1, carry pending
603	std     r8, 64(r1)		C store tmp[j-1]
604	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
605	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
606
607C Pass for j = 10
608
609	ld      r14, 88(r1)		C tmp[j+1]
610	mulld   r8, r0, r12		C x[i]*y[j] low half
611	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
612	addze   r10, r16		C carry to CY
613	mulhdu  r9, r0, r12		C x[i]*y[j] high half
614	ld      r0, 80(r6)		C m[j]
615	addc    r13, r8, r13		C add low word to T0
616	mulld   r8, r0, r11		C U*m[j] low
617	adde    r14, r9, r14		C add high to T1
618	addze   r10, r10		C add carry to CY
619	mulhdu  r9, r0, r11		C U*m[j] high
620	addc    r8, r8, r13		C add T0 and low word
621	ld      r0, 88(r5)		C y[j+1]
622	adde    r13, r9, r14		C T1, carry pending
623	std     r8, 72(r1)		C store tmp[j-1]
624	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
625	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
626
627C Pass for j = 11
628
629	ld      r14, 96(r1)		C tmp[j+1]
630	mulld   r8, r0, r12		C x[i]*y[j] low half
631	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
632	addze   r10, r16		C carry to CY
633	mulhdu  r9, r0, r12		C x[i]*y[j] high half
634	ld      r0, 88(r6)		C m[j]
635	addc    r13, r8, r13		C add low word to T0
636	mulld   r8, r0, r11		C U*m[j] low
637	adde    r14, r9, r14		C add high to T1
638	addze   r10, r10		C add carry to CY
639	mulhdu  r9, r0, r11		C U*m[j] high
640	addc    r8, r8, r13		C add T0 and low word
641	ld      r0, 96(r5)		C y[j+1]
642	adde    r13, r9, r14		C T1, carry pending
643	std     r8, 80(r1)		C store tmp[j-1]
644	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
645	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
646
647C Pass for j = 12
648
649	ld      r14, 104(r1)		C tmp[j+1]
650	mulld   r8, r0, r12		C x[i]*y[j] low half
651	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
652	addze   r10, r16		C carry to CY
653	mulhdu  r9, r0, r12		C x[i]*y[j] high half
654	ld      r0, 96(r6)		C m[j]
655	addc    r13, r8, r13		C add low word to T0
656	mulld   r8, r0, r11		C U*m[j] low
657	adde    r14, r9, r14		C add high to T1
658	addze   r10, r10		C add carry to CY
659	mulhdu  r9, r0, r11		C U*m[j] high
660	addc    r8, r8, r13		C add T0 and low word
661	ld      r0, 104(r5)		C y[j+1]
662	adde    r13, r9, r14		C T1, carry pending
663	std     r8, 88(r1)		C store tmp[j-1]
664	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
665	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
666
667C Pass for j = 13. Don't fetch new data from y[j+1].
668
669	mulld   r8, r0, r12		C x[i]*y[j] low half
670	adde    r14, r15, r10		C T1 = tmp[len] + CY + pending carry
671	C since tmp[len] <= 1, T1 <= 3 and carry is zero
672	mulhdu  r9, r0, r12		C x[i]*y[j] high half
673	ld      r0, 104(r6)		C m[j]
674	addc    r13, r8, r13		C add low word to T0
675	mulld   r8, r0, r11		C U*m[j] low
676	adde    r14, r9, r14		C add high to T1
677	addze   r10, r16		C CY
678	mulhdu  r9, r0, r11		C U*m[j] high
679	addc    r8, r8, r13		C add T0 and low word
680	adde    r13, r9, r14		C T1, carry pending
681	std     r8, 96(r1)		C store tmp[len-2]
682	addze   r15, r10		C store tmp[len] <= 1
683	std     r13, 104(r1)		C store tmp[len-1]
684	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64
685	C          <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1)
686
687	bdnz 1b
688
689C Copy result from tmp memory to z
C Each ldu/stdu below first advances r1 (resp. r3) by 8 and then accesses the
C new address, so the 14 words are copied two at a time in ascending order.
C After the last ldu, r1 points at tmp[13].
690
691	ld      r8, 0(r1)
692	ldu     r9, 8(r1)
693	std     r8, 0(r3)
694	stdu    r9, 8(r3)
695	ldu     r8, 8(r1)
696	ldu     r9, 8(r1)
697	stdu    r8, 8(r3)
698	stdu    r9, 8(r3)
699	ldu     r8, 8(r1)
700	ldu     r9, 8(r1)
701	stdu    r8, 8(r3)
702	stdu    r9, 8(r3)
703	ldu     r8, 8(r1)
704	ldu     r9, 8(r1)
705	stdu    r8, 8(r3)
706	stdu    r9, 8(r3)
707	ldu     r8, 8(r1)
708	ldu     r9, 8(r1)
709	stdu    r8, 8(r3)
710	stdu    r9, 8(r3)
711	ldu     r8, 8(r1)
712	ldu     r9, 8(r1)
713	stdu    r8, 8(r3)
714	stdu    r9, 8(r3)
715	ldu     r8, 8(r1)
716	ldu     r9, 8(r1)
717	stdu    r8, 8(r3)
718	stdu    r9, 8(r3)
719
C Epilogue. The return value is taken from r15 before r15 is reloaded. With
C r1 at tmp[13], the four ldu instructions step through the save slots of
C r16, r15, r14 and r13 (the reverse of the push order), and the final addi
C returns r1 to the value it had on entry.
720	mr      r3, r15         C return tmp(len)
721	ldu     r16, 8(r1)
722	ldu     r15, 8(r1)
723	ldu     r14, 8(r1)
724	ldu     r13, 8(r1)
725	addi    r1, r1, 8
726	blr
727
728	.size	.GSYM_PREFIX`'mulredc14, .-.GSYM_PREFIX`'mulredc14
729
730