1dnl ******************************************************************************
2dnl   Copyright 2009 Paul Zimmermann and Alexander Kruppa.
3dnl
4dnl   This file is part of the ECM Library.
5dnl
6dnl   The ECM Library is free software; you can redistribute it and/or modify
7dnl   it under the terms of the GNU Lesser General Public License as published by
8dnl   the Free Software Foundation; either version 3 of the License, or (at your
9dnl   option) any later version.
10dnl
11dnl   The ECM Library is distributed in the hope that it will be useful, but
12dnl   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13dnl   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
14dnl   License for more details.
15dnl
16dnl   You should have received a copy of the GNU Lesser General Public License
17dnl   along with the ECM Library; see the file COPYING.LIB.  If not, write to
18dnl   the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
19dnl   MA 02110-1301, USA.
20dnl ******************************************************************************
21
dnl  Define C as this file's comment marker: it expands to a newline followed
dnl  by dnl, so m4 discards the remainder of any line on which C appears.
22define(C, `
23dnl')
24
25C mp_limb_t mulredc12(mp_limb_t * z, const mp_limb_t * x, const mp_limb_t * y,
26C                 const mp_limb_t *m, mp_limb_t inv_m);
27C
28C arguments:
29C r3 = ptr to result z least significant limb
30C r4 = ptr to input x least significant limb
31C r5 = ptr to input y least significant limb
32C r6 = ptr to modulus m least significant limb
33C r7 = -1/m mod 2^64
34C
35C final carry returned in r3
36
37
38
39include(`config.m4')
40
C Export two symbols: the plain name (the .opd entry defined below) and the
C dot-prefixed name (the code entry point defined in the text section).
C GLOBL and GSYM_PREFIX are not defined in this file; presumably they come
C from config.m4 -- confirm there.
41	GLOBL GSYM_PREFIX`'mulredc12
42	GLOBL .GSYM_PREFIX`'mulredc12
43
C Entry in the writable, allocated ".opd" section, aligned to 2^3 = 8 bytes.
C It holds three 8-byte words: the address of the code entry point, the TOC
C base, and 0, i.e. 24 bytes in total, which is what .size declares.
C NOTE(review): this matches the function-descriptor layout of the 64-bit
C PowerPC ELF (v1) ABI -- confirm against the targeted platforms.
44	.section ".opd", "aw"
45	.align	3
46GSYM_PREFIX`'mulredc12:
47	.quad	.GSYM_PREFIX`'mulredc12, .TOC.@tocbase, 0
48	.size	GSYM_PREFIX`'mulredc12, 24
49
50
51C Implements multiplication and REDC for two input numbers of 12 words
52
53C The algorithm:
54C   (Notation: a:b:c == a * 2^128 + b * 2^64 + c)
55C
56C T1:T0 = x[i]*y[0] ;
57C u = (T0*invm) % 2^64 ;
58C cy:T1 = (m[0]*u + T1:T0) / 2^64 ; /* cy:T1 <= 2*2^64 - 4 (see note 1) */
59C for (j = 1; j < len; j++)
60C   {
61C     cy:T1:T0 = x[i]*y[j] + m[j]*u + cy:T1 ;
62C        /* for all j result cy:T1 <= 2*2^64 - 3 (see note 2) */
63C     tmp[j-1] = T0;
64C   }
65C tmp[len-1] = T1 ;
66C tmp[len] = cy ; /* cy <= 1 (see note 2) */
67C for (i = 1; i < len; i++)
68C   {
69C     cy:T1:T0 = x[i]*y[0] + tmp[1]:tmp[0] ;
70C     u = (T0*invm) % 2^64 ;
71C     cy:T1 = (m[0]*u + cy:T1:T0) / 2^64 ; /* cy:T1 <= 3*2^64 - 4 (see note 3) */
72C     for (j = 1; j < len; j++)
73C       {
74C         cy:T1:T0 = x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 ;
75C         /* for all j < (len-1), result cy:T1 <= 3*2^64 - 3
76C            for j = (len-1), result cy:T1 <= 2*2^64 - 1  (see note 4) */
77C         tmp[j-1] = T0;
78C       }
79C     tmp[len-1] = T1 ;
80C     tmp[len] = cy ; /* cy <= 1 for all i (see note 4) */
81C   }
82C z[0 ... len-1] = tmp[0 ... len-1] ;
83C return (tmp[len]) ;
84C
85C notes:
86C
87C 1:  m[0]*u + T1:T0 <= 2*(2^64 - 1)^2 <= 2*2^128 - 4*2^64 + 2,
88C     so cy:T1 <= 2*2^64 - 4.
89C 2:  For j = 1, x[i]*y[j] + m[j]*u + cy:T1 <= 2*(2^64 - 1)^2 + 2*2^64 - 4
90C                 <= 2*2^128 - 2*2^64 - 2 = 1:(2^64-3):(2^64-2),
91C     so cy:T1 <= 2*2^64 - 3. For j > 1,
92C     x[i]*y[j] + m[j]*u + cy:T1 <= 2*2^128 - 2*2^64 - 1 = 1:(2^64-3):(2^64-1),
93C     so cy:T1 <= 2*2^64 - 3 = 1:(2^64-3) holds for all j.
94C 3:  m[0]*u + cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1,
95C     so cy:T1 <= 3*2^64 - 4 = 2:(2^64-4)
96C 4:  For j = 1, x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1
97C                  <= 2*(2^64 - 1)^2 + (3*2^64 - 4) + (2^64-1)*2^64
98C                  <= 3*2^128 - 2*2^64 - 2 = 2:(2^64-3):(2^64-2),
99C     so cy:T1 <= 3*2^64 - 3. For j > 1,
100C     x[i]*y[j] + m[j]*u + (tmp[j+1] + cy):T1 <= 2:(2^64-3):(2^64-1),
101C     so cy:T1 <= 3*2^64 - 3 = 2:(2^64-3) holds for all j < len - 1.
102C     For j = len - 1, we know from note 2 that tmp[len] <= 1 for i = 0.
103C     Assume this is true for index i-1. Then
104C                x[i]*y[len-1] + m[len-1]*u + (tmp[len] + cy):T1
105C                  <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64
106C                  <= 2*2^128 - 1 = 1:(2^64-1):(2^64-1),
107C     so cy:T1 <= 1:(2^64-1) and tmp[len] <= 1 for all i by induction.
108C
109C Register vars: T0 = r13, T1 = r14, CY = r10, XI = r12, U = r11
110C                YP = r5, MP = r6, TP = r1 (stack ptr)
111C
112
113C local variables: tmp[0 ... 12] array, having 12+1 8-byte words
114C The tmp array needs 12+1 entries, but tmp[12] is stored in
115C r15, so only 12 entries are used in the stack.
116
117
C ------------------------------------------------------------------------
C .mulredc12: code entry point of mulredc12 (fully unrolled, 12 limbs).
C
C Computes the interleaved multiplication and REDC described in the
C algorithm comment earlier in this file, for 12-limb operands.
C
C In:   r3 = z (result, 12 limbs)      r4 = x (12 limbs)
C       r5 = y (12 limbs)              r6 = m (12 limbs)
C       r7 = inv_m
C Out:  z[0..11] = tmp[0..11], r3 = tmp[12] (the final carry, <= 1)
C
C Register roles inside the body:
C       r0       scratch: the y[j] or m[j] limb currently being multiplied
C       r8, r9   low / high half of the current 64x64 product
C       r10      CY       r11  U       r12  XI (= x[i])
C       r13      T0       r14  T1
C       r15      tmp[12] (kept in a register, never stored in the tmp array)
C       r16      constant 0, used with addze to capture the carry bit
C       CTR      outer loop counter (11 iterations for i = 1 .. 11)
C
C Stack: r13..r16 are pushed with stdu at -8, -16, -24, -32 from the entry
C value of r1, then r1 is lowered by another 96 bytes for tmp[0..11], so
C tmp[j] lives at 8*j(r1) and the total adjustment is 128 bytes.
C NOTE(review): no back-chain word is written at 0(r1) -- confirm this is
C acceptable for the targeted ABI (the routine makes no calls).
C NOTE(review): r13 is used as a scratch register between its save and
C restore. On 64-bit ELF PowerPC ABIs r13 is reserved as the thread pointer,
C so an asynchronous signal handler would observe the modified value --
C confirm for the targeted platforms.
C
C Modified besides the above: r3 and r4 are advanced by the update-form
C loads/stores, and the carry bit is rewritten by every addc/adde/addze.
C ------------------------------------------------------------------------
118	TEXT
119	.align	5	C powerPC 32 byte alignment
120.GSYM_PREFIX`'mulredc12:
121
122C ########################################################################
123C # i = 0 pass
124C #########################################################################
125
126C Pass for j = 0. We need to fetch x[i] from memory and compute the new u
127
128	ld      r12, 0(r4)		C XI = x[0]
129	ld      r0, 0(r5)		C y[0]
130	stdu    r13, -8(r1)		C save r13
131	mulld   r8, r0, r12		C x[0]*y[0] low half
132	stdu    r14, -8(r1)		C save r14
133	mulhdu  r9, r0, r12		C x[0]*y[0] high half
134	ld      r0, 0(r6)		C m[0]
135	mulld   r11, r7, r8		C U = T0*invm mod 2^64
136	stdu    r15, -8(r1)		C save r15
137	mulld   r13, r0, r11		C T0 = U*m[0] low
138	stdu    r16, -8(r1)		C save r16
139	li      r16, 0			C set r16 to zero for carry propagation
140	subi    r1, r1, 96		C set tmp stack space
141	mulhdu  r14, r0, r11		C T1 = U*m[0] high
142	ld      r0, 8(r5)		C y[1]
143	addc    r8, r8, r13		C
144	adde    r13, r9, r14		C T0 = initial tmp(0)
145	addze   r10, r16		C carry to CY
146	C CY:T1:T0 <= 2*(2^64-1)^2 <= 2*2^128 - 4*2^64 + 2, hence
147	C CY:T1 <= 2*2^64 - 4
148
149C Pass for j = 1
150
151	mulld   r8, r0, r12		C x[i]*y[j] low half
152	mulhdu  r9, r0, r12		C x[i]*y[j] high half
153	ld      r0, 8(r6)		C m[j]
154	addc    r13, r8, r13		C add low word to T0
155	adde    r14, r9, r10		C add high word with carry + CY to T1
156	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
157
158	mulld   r8, r0, r11		C U*m[j] low
159	mulhdu  r9, r0, r11		C U*m[j] high
160	addc    r8, r8, r13		C add T0 and low word
161	ld      r0, 16(r5)		C y[j+1]
162	adde    r13, r9, r14		C add high word with carry to T1
163	addze   r10, r16		C carry to CY
164	std     r8, 0(r1)		C store tmp[j-1]
165	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
166	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
167
168C Pass for j = 2
169
170	mulld   r8, r0, r12		C x[i]*y[j] low half
171	mulhdu  r9, r0, r12		C x[i]*y[j] high half
172	ld      r0, 16(r6)		C m[j]
173	addc    r13, r8, r13		C add low word to T0
174	adde    r14, r9, r10		C add high word with carry + CY to T1
175	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
176
177	mulld   r8, r0, r11		C U*m[j] low
178	mulhdu  r9, r0, r11		C U*m[j] high
179	addc    r8, r8, r13		C add T0 and low word
180	ld      r0, 24(r5)		C y[j+1]
181	adde    r13, r9, r14		C add high word with carry to T1
182	addze   r10, r16		C carry to CY
183	std     r8, 8(r1)		C store tmp[j-1]
184	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
185	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
186
187C Pass for j = 3
188
189	mulld   r8, r0, r12		C x[i]*y[j] low half
190	mulhdu  r9, r0, r12		C x[i]*y[j] high half
191	ld      r0, 24(r6)		C m[j]
192	addc    r13, r8, r13		C add low word to T0
193	adde    r14, r9, r10		C add high word with carry + CY to T1
194	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
195
196	mulld   r8, r0, r11		C U*m[j] low
197	mulhdu  r9, r0, r11		C U*m[j] high
198	addc    r8, r8, r13		C add T0 and low word
199	ld      r0, 32(r5)		C y[j+1]
200	adde    r13, r9, r14		C add high word with carry to T1
201	addze   r10, r16		C carry to CY
202	std     r8, 16(r1)		C store tmp[j-1]
203	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
204	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
205
206C Pass for j = 4
207
208	mulld   r8, r0, r12		C x[i]*y[j] low half
209	mulhdu  r9, r0, r12		C x[i]*y[j] high half
210	ld      r0, 32(r6)		C m[j]
211	addc    r13, r8, r13		C add low word to T0
212	adde    r14, r9, r10		C add high word with carry + CY to T1
213	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
214
215	mulld   r8, r0, r11		C U*m[j] low
216	mulhdu  r9, r0, r11		C U*m[j] high
217	addc    r8, r8, r13		C add T0 and low word
218	ld      r0, 40(r5)		C y[j+1]
219	adde    r13, r9, r14		C add high word with carry to T1
220	addze   r10, r16		C carry to CY
221	std     r8, 24(r1)		C store tmp[j-1]
222	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
223	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
224
225C Pass for j = 5
226
227	mulld   r8, r0, r12		C x[i]*y[j] low half
228	mulhdu  r9, r0, r12		C x[i]*y[j] high half
229	ld      r0, 40(r6)		C m[j]
230	addc    r13, r8, r13		C add low word to T0
231	adde    r14, r9, r10		C add high word with carry + CY to T1
232	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
233
234	mulld   r8, r0, r11		C U*m[j] low
235	mulhdu  r9, r0, r11		C U*m[j] high
236	addc    r8, r8, r13		C add T0 and low word
237	ld      r0, 48(r5)		C y[j+1]
238	adde    r13, r9, r14		C add high word with carry to T1
239	addze   r10, r16		C carry to CY
240	std     r8, 32(r1)		C store tmp[j-1]
241	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
242	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
243
244C Pass for j = 6
245
246	mulld   r8, r0, r12		C x[i]*y[j] low half
247	mulhdu  r9, r0, r12		C x[i]*y[j] high half
248	ld      r0, 48(r6)		C m[j]
249	addc    r13, r8, r13		C add low word to T0
250	adde    r14, r9, r10		C add high word with carry + CY to T1
251	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
252
253	mulld   r8, r0, r11		C U*m[j] low
254	mulhdu  r9, r0, r11		C U*m[j] high
255	addc    r8, r8, r13		C add T0 and low word
256	ld      r0, 56(r5)		C y[j+1]
257	adde    r13, r9, r14		C add high word with carry to T1
258	addze   r10, r16		C carry to CY
259	std     r8, 40(r1)		C store tmp[j-1]
260	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
261	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
262
263C Pass for j = 7
264
265	mulld   r8, r0, r12		C x[i]*y[j] low half
266	mulhdu  r9, r0, r12		C x[i]*y[j] high half
267	ld      r0, 56(r6)		C m[j]
268	addc    r13, r8, r13		C add low word to T0
269	adde    r14, r9, r10		C add high word with carry + CY to T1
270	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
271
272	mulld   r8, r0, r11		C U*m[j] low
273	mulhdu  r9, r0, r11		C U*m[j] high
274	addc    r8, r8, r13		C add T0 and low word
275	ld      r0, 64(r5)		C y[j+1]
276	adde    r13, r9, r14		C add high word with carry to T1
277	addze   r10, r16		C carry to CY
278	std     r8, 48(r1)		C store tmp[j-1]
279	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
280	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
281
282C Pass for j = 8
283
284	mulld   r8, r0, r12		C x[i]*y[j] low half
285	mulhdu  r9, r0, r12		C x[i]*y[j] high half
286	ld      r0, 64(r6)		C m[j]
287	addc    r13, r8, r13		C add low word to T0
288	adde    r14, r9, r10		C add high word with carry + CY to T1
289	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
290
291	mulld   r8, r0, r11		C U*m[j] low
292	mulhdu  r9, r0, r11		C U*m[j] high
293	addc    r8, r8, r13		C add T0 and low word
294	ld      r0, 72(r5)		C y[j+1]
295	adde    r13, r9, r14		C add high word with carry to T1
296	addze   r10, r16		C carry to CY
297	std     r8, 56(r1)		C store tmp[j-1]
298	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
299	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
300
301C Pass for j = 9
302
303	mulld   r8, r0, r12		C x[i]*y[j] low half
304	mulhdu  r9, r0, r12		C x[i]*y[j] high half
305	ld      r0, 72(r6)		C m[j]
306	addc    r13, r8, r13		C add low word to T0
307	adde    r14, r9, r10		C add high word with carry + CY to T1
308	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
309
310	mulld   r8, r0, r11		C U*m[j] low
311	mulhdu  r9, r0, r11		C U*m[j] high
312	addc    r8, r8, r13		C add T0 and low word
313	ld      r0, 80(r5)		C y[j+1]
314	adde    r13, r9, r14		C add high word with carry to T1
315	addze   r10, r16		C carry to CY
316	std     r8, 64(r1)		C store tmp[j-1]
317	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
318	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
319
320C Pass for j = 10
321
322	mulld   r8, r0, r12		C x[i]*y[j] low half
323	mulhdu  r9, r0, r12		C x[i]*y[j] high half
324	ld      r0, 80(r6)		C m[j]
325	addc    r13, r8, r13		C add low word to T0
326	adde    r14, r9, r10		C add high word with carry + CY to T1
327	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
328
329	mulld   r8, r0, r11		C U*m[j] low
330	mulhdu  r9, r0, r11		C U*m[j] high
331	addc    r8, r8, r13		C add T0 and low word
332	ld      r0, 88(r5)		C y[j+1]
333	adde    r13, r9, r14		C add high word with carry to T1
334	addze   r10, r16		C carry to CY
335	std     r8, 72(r1)		C store tmp[j-1]
336	C CY:T1:T0 <= 2^128 - 2 + 2^128 - 2*2^64 + 1 <=
337	C             2 * 2^128 - 2*2^64 - 1 ==> CY:T1 <= 2 * 2^64 - 3
338
339C Pass for j = 11. Don't fetch new data from y[j+1].
340
341	mulld   r8, r0, r12		C x[i]*y[j] low half
342	mulhdu  r9, r0, r12		C x[i]*y[j] high half
343	ld      r0, 88(r6)		C m[j]
344	addc    r13, r8, r13		C add low word to T0
345	adde    r14, r9, r10		C add high word with carry + CY to T1
346	C T1:T0 <= 2^128 - 2*2^64 + 1 + 2*2^64 - 3 <= 2^128 - 2, no carry!
347
348	mulld   r8, r0, r11		C U*m[j] low
349	mulhdu  r9, r0, r11		C U*m[j] high
350	addc    r8, r8, r13		C add T0 and low word
351	adde    r13, r9, r14		C add high word with carry to T1
352	std     r8, 80(r1)		C store tmp[len-2]
353	addze   r15, r16		C put carry in r15 (tmp[len] <= 1)
354	std     r13, 88(r1)		C store tmp[len-1]
355
356
357C #########################################################################
358C # i > 0 passes
359C #########################################################################
360
361
C The loop body below runs 11 times (CTR counts down via bdnz), once for
C each of i = 1 .. 11; x[i] is fetched with ldu, which also advances r4.
362	li      r9, 11			C outer loop count
363	mtctr   r9
364
3651:
366
367C Pass for j = 0. We need to fetch x[i], tmp[i] and tmp[i+1] from memory
368C and compute the new u
369
370	ldu     r12, 8(r4)		C x[i]
371	ld      r0, 0(r5)		C y[0]
372	ld      r13, 0(r1)		C tmp[0]
373	mulld   r8, r0, r12		C x[i]*y[0] low half
374	ld      r14, 8(r1)		C tmp[1]
375	mulhdu  r9, r0, r12		C x[i]*y[0] high half
376	addc    r13, r8, r13		C T0
377	ld      r0, 0(r6)		C m[0]
378	mulld   r11, r7, r13		C U = T0*invm mod 2^64
379	adde    r14, r9, r14		C T1
380	mulld   r8, r0, r11		C U*m[0] low
381	addze   r10, r16		C CY
382	mulhdu  r9, r0, r11		C U*m[0] high
383	ld      r0, 8(r5)		C y[1]
384	addc    r8, r8, r13		C result = 0
385	adde    r13, r9, r14		C T0, carry pending
386	C cy:T1:T0 <= 2*(2^64 - 1)^2 + 2^128 - 1 = 3*2^128 - 4*2^64 + 1,
387	C so cy:T1 <= 3*2^64 - 4
388
389C Pass for j = 1
390
391	ld      r14, 16(r1)		C tmp[j+1]
392	mulld   r8, r0, r12		C x[i]*y[j] low half
393	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
394	addze   r10, r16		C carry to CY
395	mulhdu  r9, r0, r12		C x[i]*y[j] high half
396	ld      r0, 8(r6)		C m[j]
397	addc    r13, r8, r13		C add low word to T0
398	mulld   r8, r0, r11		C U*m[j] low
399	adde    r14, r9, r14		C add high to T1
400	addze   r10, r10		C add carry to CY
401	mulhdu  r9, r0, r11		C U*m[j] high
402	addc    r8, r8, r13		C add T0 and low word
403	ld      r0, 16(r5)		C y[j+1]
404	adde    r13, r9, r14		C T1, carry pending
405	std     r8, 0(r1)		C store tmp[j-1]
406	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
407	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
408
409C Pass for j = 2
410
411	ld      r14, 24(r1)		C tmp[j+1]
412	mulld   r8, r0, r12		C x[i]*y[j] low half
413	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
414	addze   r10, r16		C carry to CY
415	mulhdu  r9, r0, r12		C x[i]*y[j] high half
416	ld      r0, 16(r6)		C m[j]
417	addc    r13, r8, r13		C add low word to T0
418	mulld   r8, r0, r11		C U*m[j] low
419	adde    r14, r9, r14		C add high to T1
420	addze   r10, r10		C add carry to CY
421	mulhdu  r9, r0, r11		C U*m[j] high
422	addc    r8, r8, r13		C add T0 and low word
423	ld      r0, 24(r5)		C y[j+1]
424	adde    r13, r9, r14		C T1, carry pending
425	std     r8, 8(r1)		C store tmp[j-1]
426	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
427	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
428
429C Pass for j = 3
430
431	ld      r14, 32(r1)		C tmp[j+1]
432	mulld   r8, r0, r12		C x[i]*y[j] low half
433	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
434	addze   r10, r16		C carry to CY
435	mulhdu  r9, r0, r12		C x[i]*y[j] high half
436	ld      r0, 24(r6)		C m[j]
437	addc    r13, r8, r13		C add low word to T0
438	mulld   r8, r0, r11		C U*m[j] low
439	adde    r14, r9, r14		C add high to T1
440	addze   r10, r10		C add carry to CY
441	mulhdu  r9, r0, r11		C U*m[j] high
442	addc    r8, r8, r13		C add T0 and low word
443	ld      r0, 32(r5)		C y[j+1]
444	adde    r13, r9, r14		C T1, carry pending
445	std     r8, 16(r1)		C store tmp[j-1]
446	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
447	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
448
449C Pass for j = 4
450
451	ld      r14, 40(r1)		C tmp[j+1]
452	mulld   r8, r0, r12		C x[i]*y[j] low half
453	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
454	addze   r10, r16		C carry to CY
455	mulhdu  r9, r0, r12		C x[i]*y[j] high half
456	ld      r0, 32(r6)		C m[j]
457	addc    r13, r8, r13		C add low word to T0
458	mulld   r8, r0, r11		C U*m[j] low
459	adde    r14, r9, r14		C add high to T1
460	addze   r10, r10		C add carry to CY
461	mulhdu  r9, r0, r11		C U*m[j] high
462	addc    r8, r8, r13		C add T0 and low word
463	ld      r0, 40(r5)		C y[j+1]
464	adde    r13, r9, r14		C T1, carry pending
465	std     r8, 24(r1)		C store tmp[j-1]
466	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
467	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
468
469C Pass for j = 5
470
471	ld      r14, 48(r1)		C tmp[j+1]
472	mulld   r8, r0, r12		C x[i]*y[j] low half
473	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
474	addze   r10, r16		C carry to CY
475	mulhdu  r9, r0, r12		C x[i]*y[j] high half
476	ld      r0, 40(r6)		C m[j]
477	addc    r13, r8, r13		C add low word to T0
478	mulld   r8, r0, r11		C U*m[j] low
479	adde    r14, r9, r14		C add high to T1
480	addze   r10, r10		C add carry to CY
481	mulhdu  r9, r0, r11		C U*m[j] high
482	addc    r8, r8, r13		C add T0 and low word
483	ld      r0, 48(r5)		C y[j+1]
484	adde    r13, r9, r14		C T1, carry pending
485	std     r8, 32(r1)		C store tmp[j-1]
486	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
487	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
488
489C Pass for j = 6
490
491	ld      r14, 56(r1)		C tmp[j+1]
492	mulld   r8, r0, r12		C x[i]*y[j] low half
493	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
494	addze   r10, r16		C carry to CY
495	mulhdu  r9, r0, r12		C x[i]*y[j] high half
496	ld      r0, 48(r6)		C m[j]
497	addc    r13, r8, r13		C add low word to T0
498	mulld   r8, r0, r11		C U*m[j] low
499	adde    r14, r9, r14		C add high to T1
500	addze   r10, r10		C add carry to CY
501	mulhdu  r9, r0, r11		C U*m[j] high
502	addc    r8, r8, r13		C add T0 and low word
503	ld      r0, 56(r5)		C y[j+1]
504	adde    r13, r9, r14		C T1, carry pending
505	std     r8, 40(r1)		C store tmp[j-1]
506	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
507	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
508
509C Pass for j = 7
510
511	ld      r14, 64(r1)		C tmp[j+1]
512	mulld   r8, r0, r12		C x[i]*y[j] low half
513	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
514	addze   r10, r16		C carry to CY
515	mulhdu  r9, r0, r12		C x[i]*y[j] high half
516	ld      r0, 56(r6)		C m[j]
517	addc    r13, r8, r13		C add low word to T0
518	mulld   r8, r0, r11		C U*m[j] low
519	adde    r14, r9, r14		C add high to T1
520	addze   r10, r10		C add carry to CY
521	mulhdu  r9, r0, r11		C U*m[j] high
522	addc    r8, r8, r13		C add T0 and low word
523	ld      r0, 64(r5)		C y[j+1]
524	adde    r13, r9, r14		C T1, carry pending
525	std     r8, 48(r1)		C store tmp[j-1]
526	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
527	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
528
529C Pass for j = 8
530
531	ld      r14, 72(r1)		C tmp[j+1]
532	mulld   r8, r0, r12		C x[i]*y[j] low half
533	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
534	addze   r10, r16		C carry to CY
535	mulhdu  r9, r0, r12		C x[i]*y[j] high half
536	ld      r0, 64(r6)		C m[j]
537	addc    r13, r8, r13		C add low word to T0
538	mulld   r8, r0, r11		C U*m[j] low
539	adde    r14, r9, r14		C add high to T1
540	addze   r10, r10		C add carry to CY
541	mulhdu  r9, r0, r11		C U*m[j] high
542	addc    r8, r8, r13		C add T0 and low word
543	ld      r0, 72(r5)		C y[j+1]
544	adde    r13, r9, r14		C T1, carry pending
545	std     r8, 56(r1)		C store tmp[j-1]
546	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
547	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
548
549C Pass for j = 9
550
551	ld      r14, 80(r1)		C tmp[j+1]
552	mulld   r8, r0, r12		C x[i]*y[j] low half
553	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
554	addze   r10, r16		C carry to CY
555	mulhdu  r9, r0, r12		C x[i]*y[j] high half
556	ld      r0, 72(r6)		C m[j]
557	addc    r13, r8, r13		C add low word to T0
558	mulld   r8, r0, r11		C U*m[j] low
559	adde    r14, r9, r14		C add high to T1
560	addze   r10, r10		C add carry to CY
561	mulhdu  r9, r0, r11		C U*m[j] high
562	addc    r8, r8, r13		C add T0 and low word
563	ld      r0, 80(r5)		C y[j+1]
564	adde    r13, r9, r14		C T1, carry pending
565	std     r8, 64(r1)		C store tmp[j-1]
566	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
567	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
568
569C Pass for j = 10
570
571	ld      r14, 88(r1)		C tmp[j+1]
572	mulld   r8, r0, r12		C x[i]*y[j] low half
573	adde    r14, r14, r10		C tmp[j+1] + CY + pending carry
574	addze   r10, r16		C carry to CY
575	mulhdu  r9, r0, r12		C x[i]*y[j] high half
576	ld      r0, 80(r6)		C m[j]
577	addc    r13, r8, r13		C add low word to T0
578	mulld   r8, r0, r11		C U*m[j] low
579	adde    r14, r9, r14		C add high to T1
580	addze   r10, r10		C add carry to CY
581	mulhdu  r9, r0, r11		C U*m[j] high
582	addc    r8, r8, r13		C add T0 and low word
583	ld      r0, 88(r5)		C y[j+1]
584	adde    r13, r9, r14		C T1, carry pending
585	std     r8, 72(r1)		C store tmp[j-1]
586	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + (2^64-1)*2^64
587	C          <= 3*2^128 - 2*2^64 - 1 ==> CY:T1 <= 3*2^64 - 3
588
589C Pass for j = 11. Don't fetch new data from y[j+1].
590
591	mulld   r8, r0, r12		C x[i]*y[j] low half
592	adde    r14, r15, r10		C T1 = tmp[len] + CY + pending carry
593	C since tmp[len] <= 1, T1 <= 3 and carry is zero
594	mulhdu  r9, r0, r12		C x[i]*y[j] high half
595	ld      r0, 88(r6)		C m[j]
596	addc    r13, r8, r13		C add low word to T0
597	mulld   r8, r0, r11		C U*m[j] low
598	adde    r14, r9, r14		C add high to T1
599	addze   r10, r16		C CY
600	mulhdu  r9, r0, r11		C U*m[j] high
601	addc    r8, r8, r13		C add T0 and low word
602	adde    r13, r9, r14		C T1, carry pending
603	std     r8, 80(r1)		C store tmp[len-2]
604	addze   r15, r10		C store tmp[len] <= 1
605	std     r13, 88(r1)		C store tmp[len-1]
606	C CY:T1:T0 <= 2*(2^64 - 1)^2 + (3*2^64 - 3) + 2^64
607	C          <= 2*2^128 - 1 ==> CY:T1 <= 2*2^64 - 1 = 1:(2^64-1)
608
609	bdnz 1b
610
611C Copy result from tmp memory to z
C The first pair uses offset 0 and then the update forms; every following
C ldu/stdu advances r1 / r3 by 8. After the 12 limbs r1 has moved up by 88
C and points at tmp[11], so the saved registers start at 8(r1).
612
613	ld      r8, 0(r1)
614	ldu     r9, 8(r1)
615	std     r8, 0(r3)
616	stdu    r9, 8(r3)
617	ldu     r8, 8(r1)
618	ldu     r9, 8(r1)
619	stdu    r8, 8(r3)
620	stdu    r9, 8(r3)
621	ldu     r8, 8(r1)
622	ldu     r9, 8(r1)
623	stdu    r8, 8(r3)
624	stdu    r9, 8(r3)
625	ldu     r8, 8(r1)
626	ldu     r9, 8(r1)
627	stdu    r8, 8(r3)
628	stdu    r9, 8(r3)
629	ldu     r8, 8(r1)
630	ldu     r9, 8(r1)
631	stdu    r8, 8(r3)
632	stdu    r9, 8(r3)
633	ldu     r8, 8(r1)
634	ldu     r9, 8(r1)
635	stdu    r8, 8(r3)
636	stdu    r9, 8(r3)
637
C Epilogue: set the return value, then pop r16, r15, r14, r13 in the reverse
C order of the saves. The final addi releases the last slot, which puts r1
C back at its value on entry.
638	mr      r3, r15         C return tmp(len)
639	ldu     r16, 8(r1)
640	ldu     r15, 8(r1)
641	ldu     r14, 8(r1)
642	ldu     r13, 8(r1)
643	addi    r1, r1, 8
644	blr
645
646	.size	.GSYM_PREFIX`'mulredc12, .-.GSYM_PREFIX`'mulredc12
647
648