1dnl  IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
2dnl  result to a second limb vector.
3
4dnl  Contributed to the GNU project by Torbjorn Granlund.
5
6dnl  Copyright 2000-2005, 2007 Free Software Foundation, Inc.
7
8dnl  This file is part of the GNU MP Library.
9dnl
10dnl  The GNU MP Library is free software; you can redistribute it and/or modify
11dnl  it under the terms of either:
12dnl
13dnl    * the GNU Lesser General Public License as published by the Free
14dnl      Software Foundation; either version 3 of the License, or (at your
15dnl      option) any later version.
16dnl
17dnl  or
18dnl
19dnl    * the GNU General Public License as published by the Free Software
20dnl      Foundation; either version 2 of the License, or (at your option) any
21dnl      later version.
22dnl
23dnl  or both in parallel, as here.
24dnl
25dnl  The GNU MP Library is distributed in the hope that it will be useful, but
26dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
28dnl  for more details.
29dnl
30dnl  You should have received copies of the GNU General Public License and the
31dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
32dnl  see https://www.gnu.org/licenses/.
33
34include(`../config.m4')
35
36C         cycles/limb
37C Itanium:    3.0
38C Itanium 2:  2.0
39
40C TODO
41C  * Further optimize feed-in and wind-down code, both for speed and code size.
42C  * Handle low limb input and results specially, using a common stf8 in the
43C    epilogue.
44C  * Use 1 c/l carry propagation scheme in wind-down code.
45C  * Use extra pointer registers for `up' and rp to speed up feed-in loads.
46C  * Work out final differences with mul_1.asm.  That function is 300 bytes
47C    smaller than this due to better loop scheduling and thus simpler feed-in
48C    code.
49
50C INPUT PARAMETERS
C  Each argument name below is an m4 alias for one of the registers r32-r35.
C  rp: address of the limb vector that is read, added to, and written back.
51define(`rp', `r32')
C  up: address of the limb vector that is multiplied by vl.
52define(`up', `r33')
C  n: number of limbs to process.
53define(`n', `r34')
C  vl: the single limb used as multiplier.
54define(`vl', `r35')
55
56ASM_START()
C mpn_addmul_1: multiply the n-limb vector at up by the limb vl, add the
C products to the n-limb vector at rp, and store the sums back through r20,
C which starts as a copy of rp.  The most significant limb of the result,
C which does not fit in the vector, is left in r8 at return (r8 being the
C integer return register of the Itanium software conventions).
C
C Register roles:
C   f6         vl, moved to the FP side with setf.sig
C   f7, f8     first limb loaded from up and from rp
C   f32-f35    further limbs loaded from up
C   f44-f47    further limbs loaded from rp
C   f36-f39    low halves (xma.l) of up[i] * vl + rp[i]
C   f40-f43    high halves (xma.hu) of up[i] * vl + rp[i]
C   r24-r27    integer copies of f36-f39
C   r28-r31    integer copies of f40-f43 (one of them is zeroed on entry to
C              the main loop, standing in for a missing previous high half)
C   r14, r16   sums about to be stored
C   p6/p7, p8/p9  carry pending / no carry pending; consecutive additions
C              alternate between the two pairs
C   r2         saved ar.lc, restored before each return
57PROLOGUE(mpn_addmul_1)
C Unwind annotation: ar.lc is recorded as saved in r2.
58	.prologue
59	.save	ar.lc, r2
60	.body
61
C With HAVE_ABI_32 defined, rp and up are widened with addp4 and n is
C zero-extended from 32 bits with zxt4 before anything else uses them.
62ifdef(`HAVE_ABI_32',
63`	addp4		rp = 0, rp		C M I
64	addp4		up = 0, up		C M I
65	zxt4		n = n			C I
66	;;
67')
C r15 = n - 1, r20 = store pointer (rp itself is advanced by the loads),
C r2 = incoming ar.lc.
68{.mmi
69	adds		r15 = -1, n		C M I
70	mov		r20 = rp		C M I
71	mov.i		r2 = ar.lc		C I0
72}
C Load the first limb pair, post-incrementing up and rp by 8.
C r14 = n mod 4.
73{.mmi
74	ldf8		f7 = [up], 8		C M
75	ldf8		f8 = [rp], 8		C M
76	and		r14 = 3, n		C M I
77	;;
78}
C f6 = vl.  p10 is set when n mod 4 is 0.  r31 = (n - 1) / 4, the loop count.
79{.mmi
80	setf.sig	f6 = vl			C M2 M3
81	cmp.eq		p10, p0 = 0, r14	C M I
82	shr.u		r31 = r15, 2		C I0
83}
C p11 is set when n mod 4 is 2, p12 when it is 3.
84{.mmi
85	cmp.eq		p11, p0 = 2, r14	C M I
86	cmp.eq		p12, p0 = 3, r14	C M I
87	nop.i		0			C I
88	;;
89}
C Comparing r0 with itself for inequality is always false, so p6 and p8
C start out false (no carry pending) and p7 and p9 true.
C ar.lc = (n - 1) / 4.
90{.mii
91	cmp.ne		p6, p7 = r0, r0		C M I
92	mov.i		ar.lc = r31		C I0
93	cmp.ne		p8, p9 = r0, r0		C M I
94}
C Dispatch on n mod 4.  Falling through all three branches means n mod 4
C is 1.
95{.bbb
96  (p10)	br.dptk		.Lb00			C B
97  (p11)	br.dptk		.Lb10			C B
98  (p12)	br.dptk		.Lb11			C B
99	;;
100}
101
C --- n mod 4 = 1 ---
C br.cloop branches and decrements ar.lc while ar.lc is nonzero, so this
C branch is taken when n >= 5.
102.Lb01:	br.cloop.dptk	.grt1			C B
103
C n = 1: one multiply-add.  Store the low half, return the high half in r8.
104	xma.l		f39 = f7, f6, f8	C F
105	xma.hu		f43 = f7, f6, f8	C F
106	;;
107	getf.sig	r8 = f43		C M2
108	stf8		[r20] = f39		C M2 M3
109	mov.i		ar.lc = r2		C I0
110	br.ret.sptk.many b0			C B
111
C n >= 5: load four more limb pairs while the first product is started.
112.grt1:
113	ldf8		f32 = [up], 8
114	ldf8		f44 = [rp], 8
115	;;
116	ldf8		f33 = [up], 8
117	ldf8		f45 = [rp], 8
118	;;
119	ldf8		f34 = [up], 8
120	xma.l		f39 = f7, f6, f8
121	ldf8		f46 = [rp], 8
122	xma.hu		f43 = f7, f6, f8
123	;;
124	ldf8		f35 = [up], 8
125	ldf8		f47 = [rp], 8
C Taken when n >= 9.
126	br.cloop.dptk	.grt5
127
C n = 5: form the other four products, store the first limb directly with
C stf8, and join the wind-down chain at .Lcj5 for the remaining four limbs.
128	xma.l		f36 = f32, f6, f44
129	xma.hu		f40 = f32, f6, f44
130	;;
131	stf8		[r20] = f39, 8
132	xma.l		f37 = f33, f6, f45
133	xma.hu		f41 = f33, f6, f45
134	;;
135	getf.sig	r31 = f43
136	getf.sig	r24 = f36
137	xma.l		f38 = f34, f6, f46
138	xma.hu		f42 = f34, f6, f46
139	;;
140	getf.sig	r28 = f40
141	getf.sig	r25 = f37
142	xma.l		f39 = f35, f6, f47
143	xma.hu		f43 = f35, f6, f47
144	;;
145	getf.sig	r29 = f41
146	getf.sig	r26 = f38
147	br		.Lcj5
148
C n >= 9: r30 = 0 stands in for the high half that would precede the first
C limb, so the first limb is summed and stored by the regular r30 + r27
C stage instead of by a separate stf8.
149.grt5:
150	mov		r30 = 0
151	xma.l		f36 = f32, f6, f44
152	xma.hu		f40 = f32, f6, f44
153	;;
154	ldf8		f32 = [up], 8
155	xma.l		f37 = f33, f6, f45
156	ldf8		f44 = [rp], 8
157	xma.hu		f41 = f33, f6, f45
158	;;
159	ldf8		f33 = [up], 8
160	getf.sig	r27 = f39
161	;;
162	getf.sig	r31 = f43
163	xma.l		f38 = f34, f6, f46
164	ldf8		f45 = [rp], 8
165	xma.hu		f42 = f34, f6, f46
166	;;
167	ldf8		f34 = [up], 8
168	getf.sig	r24 = f36
169	;;
170	getf.sig	r28 = f40
171	xma.l		f39 = f35, f6, f47
172	ldf8		f46 = [rp], 8
173	xma.hu		f43 = f35, f6, f47
174	;;
175	ldf8		f35 = [up], 8
176	getf.sig	r25 = f37
C Enter the main loop at its top while ar.lc is nonzero (n >= 13), otherwise
C go straight to the wind-down code (n = 9).
177	br.cloop.dptk	.Loop
178	br		.Le0
179
180
C --- n mod 4 = 2 ---
C Load the second limb pair; the branch is taken when n >= 6.
181.Lb10:	ldf8		f35 = [up], 8
182	ldf8		f47 = [rp], 8
183	br.cloop.dptk	.grt2
184
C n = 2: both products, store the first low half, and finish at .Lcj2 with
C the high half of the second product already in r8.
185	xma.l		f38 = f7, f6, f8
186	xma.hu		f42 = f7, f6, f8
187	;;
188	xma.l		f39 = f35, f6, f47
189	xma.hu		f43 = f35, f6, f47
190	;;
191	getf.sig	r30 = f42
192	stf8		[r20] = f38, 8
193	getf.sig	r27 = f39
194	getf.sig	r8 = f43
195	br		.Lcj2
196
C n >= 6: load four more limb pairs while the first two products are started.
197.grt2:
198	ldf8		f32 = [up], 8
199	ldf8		f44 = [rp], 8
200	;;
201	ldf8		f33 = [up], 8
202	xma.l		f38 = f7, f6, f8
203	ldf8		f45 = [rp], 8
204	xma.hu		f42 = f7, f6, f8
205	;;
206	ldf8		f34 = [up], 8
207	xma.l		f39 = f35, f6, f47
208	ldf8		f46 = [rp], 8
209	xma.hu		f43 = f35, f6, f47
210	;;
211	ldf8		f35 = [up], 8
212	ldf8		f47 = [rp], 8
C Taken when n >= 10.
213	br.cloop.dptk	.grt6
214
C n = 6: store the first limb directly and join the wind-down at .Lcj6.
215	stf8		[r20] = f38, 8
216	xma.l		f36 = f32, f6, f44
217	xma.hu		f40 = f32, f6, f44
218	;;
219	getf.sig	r30 = f42
220	getf.sig	r27 = f39
221	xma.l		f37 = f33, f6, f45
222	xma.hu		f41 = f33, f6, f45
223	;;
224	getf.sig	r31 = f43
225	getf.sig	r24 = f36
226	xma.l		f38 = f34, f6, f46
227	xma.hu		f42 = f34, f6, f46
228	;;
229	getf.sig	r28 = f40
230	getf.sig	r25 = f37
231	xma.l		f39 = f35, f6, f47
232	xma.hu		f43 = f35, f6, f47
233	br		.Lcj6
234
C n >= 10: r29 = 0 stands in for the missing previous high half; the loop
C is entered at .LL10, whose stage adds r29 + r26.
235.grt6:
236	mov		r29 = 0
237	xma.l		f36 = f32, f6, f44
238	xma.hu		f40 = f32, f6, f44
239	;;
240	ldf8		f32 = [up], 8
241	getf.sig	r26 = f38
242	;;
243	getf.sig	r30 = f42
244	xma.l		f37 = f33, f6, f45
245	ldf8		f44 = [rp], 8
246	xma.hu		f41 = f33, f6, f45
247	;;
248	ldf8		f33 = [up], 8
249	getf.sig	r27 = f39
250	;;
251	getf.sig	r31 = f43
252	xma.l		f38 = f34, f6, f46
253	ldf8		f45 = [rp], 8
254	xma.hu		f42 = f34, f6, f46
255	;;
256	ldf8		f34 = [up], 8
257	getf.sig	r24 = f36
258	br		.LL10
259
260
C --- n mod 4 = 3 ---
C Load the second and third limb pairs; the branch is taken when n >= 7.
261.Lb11:	ldf8		f34 = [up], 8
262	ldf8		f46 = [rp], 8
263	;;
264	ldf8		f35 = [up], 8
265	ldf8		f47 = [rp], 8
266	br.cloop.dptk	.grt3
267	;;
268
C n = 3: all three products, store the first low half, finish at .Lcj3 with
C the high half of the last product already in r8.
269	xma.l		f37 = f7, f6, f8
270	xma.hu		f41 = f7, f6, f8
271	xma.l		f38 = f34, f6, f46
272	xma.hu		f42 = f34, f6, f46
273	xma.l		f39 = f35, f6, f47
274	xma.hu		f43 = f35, f6, f47
275	;;
276	getf.sig	r29 = f41
277	stf8		[r20] = f37, 8
278	getf.sig	r26 = f38
279	getf.sig	r30 = f42
280	getf.sig	r27 = f39
281	getf.sig	r8 = f43
282	br		.Lcj3
283
C n >= 7: load four more limb pairs while the first three products are
C started.
C NOTE(review): r25 = f37 below is consumed only on the .grt7 path; the
C fall-through path stores f37 with stf8 and rewrites r25 before using it.
C Presumably that redundancy is what the two FIXME marks refer to -- confirm.
284.grt3:
285	ldf8		f32 = [up], 8
286	xma.l		f37 = f7, f6, f8
287	ldf8		f44 = [rp], 8
288	xma.hu		f41 = f7, f6, f8
289	;;
290	ldf8		f33 = [up], 8
291	xma.l		f38 = f34, f6, f46
292	ldf8		f45 = [rp], 8
293	xma.hu		f42 = f34, f6, f46
294	;;
295	ldf8		f34 = [up], 8
296	xma.l		f39 = f35, f6, f47
297	ldf8		f46 = [rp], 8
298	xma.hu		f43 = f35, f6, f47
299	;;
300	ldf8		f35 = [up], 8
301	getf.sig	r25 = f37		C FIXME
302	ldf8		f47 = [rp], 8
C Taken when n >= 11.
303	br.cloop.dptk	.grt7
304
C n = 7: store the first limb directly and join the wind-down at .Lcj7.
305	getf.sig	r29 = f41
306	stf8		[r20] = f37, 8		C FIXME
307	xma.l		f36 = f32, f6, f44
308	getf.sig	r26 = f38
309	xma.hu		f40 = f32, f6, f44
310	;;
311	getf.sig	r30 = f42
312	xma.l		f37 = f33, f6, f45
313	getf.sig	r27 = f39
314	xma.hu		f41 = f33, f6, f45
315	;;
316	getf.sig	r31 = f43
317	xma.l		f38 = f34, f6, f46
318	getf.sig	r24 = f36
319	xma.hu		f42 = f34, f6, f46
320	br		.Lcj7
321
C n >= 11: r28 = 0 stands in for the missing previous high half; the loop
C is entered at .LL11, whose stage adds r28 + r25.
322.grt7:
323	getf.sig	r29 = f41
324	xma.l		f36 = f32, f6, f44
325	mov		r28 = 0
326	xma.hu		f40 = f32, f6, f44
327	;;
328	ldf8		f32 = [up], 8
329	getf.sig	r26 = f38
330	;;
331	getf.sig	r30 = f42
332	xma.l		f37 = f33, f6, f45
333	ldf8		f44 = [rp], 8
334	xma.hu		f41 = f33, f6, f45
335	;;
336	ldf8		f33 = [up], 8
337	getf.sig	r27 = f39
338	br		.LL11
339
340
C --- n mod 4 = 0 ---
C Load the second to fourth limb pairs and start the first product; the
C branch is taken when n >= 8.
341.Lb00:	ldf8		f33 = [up], 8
342	ldf8		f45 = [rp], 8
343	;;
344	ldf8		f34 = [up], 8
345	ldf8		f46 = [rp], 8
346	;;
347	ldf8		f35 = [up], 8
348	xma.l		f36 = f7, f6, f8
349	ldf8		f47 = [rp], 8
350	xma.hu		f40 = f7, f6, f8
351	br.cloop.dptk	.grt4
352
C n = 4: remaining three products, store the first low half, and join the
C wind-down at .Lcj4.
353	xma.l		f37 = f33, f6, f45
354	xma.hu		f41 = f33, f6, f45
355	xma.l		f38 = f34, f6, f46
356	xma.hu		f42 = f34, f6, f46
357	;;
358	getf.sig	r28 = f40
359	stf8		[r20] = f36, 8
360	xma.l		f39 = f35, f6, f47
361	getf.sig	r25 = f37
362	xma.hu		f43 = f35, f6, f47
363	;;
364	getf.sig	r29 = f41
365	getf.sig	r26 = f38
366	getf.sig	r30 = f42
367	getf.sig	r27 = f39
368	br		.Lcj4
369
C n >= 8: load four more limb pairs while the products for limbs two to four
C are started.
C NOTE(review): r24 = f36 below is consumed only on the .grt8 path; the
C fall-through path stores f36 with stf8 and rewrites r24 before using it.
C Presumably that redundancy is what the two FIXME marks refer to -- confirm.
370.grt4:
371	ldf8		f32 = [up], 8
372	xma.l		f37 = f33, f6, f45
373	ldf8		f44 = [rp], 8
374	xma.hu		f41 = f33, f6, f45
375	;;
376	ldf8		f33 = [up], 8
377	xma.l		f38 = f34, f6, f46
378	ldf8		f45 = [rp], 8
379	xma.hu		f42 = f34, f6, f46
380	;;
381	ldf8		f34 = [up], 8
382	getf.sig	r24 = f36		C FIXME
383	xma.l		f39 = f35, f6, f47
384	ldf8		f46 = [rp], 8
385	getf.sig	r28 = f40
386	xma.hu		f43 = f35, f6, f47
387	;;
388	ldf8		f35 = [up], 8
389	getf.sig	r25 = f37
390	ldf8		f47 = [rp], 8
C Taken when n >= 12.
391	br.cloop.dptk	.grt8
392
C n = 8: store the first limb directly and join the wind-down at .Lcj8.
393	getf.sig	r29 = f41
394	stf8		[r20] = f36, 8		C FIXME
395	xma.l		f36 = f32, f6, f44
396	getf.sig	r26 = f38
397	getf.sig	r30 = f42
398	xma.hu		f40 = f32, f6, f44
399	;;
400	xma.l		f37 = f33, f6, f45
401	getf.sig	r27 = f39
402	xma.hu		f41 = f33, f6, f45
403	br		.Lcj8
404
C n >= 12: r31 = 0 stands in for the missing previous high half; the loop
C is entered at .LL00, whose stage adds r31 + r24.
405.grt8:
406	getf.sig	r29 = f41
407	xma.l		f36 = f32, f6, f44
408	mov		r31 = 0
409	xma.hu		f40 = f32, f6, f44
410	;;
411	ldf8		f32 = [up], 8
412	getf.sig	r26 = f38
413	br		.LL00
414
415
C Each pass of the main loop handles four limbs in four similar stages.  A
C stage adds the low half of one product to the high half of the previous
C product, plus 1 if a carry is pending, and stores the sum with st8.  The
C next carry comes from comparing the sum with the low-half addend: after an
C add with carry-in the sum wrapped if it is <= the addend (cmp.leu), after
C an add without carry-in if it is < the addend (cmp.ltu).  In parallel each
C stage starts the next multiply-adds and loads.  .LL00, .LL11 and .LL10 are
C the mid-loop entry points used by the feed-in code above.
C Per the column headings, the trailing comment fields are an instruction
C number, the numbers of the instructions fed by it, and a cycle number for
C i1 and i2 (presumably Itanium and Itanium 2 -- confirm).
416C *** MAIN LOOP START ***
417	ALIGN(32)				C insn	fed	cycle #
418.Loop:
419	.pred.rel "mutex", p6, p7		C num	by	i1 i2
420	getf.sig	r29 = f41		C 00	16	0   0
421	xma.l		f36 = f32, f6, f44	C 01	06,15	0   0
422   (p6)	add		r14 = r30, r27, 1	C 02		0   0
423	ldf8		f47 = [rp], 8		C 03		0   0
424	xma.hu		f40 = f32, f6, f44	C 04	06,15	0   0
425   (p7)	add		r14 = r30, r27		C 05		0   0
426	;;
427	.pred.rel "mutex", p6, p7
428	ldf8		f32 = [up], 8		C 06		1   1
429   (p6)	cmp.leu		p8, p9 = r14, r27	C 07		1   1
430   (p7)	cmp.ltu		p8, p9 = r14, r27	C 08		1   1
431	getf.sig	r26 = f38		C 09	25	2   1
432	st8		[r20] = r14, 8		C 10		2   1
433	nop.b		0			C 11		2   1
434	;;
435.LL00:
436	.pred.rel "mutex", p8, p9
437	getf.sig	r30 = f42		C 12	28	3   2
438	xma.l		f37 = f33, f6, f45	C 13	18,27	3   2
439   (p8)	add		r16 = r31, r24, 1	C 14		3   2
440	ldf8		f44 = [rp], 8		C 15		3   2
441	xma.hu		f41 = f33, f6, f45	C 16	18,27	3   2
442   (p9)	add		r16 = r31, r24		C 17		3   2
443	;;
444	.pred.rel "mutex", p8, p9
445	ldf8		f33 = [up], 8		C 18		4   3
446   (p8)	cmp.leu		p6, p7 = r16, r24	C 19		4   3
447   (p9)	cmp.ltu		p6, p7 = r16, r24	C 20		4   3
448	getf.sig	r27 = f39		C 21	37	5   3
449	st8		[r20] = r16, 8		C 22		5   3
450	nop.b		0			C 23		5   3
451	;;
452.LL11:
453	.pred.rel "mutex", p6, p7
454	getf.sig	r31 = f43		C 24	40	6   4
455	xma.l		f38 = f34, f6, f46	C 25	30,39	6   4
456   (p6)	add		r14 = r28, r25, 1	C 26		6   4
457	ldf8		f45 = [rp], 8		C 27		6   4
458	xma.hu		f42 = f34, f6, f46	C 28	30,39	6   4
459   (p7)	add		r14 = r28, r25		C 29		6   4
460	;;
461	.pred.rel "mutex", p6, p7
462	ldf8		f34 = [up], 8		C 30		7   5
463   (p6)	cmp.leu		p8, p9 = r14, r25	C 31		7   5
464   (p7)	cmp.ltu		p8, p9 = r14, r25	C 32		7   5
465	getf.sig	r24 = f36		C 33	01	8   5
466	st8		[r20] = r14, 8		C 34		8   5
467	nop.b		0			C 35		8   5
468	;;
469.LL10:
470	.pred.rel "mutex", p8, p9
471	getf.sig	r28 = f40		C 36	04	9   6
472	xma.l		f39 = f35, f6, f47	C 37	42,03	9   6
473   (p8)	add		r16 = r29, r26, 1	C 38		9   6
474	ldf8		f46 = [rp], 8		C 39		9   6
475	xma.hu		f43 = f35, f6, f47	C 40	42,03	9   6
476   (p9)	add		r16 = r29, r26		C 41		9   6
477	;;
478	.pred.rel "mutex", p8, p9
479	ldf8		f35 = [up], 8		C 42	       10   7
480   (p8)	cmp.leu		p6, p7 = r16, r26	C 43	       10   7
481   (p9)	cmp.ltu		p6, p7 = r16, r26	C 44	       10   7
482	getf.sig	r25 = f37		C 45	13     11   7
483	st8		[r20] = r16, 8		C 46	       11   7
484	br.cloop.dptk	.Loop			C 47	       11   7
485C *** MAIN LOOP END ***
486	;;
C Wind-down: the same stages as the loop body, run once more with work
C dropped as the pipeline drains.  No further limbs are loaded from up; the
C first stage still loads the last limb from rp.
487.Le0:
488	.pred.rel "mutex", p6, p7
489	getf.sig	r29 = f41		C
490	xma.l		f36 = f32, f6, f44	C
491   (p6)	add		r14 = r30, r27, 1	C
492	ldf8		f47 = [rp], 8		C
493	xma.hu		f40 = f32, f6, f44	C
494   (p7)	add		r14 = r30, r27		C
495	;;
496	.pred.rel "mutex", p6, p7
497   (p6)	cmp.leu		p8, p9 = r14, r27	C
498   (p7)	cmp.ltu		p8, p9 = r14, r27	C
499	getf.sig	r26 = f38		C
500	st8		[r20] = r14, 8		C
501	;;
502	.pred.rel "mutex", p8, p9
503	getf.sig	r30 = f42		C
504	xma.l		f37 = f33, f6, f45	C
505   (p8)	add		r16 = r31, r24, 1	C
506	xma.hu		f41 = f33, f6, f45	C
507   (p9)	add		r16 = r31, r24		C
508	;;
509	.pred.rel "mutex", p8, p9
510   (p8)	cmp.leu		p6, p7 = r16, r24	C
511   (p9)	cmp.ltu		p6, p7 = r16, r24	C
512	getf.sig	r27 = f39		C
513	st8		[r20] = r16, 8		C
514	;;
C The .Lcj8 ... .Lcj2 labels are where the short feed-in paths (n = 8 ... 2)
C join the chain.  .Lcj8 and .Lcj7 still issue the last two multiply-adds.
515.Lcj8:
516	.pred.rel "mutex", p6, p7
517	getf.sig	r31 = f43		C
518	xma.l		f38 = f34, f6, f46	C
519   (p6)	add		r14 = r28, r25, 1	C
520	xma.hu		f42 = f34, f6, f46	C
521   (p7)	add		r14 = r28, r25		C
522	;;
523	.pred.rel "mutex", p6, p7
524   (p6)	cmp.leu		p8, p9 = r14, r25	C
525   (p7)	cmp.ltu		p8, p9 = r14, r25	C
526	getf.sig	r24 = f36		C
527	st8		[r20] = r14, 8		C
528	;;
529.Lcj7:
530	.pred.rel "mutex", p8, p9
531	getf.sig	r28 = f40		C
532	xma.l		f39 = f35, f6, f47	C
533   (p8)	add		r16 = r29, r26, 1	C
534	xma.hu		f43 = f35, f6, f47	C
535   (p9)	add		r16 = r29, r26		C
536	;;
537	.pred.rel "mutex", p8, p9
538   (p8)	cmp.leu		p6, p7 = r16, r26	C
539   (p9)	cmp.ltu		p6, p7 = r16, r26	C
540	getf.sig	r25 = f37		C
541	st8		[r20] = r16, 8		C
542	;;
C From here on no multiplies remain: only moves to the integer side, adds,
C carry compares and stores.
543.Lcj6:
544	.pred.rel "mutex", p6, p7
545	getf.sig	r29 = f41		C
546   (p6)	add		r14 = r30, r27, 1	C
547   (p7)	add		r14 = r30, r27		C
548	;;
549	.pred.rel "mutex", p6, p7
550   (p6)	cmp.leu		p8, p9 = r14, r27	C
551   (p7)	cmp.ltu		p8, p9 = r14, r27	C
552	getf.sig	r26 = f38		C
553	st8		[r20] = r14, 8		C
554	;;
555.Lcj5:
556	.pred.rel "mutex", p8, p9
557	getf.sig	r30 = f42		C
558   (p8)	add		r16 = r31, r24, 1	C
559   (p9)	add		r16 = r31, r24		C
560	;;
561	.pred.rel "mutex", p8, p9
562   (p8)	cmp.leu		p6, p7 = r16, r24	C
563   (p9)	cmp.ltu		p6, p7 = r16, r24	C
564	getf.sig	r27 = f39		C
565	st8		[r20] = r16, 8		C
566	;;
C r8 = high half of the last product, the basis of the value left in r8 at
C return.
567.Lcj4:
568	.pred.rel "mutex", p6, p7
569	getf.sig	r8 = f43		C
570   (p6)	add		r14 = r28, r25, 1	C
571   (p7)	add		r14 = r28, r25		C
572	;;
573	.pred.rel "mutex", p6, p7
574	st8		[r20] = r14, 8		C
575   (p6)	cmp.leu		p8, p9 = r14, r25	C
576   (p7)	cmp.ltu		p8, p9 = r14, r25	C
577	;;
578.Lcj3:
579	.pred.rel "mutex", p8, p9
580   (p8)	add		r16 = r29, r26, 1	C
581   (p9)	add		r16 = r29, r26		C
582	;;
583	.pred.rel "mutex", p8, p9
584	st8		[r20] = r16, 8		C
585   (p8)	cmp.leu		p6, p7 = r16, r26	C
586   (p9)	cmp.ltu		p6, p7 = r16, r26	C
587	;;
C Last limb: stored without post-increment.
588.Lcj2:
589	.pred.rel "mutex", p6, p7
590   (p6)	add		r14 = r30, r27, 1	C
591   (p7)	add		r14 = r30, r27		C
592	;;
593	.pred.rel "mutex", p6, p7
594	st8		[r20] = r14		C
595   (p6)	cmp.leu		p8, p9 = r14, r27	C
596   (p7)	cmp.ltu		p8, p9 = r14, r27	C
597	;;
C Fold the final carry into r8, restore ar.lc from r2, and return.
598   (p8)	add		r8 = 1, r8		C M I
599	mov.i		ar.lc = r2		C I0
600	br.ret.sptk.many b0			C B
601EPILOGUE()
602ASM_END()
603