1dnl  Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication.
2
3dnl  Copyright 1996, 1998-2000, 2002 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C P5: 14.2 cycles/crossproduct (approx)
35
36
37C void mpn_mul_basecase (mp_ptr wp,
38C                        mp_srcptr xp, mp_size_t xsize,
39C                        mp_srcptr yp, mp_size_t ysize);
40
C mpn_mul_basecase: schoolbook multiplication of the xsize-limb operand at
C xp by the ysize-limb operand at yp, writing xsize+ysize limbs at wp.
C
C Structure:
C   1. wp[0..xsize] = xp[] * yp[0]               (first limb inline, the rest
C                                                 in L(oop1), plain stores)
C   2. for each further y limb yp[i], i = 1..ysize-1:
C        wp[i..i+xsize-1] += xp[] * yp[i], with the final carry limb stored
C        to wp[i+xsize]                          (L(oop2), multiply and add)
C
C Register roles once the loops are set up:
C   esi       xp + xsize limbs (one past the last x limb)
C   edi       wp + xsize limbs, advanced by one limb after every pass
C   ebp       address of the current y limb
C   ecx       negative limb index, counted up to zero
C   ebx       carry limb (high half of the previous product)
C   eax, edx  mull operand and result
C
C Both loops depend on incl and movl leaving the carry flag untouched: the
C carry produced by the last addl of one iteration is consumed by the adcl
C at the top of the next one (or by the adcl following the loop).
C
C eax, ecx and edx are overwritten; ebx, esi, edi and ebp are saved and
C restored.
C
C NOTE(review): no zero-size check is made, and the xsize == 1 shortcut
C stores only wp[0] and wp[1], so the code relies on xsize >= ysize >= 1 --
C confirm against the documented contract of mpn_mul_basecase.
41defframe(PARAM_YSIZE, 20)
42defframe(PARAM_YP,    16)
43defframe(PARAM_XSIZE, 12)
44defframe(PARAM_XP,    8)
45defframe(PARAM_WP,    4)
46
47defframe(VAR_COUNTER, -4)
48
49	TEXT
50	ALIGN(8)
51PROLOGUE(mpn_mul_basecase)
52
53	pushl	%eax			C dummy push for allocating stack slot (presumably the VAR_COUNTER word)
54	pushl	%esi
55	pushl	%ebp
56	pushl	%edi
	C four 4-byte pushes so far, hence FRAME is 16
57deflit(`FRAME',16)
58
59	movl	PARAM_XP,%esi
60	movl	PARAM_WP,%edi
61	movl	PARAM_YP,%ebp
62
	C xp[0] * yp[0] is done ahead of the loop so that xsize == 1 can exit
	C early, before ebx has been saved.
63	movl	(%esi),%eax		C load xp[0]
64	mull	(%ebp)			C multiply by yp[0]
65	movl	%eax,(%edi)		C store to wp[0]
66	movl	PARAM_XSIZE,%ecx	C xsize
67	decl	%ecx			C If xsize = 1, ysize = 1 too
68	jz	L(done)
69
70	movl	PARAM_XSIZE,%eax	C reload xsize for the pointer arithmetic below
71	pushl	%ebx
72FRAME_pushl()
73	movl	%edx,%ebx		C high limb of xp[0]*yp[0] becomes the carry limb
74	leal	(%esi,%eax,4),%esi	C make xp point at end
75	leal	(%edi,%eax,4),%edi	C offset wp by xsize
76	negl	%ecx			C negate j size/index for inner loop, ecx = -(xsize-1)
77	xorl	%eax,%eax		C clear carry
78
	C First pass: wp[j] = low(xp[j]*yp[0]) + carry limb, for j = 1..xsize-1.
79	ALIGN(8)
80L(oop1):	adcl	$0,%ebx		C add carry of the previous addl into the carry limb
81	movl	(%esi,%ecx,4),%eax	C load next limb at xp[j]
82	mull	(%ebp)
83	addl	%ebx,%eax		C low product + carry limb, sets carry for the next adcl
84	movl	%eax,(%edi,%ecx,4)
85	incl	%ecx			C sets ZF for jnz, leaves carry alone
86	movl	%edx,%ebx		C high product is the next carry limb
87	jnz	L(oop1)
88
89	adcl	$0,%ebx			C carry out of the last addl
90	movl	PARAM_YSIZE,%eax
91	movl	%ebx,(%edi)		C most significant limb of product
92	addl	$4,%edi			C increment wp
93	decl	%eax
94	jz	L(skip)			C ysize == 1, nothing left to accumulate
95	movl	%eax,VAR_COUNTER	C outer loop counter = ysize-1, the y limbs still to process
96
	C One pass per remaining y limb, adding xp[] * yp[i] into wp.
97L(outer):
98	addl	$4,%ebp			C make ebp point to next y limb
99	movl	PARAM_XSIZE,%ecx
100	negl	%ecx			C ecx = -xsize, all x limbs are processed in the loop
101	xorl	%ebx,%ebx		C zero carry limb, also clears the carry flag
102
103	C code at 0x61 here, close enough to aligned
104L(oop2):
105	adcl	$0,%ebx			C add carry of the previous wp addition into the carry limb
106	movl	(%esi,%ecx,4),%eax
107	mull	(%ebp)
108	addl	%ebx,%eax		C low product + carry limb
109	movl	(%edi,%ecx,4),%ebx	C fetch the existing wp limb, flags untouched
110	adcl	$0,%edx			C carry of the addl above goes into the high product
111	addl	%eax,%ebx		C accumulate into the wp limb, carry picked up by the next adcl
112	movl	%ebx,(%edi,%ecx,4)
113	incl	%ecx			C sets ZF for jnz, leaves carry alone
114	movl	%edx,%ebx		C high product is the next carry limb
115	jnz	L(oop2)
116
117	adcl	$0,%ebx			C carry out of the last wp addition
118
119	movl	%ebx,(%edi)		C new most significant limb of this pass
120	addl	$4,%edi			C move the wp window up one limb for the next y limb
121	movl	VAR_COUNTER,%eax
122	decl	%eax
123	movl	%eax,VAR_COUNTER
124	jnz	L(outer)
125
	C Exit used whenever ebx was pushed: undo five pushes, the dummy slot
	C being dropped by adjusting esp.
126L(skip):
127	popl	%ebx
128	popl	%edi
129	popl	%ebp
130	popl	%esi
131	addl	$4,%esp
132	ret
133
	C xsize == 1 exit: ebx was never pushed, so only four words are popped.
134L(done):
135	movl	%edx,4(%edi)	C store to wp[1]
136	popl	%edi
137	popl	%ebp
138	popl	%esi
139	popl	%eax		C dummy pop for deallocating stack slot
140	ret
141
142EPILOGUE()
143