dnl  AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.

dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 2.1 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public
dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
dnl  not, write to the Free Software Foundation, Inc., 51 Franklin Street,
dnl  Fifth Floor, Boston, MA 02110-1301, USA.

include(`../config.m4')

NAILS_SUPPORT(0-31)


C         alignment dst/src1/src2, A=0mod8, N=4mod8
C      A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
C
C K6-2  1.2   1.5   1.5   1.2   1.2   1.5   1.5   1.2   and,andn,ior,xor
C K6-2  1.5   1.75  2.0   1.75  1.75  2.0   1.75  1.5   iorn,xnor
C K6-2  1.75  2.0   2.0   2.0   2.0   2.0   2.0   1.75  nand,nior
C
C K6    1.5   1.68  1.75  1.2   1.75  1.75  1.68  1.5   and,andn,ior,xor
C K6    2.0   2.0   2.25  2.25  2.25  2.25  2.0   2.0   iorn,xnor
C K6    2.0   2.25  2.25  2.25  2.25  2.25  2.25  2.0   nand,nior


dnl  M4_p and M4_i are the MMX and integer instructions
dnl  M4_*_neg_dst means whether to negate the final result before writing
dnl  M4_*_neg_src2 means whether to negate the src2 values before using them

dnl  The function to generate is selected by the build system defining
dnl  exactly one OPERATION_<op>_n symbol on the m4 command line.  Nothing
dnl  is (and nothing must be) hard-coded here: an extra OPERATION define
dnl  would make a second M4_choose_op invocation below match too and
dnl  silently overwrite M4_function/M4_p/M4_i with the wrong operation.

dnl  For the matching operation, record the function name and the
dnl  instruction/negation recipe for both the MMX path and the integer path.
define(M4_choose_op,
m4_assert_numargs(7)
`ifdef(`OPERATION_$1',`
define(`M4_function',  `mpn_$1')
define(`M4_operation', `$1')
define(`M4_p',         `$2')
define(`M4_p_neg_dst', `$3')
define(`M4_p_neg_src2',`$4')
define(`M4_i',         `$5')
define(`M4_i_neg_dst', `$6')
define(`M4_i_neg_src2',`$7')
')')

dnl  xnor is done in "iorn" style because it's a touch faster than "nior"
dnl  style (the two are equivalent for xor).
dnl
dnl  pandn can't be used with nails.

M4_choose_op( and_n,  pand,0,0,  andl,0,0)
ifelse(GMP_NAIL_BITS,0,
`M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
`M4_choose_op(andn_n, pand,0,1,  andl,0,1)')
M4_choose_op( nand_n, pand,1,0,  andl,1,0)
M4_choose_op( ior_n,  por,0,0,   orl,0,0)
M4_choose_op( iorn_n, por,0,1,   orl,0,1)
M4_choose_op( nior_n, por,1,0,   orl,1,0)
M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)
M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)

ifdef(`M4_function',,
`m4_error(`Unrecognised or undefined OPERATION symbol
')')

C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                   mp_size_t size);
C
C Do src1,size M4_operation src2,size, storing the result in dst,size.
C
C Unaligned movq loads and stores are a bit slower than aligned ones.  The
C test at the start of the routine checks the alignment of src1 and if
C necessary processes one limb separately at the low end to make it aligned.
C
C The raw speeds without this alignment switch are as follows.
C
C           alignment dst/src1/src2, A=0mod8, N=4mod8
C     A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
C
C K6                 1.5    2.0                 1.5    2.0    and,andn,ior,xor
C K6                 1.75   2.2                 2.0    2.28   iorn,xnor
C K6                 2.0    2.25                2.35   2.28   nand,nior
C
C
C Future:
C
C K6 can do one 64-bit load per cycle so each of these routines should be
C able to approach 1.0 c/l, if aligned.  The basic and/andn/ior/xor might be
C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
C The others are 4 instructions per 2 limbs, and so can only approach 1.0
C because there's nowhere to hide some loop control.

defframe(PARAM_SIZE,16)
defframe(PARAM_SRC2,12)
defframe(PARAM_SRC1,8)
defframe(PARAM_DST, 4)
deflit(`FRAME',0)

112	TEXT
113	ALIGN(32)
114PROLOGUE(M4_function)
115			movl	PARAM_SIZE, %ecx
116			pushl	%ebx		FRAME_pushl()
117
118			movl	PARAM_SRC1, %eax
119
120			movl	PARAM_SRC2, %ebx
121			cmpl	$1, %ecx
122
123			movl	PARAM_DST, %edx
124			ja	L(two_or_more)
125
126
127			movl	(%ebx), %ecx
128			popl	%ebx
129ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
130			M4_i	(%eax), %ecx
131ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
132			movl	%ecx, (%edx)
133
134			ret
135
136
137L(two_or_more):
138			C eax	src1
139			C ebx	src2
140			C ecx	size
141			C edx	dst
142			C esi
143			C edi
144			C ebp
145
146			pushl	%esi		FRAME_pushl()
147			testl	$4, %eax
148			jz	L(alignment_ok)
149
150			movl	(%ebx), %esi
151			addl	$4, %ebx
152ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%esi)')
153			M4_i	(%eax), %esi
154			addl	$4, %eax
155ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%esi)')
156			movl	%esi, (%edx)
157			addl	$4, %edx
158			decl	%ecx
159
160L(alignment_ok):
161			movl	%ecx, %esi
162			shrl	%ecx
163			jnz	L(still_two_or_more)
164
165			movl	(%ebx), %ecx
166			popl	%esi
167ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
168			M4_i	(%eax), %ecx
169ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
170			popl	%ebx
171			movl	%ecx, (%edx)
172			ret
173
174
175L(still_two_or_more):
176ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
177			pcmpeqd	%mm7, %mm7		C all ones
178ifelse(GMP_NAIL_BITS,0,,`psrld	$GMP_NAIL_BITS, %mm7')	C clear nails
179')
180
181			ALIGN(16)
182L(top):
183			C eax	src1
184			C ebx	src2
185			C ecx	counter
186			C edx	dst
187			C esi
188			C edi
189			C ebp
190			C
191			C carry bit is low of size
192
193			movq	-8(%ebx,%ecx,8), %mm0
194ifelse(M4_p_neg_src2,1,`pxor	%mm7, %mm0')
195			M4_p	-8(%eax,%ecx,8), %mm0
196ifelse(M4_p_neg_dst,1,`	pxor	%mm7, %mm0')
197			movq	%mm0, -8(%edx,%ecx,8)
198
199			loop	L(top)
200
201
202			jnc	L(no_extra)
203
204			movl	-4(%ebx,%esi,4), %ebx
205ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
206			M4_i	-4(%eax,%esi,4), %ebx
207ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
208			movl	%ebx, -4(%edx,%esi,4)
209L(no_extra):
210
211			popl	%esi
212			popl	%ebx
213			emms
214			ret
215
216EPILOGUE()
217