; PROLOGUE(mpn_andn_n)

;  mpn_andn_n

;  Copyright 2009 Jason Moxham

;  This file is part of the MPIR Library.

;  The MPIR Library is free software; you can redistribute it and/or modify
;  it under the terms of the GNU Lesser General Public License as published
;  by the Free Software Foundation; either version 2.1 of the License, or (at
;  your option) any later version.

;  The MPIR Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;  License for more details.

;  You should have received a copy of the GNU Lesser General Public License
;  along with the MPIR Library; see the file COPYING.LIB.  If not, write
;  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;  Boston, MA 02110-1301, USA.

;  void mpn_andn_n(mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
;  Windows x64:          rcx           rdx            r8           r9
;  Linux (SysV):         rdi           rsi           rdx          rcx
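;
;  Operation, as a rough C-level sketch (parameter names rp, s1p, s2p and n
;  follow the GMP/MPIR documentation; they are not declared in this file):
;
;      for (i = 0; i < n; i++)
;          rp[i] = s1p[i] & ~s2p[i];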

%include 'yasm_mac.inc'

    CPU  nehalem
    BITS 64

	LEAF_PROC mpn_andn_n
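	; n arrives in r9d and is sign-extended; each pointer is then advanced
	; to 24 bytes (3 limbs) before its end so that, with r10 counting up
	; from 3-n, [reg+r10*8] starts at limb 0.  If n <= 3 the subtraction
	; does not borrow and the main loop is skipped.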
	movsxd	r9, r9d
	mov     r10, 3
	lea     rdx, [rdx+r9*8-24]
	lea     r8, [r8+r9*8-24]
	lea     rcx, [rcx+r9*8-24]
	sub     r10, r9
	jnc     .2

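	; Main loop: at least 4 limbs remain, so handle 4 limbs per pass with
	; two unaligned 16-byte loads from each source.  pandn gives ~dst & src,
	; i.e. the s2p data (xmm0/xmm1) is complemented and ANDed with the s1p
	; data.  The second store is written as +16-32 because r10 has already
	; been advanced by 4 limbs (32 bytes); the add's carry out ends the
	; loop once r10 reaches 0..3, leaving at most 3 limbs to do.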
	xalign  16
.1:	movdqu  xmm0, [r8+r10*8]
	movdqu  xmm1, [r8+r10*8+16]
	movdqu  xmm3, [rdx+r10*8+16]
	movdqu  xmm2, [rdx+r10*8]
	pandn   xmm0, xmm2
	movdqu  [rcx+r10*8], xmm0
	pandn   xmm1, xmm3
	add     r10, 4
	movdqu  [rcx+r10*8+16-32], xmm1
	jnc     .1
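
	; Tail: r10 is now 0..3 and 3-r10 limbs remain.  One cmp dispatches all
	; four cases: above (r10=3, nothing left), equal (r10=2, one limb), and
	; the parity flag splits r10=1 (two limbs, jp taken) from r10=0 (three
	; limbs, fall through).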
.2:	cmp     r10, 2
	ja      .4
	je      .6
	jp      .5
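	; Three limbs left: one 16-byte vector and-not plus one scalar and-not.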
.3:	movdqu  xmm0, [r8+r10*8]
	mov     rax, [r8+r10*8+16]
	mov     r9, [rdx+r10*8+16]
	movdqu  xmm2, [rdx+r10*8]
	pandn   xmm0, xmm2
	movdqu  [rcx+r10*8], xmm0
	not     rax
	and     rax, r9
	mov     [rcx+r10*8+16], rax
.4:	ret

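	; Two limbs left: a single 16-byte vector and-not.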
.5:	movdqu  xmm0, [r8+r10*8]
	movdqu  xmm2, [rdx+r10*8]
	pandn   xmm0, xmm2
	movdqu  [rcx+r10*8], xmm0
	ret

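	; One limb left: scalar and-not (r9 is free to reuse as scratch here).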
.6:	mov     rax, [r8+r10*8]
	mov     r9, [rdx+r10*8]
	not     rax
	and     rax, r9
	mov     [rcx+r10*8], rax
	ret

	end