
;  AMD64 mpn_divexact_by3c
;  Copyright 2009 Jason Moxham
;  This file is part of the MPIR Library.
;  The MPIR Library is free software; you can redistribute it and/or modify
;  it under the terms of the GNU Lesser General Public License as published
;  by the Free Software Foundation; either version 2.1 of the License, or (at
;  your option) any later version.
;  The MPIR Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;  License for more details.
;  You should have received a copy of the GNU Lesser General Public License
;  along with the MPIR Library; see the file COPYING.LIB.  If not, write
;  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;  Boston, MA 02110-1301, USA.

;	(rdi, rdx) = (rsi, rdx) / 3    rcx = carry in
;	rax = carry out
;	NOTE: could pass 0x5555...5555 as the next parameter, so that this
;	would become mpn_divexact_by_ff_over_c, and change the imul at the end;
;	or drop backwards compatibility and just dump the two imuls.

%include 'yasm_mac.inc'

    BITS    64

;-----------------------------------------------------------------------
; mpn_divexact_by3c
;
; Arguments arrive in rdi, rsi, rdx, rcx (System V AMD64 order).  The
; routine overwrites rdi and rsi without saving them, so it is not
; usable as-is under the Microsoft x64 convention.
;
;   In:    rdi = destination limb pointer
;          rsi = source limb pointer
;          rdx = limb count n (n = 0 reads and writes no memory)
;          rcx = carry in
;   Out:   rax = carry out
;   Clobb: rcx, rdx, rsi, rdi, r8, r9, flags
;   Stack: none (leaf routine, no calls)
;
; Method
;   r8 = 0x5555555555555555 = (2^64 - 1) / 3, so 3 * r8 = -1 (mod 2^64).
;   rcx carries the running carry pre-multiplied by r8; the epilogue
;   undoes that scaling with rax = -3 * rcx.
;
;   Each limb is handled by the same five-instruction step:
;       mov  rax, [src]     ; load source limb
;       mul  r8             ; rdx:rax = limb * r8
;       sub  rcx, rax       ; rcx = limb to store, CF = borrow
;       mov  [dst], rcx     ; store it (mov leaves CF untouched)
;       sbb  rcx, rdx       ; fold high half and borrow into the carry
;   The order matters: CF produced by the sub must survive until the sbb.
;
; Indexing
;   rsi and rdi are advanced by n*8 - 24 and r9 starts at 3 - n, so
;   [rsi + r9*8] is initially the first source limb.  r9 counts upward;
;   on reaching skiploop it equals 3 - (limbs still to do), in 0..3.
;-----------------------------------------------------------------------
    GLOBAL_FUNC mpn_divexact_by3c
        mov     r9d, 3                      ; r9 = 3 (32-bit write zero-extends)
        lea     rsi, [rsi+rdx*8-24]         ; bias source pointer (see Indexing)
        lea     rdi, [rdi+rdx*8-24]         ; bias destination pointer
        mov     r8, 0x5555555555555555      ; r8 = (2^64 - 1) / 3
        imul    rcx, r8                     ; scale the incoming carry by r8
        sub     r9, rdx                     ; r9 = 3 - n; CF set only when n > 3
        jnc     skiploop                    ; n <= 3: tail code handles everything
        align   16
; Main loop: four limbs per iteration.  rdx (the count) is dead from here
; on and is reused as the high half of each mul result.
loop1:
        mov     rax, [rsi+r9*8]
        mul     r8
        sub     rcx, rax
        mov     [rdi+r9*8], rcx
        sbb     rcx, rdx
        mov     rax, [rsi+r9*8+8]
        mul     r8
        sub     rcx, rax
        mov     [rdi+r9*8+8], rcx
        sbb     rcx, rdx
        mov     rax, [rsi+r9*8+16]
        mul     r8
        sub     rcx, rax
        mov     [rdi+r9*8+16], rcx
        sbb     rcx, rdx
        mov     rax, [rsi+r9*8+24]
        mul     r8
        sub     rcx, rax
        mov     [rdi+r9*8+24], rcx
        sbb     rcx, rdx
        add     r9, 4                       ; CF set once r9 wraps to 0..3
        jnc     loop1                       ; still "negative": 4+ limbs remain
; Tail: r9 = 3 - remaining, remaining in 0..3.
skiploop:
        test    r9, 2                       ; bit 1 set => r9 in {2,3}: fewer than 2 left
        jnz     skip
        mov     rax, [rsi+r9*8]
        mul     r8
        sub     rcx, rax
        mov     [rdi+r9*8], rcx
        sbb     rcx, rdx
        mov     rax, [rsi+r9*8+8]
        mul     r8
        sub     rcx, rax
        mov     [rdi+r9*8+8], rcx
        sbb     rcx, rdx
        add     r9, 2                       ; r9 is now 2 (one left) or 3 (none left)
skip:
        test    r9, 1                       ; bit 0 set => r9 = 3: nothing left
        jnz     end
        mov     rax, [rsi+r9*8]
        mul     r8
        sub     rcx, rax
        mov     [rdi+r9*8], rcx
        sbb     rcx, rdx
end:
        ; below is the same as   imul    rax, rcx, -3
        lea     rax, [rcx+rcx*2]            ; rax = 3 * rcx
        neg     rax                         ; rax = -3 * rcx = carry out
        ret