1f81b1c5bSmrgdnl  ARM64 Neon mpn_hamdist -- mpn bit hamming distance.
2f81b1c5bSmrg
3f81b1c5bSmrgdnl  Copyright 2013, 2014 Free Software Foundation, Inc.
4f81b1c5bSmrg
5f81b1c5bSmrgdnl  This file is part of the GNU MP Library.
6f81b1c5bSmrgdnl
7f81b1c5bSmrgdnl  The GNU MP Library is free software; you can redistribute it and/or modify
8f81b1c5bSmrgdnl  it under the terms of either:
9f81b1c5bSmrgdnl
10f81b1c5bSmrgdnl    * the GNU Lesser General Public License as published by the Free
11f81b1c5bSmrgdnl      Software Foundation; either version 3 of the License, or (at your
12f81b1c5bSmrgdnl      option) any later version.
13f81b1c5bSmrgdnl
14f81b1c5bSmrgdnl  or
15f81b1c5bSmrgdnl
16f81b1c5bSmrgdnl    * the GNU General Public License as published by the Free Software
17f81b1c5bSmrgdnl      Foundation; either version 2 of the License, or (at your option) any
18f81b1c5bSmrgdnl      later version.
19f81b1c5bSmrgdnl
20f81b1c5bSmrgdnl  or both in parallel, as here.
21f81b1c5bSmrgdnl
22f81b1c5bSmrgdnl  The GNU MP Library is distributed in the hope that it will be useful, but
23f81b1c5bSmrgdnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24f81b1c5bSmrgdnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25f81b1c5bSmrgdnl  for more details.
26f81b1c5bSmrgdnl
27f81b1c5bSmrgdnl  You should have received copies of the GNU General Public License and the
28f81b1c5bSmrgdnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29f81b1c5bSmrgdnl  see https://www.gnu.org/licenses/.
30f81b1c5bSmrg
31f81b1c5bSmrginclude(`../config.m4')
32f81b1c5bSmrg
33f81b1c5bSmrgC	     cycles/limb
34*671ea119SmrgC Cortex-A53	 4.5
35*671ea119SmrgC Cortex-A57	 1.9
36*671ea119SmrgC X-Gene	 4.36
37f81b1c5bSmrg
38f81b1c5bSmrgC TODO
39f81b1c5bSmrgC  * Consider greater unrolling.
40f81b1c5bSmrgC  * Arrange to align the pointer, if that helps performance.  Use the same
41f81b1c5bSmrgC    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
42f81b1c5bSmrgC    valgrind!)
43f81b1c5bSmrgC  * Explore if explicit align directives, e.g., "[ptr:128]" help.
44f81b1c5bSmrgC  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
45f81b1c5bSmrg
C m4 normally treats a hash sign as the start of a comment and copies the rest
C of the line through unexpanded.  Presumably changecom is used here so that
C macros written after an immediate marker (as in "mov x11, #maxsize") and
C the C comments that follow on such lines are still processed.
46*671ea119Smrgchangecom(blah)
47f81b1c5bSmrg
48f81b1c5bSmrgC INPUT PARAMETERS
49f81b1c5bSmrgdefine(`ap', x0)		C pointer to first operand, advanced as limbs are loaded
50f81b1c5bSmrgdefine(`bp', x1)		C pointer to second operand, advanced as limbs are loaded
51f81b1c5bSmrgdefine(`n',  x2)		C number of limbs still to be processed
52f81b1c5bSmrg
53f81b1c5bSmrgC We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
54f81b1c5bSmrgC up with 8 16-bit counters.  A limb is 8 bytes, so it adds at most 16 to each
55f81b1c5bSmrgC of the 4 counters it maps to.  Limbs alternate between the low and the high
56f81b1c5bSmrgC 4 counters, and an odd single limb is loaded into the low half of a vector,
C so the low 4 counters receive ceil(n/2) of the n limbs.  To keep them from
C wrapping we need 16*ceil(n/2) <= 2^16-1, i.e. n <= 0x1ffe.  (With n = 0x1fff
C and every bit differing, the low counters reach exactly 2^16 and wrap to 0.)
C We use a chunksize close to that, but which is a multiple of 8 and thereby
C allows the huge count code to jump deep into the code (at L(chu)).
57f81b1c5bSmrg
58f81b1c5bSmrgdefine(`maxsize',  0x1ffe)
59f81b1c5bSmrgdefine(`chunksize',0x1ff0)
60f81b1c5bSmrg
61f81b1c5bSmrgASM_START()
62f81b1c5bSmrgPROLOGUE(mpn_hamdist)
C mpn_hamdist: count the bits that differ between the n limbs at ap and the
C n limbs at bp.
C
C In:   x0 = ap, x1 = bp, x2 = n (number of 64-bit limbs)
C Out:  x0 = number of 1 bits in the limb-wise XOR of the two operands
C Uses: x0-x2, x11, v0-v7, v16-v19 and the flags.  The n > maxsize path also
C       uses x4-x10 and overwrites x30 (saved in x8 and restored before ret).
C
C Per 128-bit vector: eor forms the XOR, cnt gives a popcount per byte, and
C uadalp adds adjacent byte pairs into the 16-bit counters in v4 and v5.
C The main loop is software pipelined (loads, eor/cnt and uadalp belonging to
C neighbouring 4-limb groups are interleaved), so instruction order matters.
63f81b1c5bSmrg
64f81b1c5bSmrg	mov	x11, #maxsize			C x11 is read again by the loop at L(gt8k)
65f81b1c5bSmrg	cmp	n, x11
66f81b1c5bSmrg	b.hi	L(gt8k)				C n > maxsize: sum it chunk by chunk
67f81b1c5bSmrg
68f81b1c5bSmrgL(lt8k):					C n <= maxsize here; also entered by bl from L(gt8k)
69f81b1c5bSmrg	movi	v4.16b, #0			C clear summation register
70f81b1c5bSmrg	movi	v5.16b, #0			C clear summation register
71f81b1c5bSmrg
72f81b1c5bSmrg	tbz	n, #0, L(xx0)			C n even: no single limb to peel off
73f81b1c5bSmrg	sub	n, n, #1
74f81b1c5bSmrg	ld1	{v0.1d}, [ap], #8		C load 1 limb
75f81b1c5bSmrg	ld1	{v16.1d}, [bp], #8		C load 1 limb
76f81b1c5bSmrg	eor	v0.16b, v0.16b, v16.16b		C differing bits
77f81b1c5bSmrg	cnt	v6.16b, v0.16b			C popcount of each byte
78f81b1c5bSmrg	uadalp	v4.8h,  v6.16b			C could also splat
79f81b1c5bSmrg
80f81b1c5bSmrgL(xx0):	tbz	n, #1, L(x00)			C bit 1 of n clear: no limb pair to peel off
81f81b1c5bSmrg	sub	n, n, #2
82f81b1c5bSmrg	ld1	{v0.2d}, [ap], #16		C load 2 limbs
83f81b1c5bSmrg	ld1	{v16.2d}, [bp], #16		C load 2 limbs
84f81b1c5bSmrg	eor	v0.16b, v0.16b, v16.16b
85f81b1c5bSmrg	cnt	v6.16b, v0.16b
86f81b1c5bSmrg	uadalp	v4.8h,  v6.16b
87f81b1c5bSmrg
88f81b1c5bSmrgL(x00):	tbz	n, #2, L(000)			C bit 2 of n clear: no 4-limb group to peel off
89f81b1c5bSmrg	subs	n, n, #4
90f81b1c5bSmrg	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
91f81b1c5bSmrg	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
92f81b1c5bSmrg	b.ls	L(sum)				C n is now 0: these 4 limbs are the last
93f81b1c5bSmrg
94f81b1c5bSmrgL(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
95f81b1c5bSmrg	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
96f81b1c5bSmrg	eor	v0.16b, v0.16b, v16.16b
97f81b1c5bSmrg	eor	v1.16b, v1.16b, v17.16b
98f81b1c5bSmrg	sub	n, n, #4			C count the 4 limbs loaded into v2,v3 above
99f81b1c5bSmrg	cnt	v6.16b, v0.16b
100f81b1c5bSmrg	cnt	v7.16b, v1.16b
101f81b1c5bSmrg	b	L(mid)				C join the loop at its second half
102f81b1c5bSmrg
103f81b1c5bSmrgL(000):	subs	n, n, #8
104f81b1c5bSmrg	b.lo	L(e0)				C n was 0 (v5 is still zero): reduce v4 only
105f81b1c5bSmrg
C L(chu) is also the bl entry point used by L(gt8k); n is then the limb count
C of the chunk minus the 8 limbs loaded here.
106f81b1c5bSmrgL(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
107f81b1c5bSmrg	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
108f81b1c5bSmrg	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
109f81b1c5bSmrg	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
110f81b1c5bSmrg	eor	v2.16b, v2.16b, v18.16b
111f81b1c5bSmrg	eor	v3.16b, v3.16b, v19.16b
112f81b1c5bSmrg	cnt	v6.16b, v2.16b
113f81b1c5bSmrg	cnt	v7.16b, v3.16b
114f81b1c5bSmrg	subs	n, n, #8
115f81b1c5bSmrg	b.lo	L(end)				C these 8 limbs were the last: finish up
116f81b1c5bSmrg
C Main loop, 8 limbs from each operand per iteration.  v6,v7 hold the byte
C counts of the previous 4-limb group while the next group is loaded and XORed.
117f81b1c5bSmrgL(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
118f81b1c5bSmrg	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
119f81b1c5bSmrg	eor	v0.16b, v0.16b, v16.16b
120f81b1c5bSmrg	eor	v1.16b, v1.16b, v17.16b
121f81b1c5bSmrg	uadalp	v4.8h,  v6.16b
122f81b1c5bSmrg	cnt	v6.16b, v0.16b
123f81b1c5bSmrg	uadalp	v5.8h,  v7.16b
124f81b1c5bSmrg	cnt	v7.16b, v1.16b
125f81b1c5bSmrgL(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
126f81b1c5bSmrg	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
127f81b1c5bSmrg	eor	v2.16b, v2.16b, v18.16b
128f81b1c5bSmrg	eor	v3.16b, v3.16b, v19.16b
129f81b1c5bSmrg	subs	n, n, #8
130f81b1c5bSmrg	uadalp	v4.8h,  v6.16b
131f81b1c5bSmrg	cnt	v6.16b, v2.16b
132f81b1c5bSmrg	uadalp	v5.8h,  v7.16b
133f81b1c5bSmrg	cnt	v7.16b, v3.16b
134f81b1c5bSmrg	b.hs	L(top)				C loop until the subs above borrows
135f81b1c5bSmrg
136f81b1c5bSmrgL(end):	uadalp	v4.8h,  v6.16b			C add the counts still held in v6,v7
137f81b1c5bSmrg	uadalp	v5.8h,  v7.16b
138f81b1c5bSmrgL(sum):	eor	v0.16b, v0.16b, v16.16b		C last group: loaded but not yet XORed
139f81b1c5bSmrg	eor	v1.16b, v1.16b, v17.16b
140f81b1c5bSmrg	cnt	v6.16b, v0.16b
141f81b1c5bSmrg	cnt	v7.16b, v1.16b
142f81b1c5bSmrg	uadalp	v4.8h,  v6.16b
143f81b1c5bSmrg	uadalp	v5.8h,  v7.16b
144f81b1c5bSmrg	add	v4.8h, v4.8h, v5.8h		C merge both counter sets (modulo 2^16 per lane)
145f81b1c5bSmrg					C we have 8 16-bit counts
146f81b1c5bSmrgL(e0):	uaddlp	v4.4s,  v4.8h		C we have 4 32-bit counts
147f81b1c5bSmrg	uaddlp	v4.2d,  v4.4s		C we have 2 64-bit counts
148f81b1c5bSmrg	mov	x0, v4.d[0]
149f81b1c5bSmrg	mov	x1, v4.d[1]
150f81b1c5bSmrg	add	x0, x0, x1			C return value: sum of the two 64-bit counts
151f81b1c5bSmrg	ret
152f81b1c5bSmrg
153f81b1c5bSmrgC Code for count > maxsize.  Splits operand and calls above code.
154f81b1c5bSmrgdefine(`ap2', x5)			C caller-saves reg not used above
155f81b1c5bSmrgdefine(`bp2', x6)			C caller-saves reg not used above
156f81b1c5bSmrgL(gt8k):
157f81b1c5bSmrg	mov	x8, x30			C save return address; the bl below overwrites x30
158f81b1c5bSmrg	mov	x7, n			C full count (caller-saves reg not used above)
159f81b1c5bSmrg	mov	x4, #0			C total sum  (caller-saves reg not used above)
160f81b1c5bSmrg	mov	x9, #chunksize*8	C chunk size in bytes (caller-saves reg not used above)
161f81b1c5bSmrg	mov	x10, #chunksize		C chunk size in limbs (caller-saves reg not used above)
162f81b1c5bSmrg
163f81b1c5bSmrg1:	add	ap2, ap, x9		C point at subsequent block
164f81b1c5bSmrg	add	bp2, bp, x9		C point at subsequent block
165f81b1c5bSmrg	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
166f81b1c5bSmrg	movi	v4.16b, #0		C clear chunk summation register
167f81b1c5bSmrg	movi	v5.16b, #0		C clear chunk summation register
168f81b1c5bSmrg	bl	L(chu)			C jump deep inside code
169f81b1c5bSmrg	add	x4, x4, x0		C add this chunk's count to the total
170f81b1c5bSmrg	mov	ap, ap2			C put chunk pointer in place for calls
171f81b1c5bSmrg	mov	bp, bp2			C put chunk pointer in place for calls
172f81b1c5bSmrg	sub	x7, x7, x10		C limbs still to do
173f81b1c5bSmrg	cmp	x7, x11
174f81b1c5bSmrg	b.hi	1b			C more than maxsize left: do another chunk
175f81b1c5bSmrg
176f81b1c5bSmrg	mov	n, x7			C count for final invocation
177f81b1c5bSmrg	bl	L(lt8k)
178f81b1c5bSmrg	add	x0, x4, x0		C result = sum of the chunks + final part
179f81b1c5bSmrg	mov	x30, x8			C restore return address
180f81b1c5bSmrg	ret
181f81b1c5bSmrgEPILOGUE()
182