dnl  ARM64 Neon mpn_hamdist -- mpn bit hamming distance.

dnl  Copyright 2013, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C Cortex-A53	 4.5
C Cortex-A57	 1.9
C X-Gene	 4.36

C TODO
C  * Consider greater unrolling.
C  * Arrange to align the pointer, if that helps performance.  Use the same
C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
C    valgrind!)
C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.

changecom(blah)

C mp_bitcnt_t mpn_hamdist (mp_srcptr ap, mp_srcptr bp, mp_size_t n)
C
C Count the bits that differ between the n-limb operands {ap,n} and {bp,n},
C i.e., popcount(ap XOR bp).  Result is returned in x0.
C
C INPUT PARAMETERS
define(`ap', x0)
define(`bp', x1)
define(`n',  x2)

C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
C (8*2^16-1)/64 = 0x1fff limbs.  We use a chunksize close to that, but which
C allows the huge count code to jump deep into the code (at L(chu)).

define(`maxsize',  0x1fff)
define(`chunksize',0x1ff0)

ASM_START()
PROLOGUE(mpn_hamdist)

	mov	x11, #maxsize
	cmp	n, x11
	b.hi	L(gt8k)			C huge counts take the splitting path

L(lt8k):
	movi	v4.16b, #0		C clear summation register
	movi	v5.16b, #0		C clear summation register

	C Peel off n mod 8 limbs, testing one size bit at a time, so that
	C the main loop below can work in blocks of 8 limbs.
	tbz	n, #0, L(xx0)
	sub	n, n, #1
	ld1	{v0.1d}, [ap], #8	C load 1 limb
	ld1	{v16.1d}, [bp], #8	C load 1 limb
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h, v6.16b		C could also splat

L(xx0):	tbz	n, #1, L(x00)
	sub	n, n, #2
	ld1	{v0.2d}, [ap], #16	C load 2 limbs
	ld1	{v16.2d}, [bp], #16	C load 2 limbs
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h, v6.16b

L(x00):	tbz	n, #2, L(000)
	subs	n, n, #4
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	b.ls	L(sum)			C n was exactly 4: finish these limbs

L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	sub	n, n, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	L(mid)			C enter software-pipelined loop mid-way

L(000):	subs	n, n, #8
	b.lo	L(e0)			C n == 0: nothing left to count

	C L(chu) is also the entry point used by the huge-count code below.
L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	n, n, #8
	b.lo	L(end)

	C Main loop: 8 limbs/iteration, software pipelined; byte popcounts
	C are accumulated pairwise into the 16-bit lanes of v4/v5.
L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	uadalp	v4.8h, v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h, v7.16b
	cnt	v7.16b, v1.16b
L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	subs	n, n, #8
	uadalp	v4.8h, v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h, v7.16b
	cnt	v7.16b, v3.16b
	b.hs	L(top)

L(end):	uadalp	v4.8h, v6.16b
	uadalp	v5.8h, v7.16b
L(sum):	eor	v0.16b, v0.16b, v16.16b	C drain the final 4 in-flight limbs
	eor	v1.16b, v1.16b, v17.16b
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	uadalp	v4.8h, v6.16b
	uadalp	v5.8h, v7.16b
	add	v4.8h, v4.8h, v5.8h
					C we have 8 16-bit counts
L(e0):	uaddlp	v4.4s, v4.8h		C we have 4 32-bit counts
	uaddlp	v4.2d, v4.4s		C we have 2 64-bit counts
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1		C return total in x0
	ret

C Code for count > maxsize.  Splits operand and calls above code.
define(`ap2', x5)			C caller-saves reg not used above
define(`bp2', x6)			C caller-saves reg not used above
L(gt8k):
	mov	x8, x30			C save return address (we bl below)
	mov	x7, n			C full count (caller-saves reg not used above)
	mov	x4, #0			C total sum (caller-saves reg not used above)
	mov	x9, #chunksize*8	C caller-saves reg not used above
	mov	x10, #chunksize		C caller-saves reg not used above

1:	add	ap2, ap, x9		C point at subsequent block
	add	bp2, bp, x9		C point at subsequent block
	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
	movi	v4.16b, #0		C clear chunk summation register
	movi	v5.16b, #0		C clear chunk summation register
	bl	L(chu)			C jump deep inside code
	add	x4, x4, x0
	mov	ap, ap2			C put chunk pointer in place for calls
	mov	bp, bp2			C put chunk pointer in place for calls
	sub	x7, x7, x10
	cmp	x7, x11			C x11 still holds maxsize
	b.hi	1b

	mov	n, x7			C count for final invocation
	bl	L(lt8k)
	add	x0, x4, x0
	mov	x30, x8			C restore return address
	ret
EPILOGUE()