/*
 * strchrnul - find a character or nul in a string
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "../asmdefs.h"

/* Arguments and results.  */
#define srcin		x0	/* In:  start of the string.  */
#define chrin		w1	/* In:  character; only the low byte is used.  */

#define result		x0	/* Out: address of the terminating byte.  */

/* Locals and temporaries.  */
#define chunk		x2	/* Post-incremented 32-byte block cursor.  */
#define synd		x3	/* Misalignment first, then the syndrome.  */
#define wlanebits	w4	/* Scalar copy of the lane-bit constant.  */
#define keep		x5	/* Syndrome bits that are not padding.  */

#define vchr		v0	/* chrin replicated into every byte.  */
#define vdata_lo	v1	/* Bytes  0..15 of the current block.  */
#define vdata_hi	v2	/* Bytes 16..31 of the current block.  */
#define vnul_lo		v3
#define vnul_hi		v4
#define vhit_lo		v5
#define vhit_hi		v6
#define vlanebits	v7	/* Lane-bit constant in every 32-bit lane.  */
#define vsynd		v16

/* The two halves of the 32-bit constant 0x40100401.  Its bytes are
   0x01, 0x04, 0x10 and 0x40, i.e. bit 0 of each successive two-bit
   tuple, so that four neighbouring byte lanes can be summed into a
   single byte without their bits colliding.  */
#define LANE_BITS_LO	0x0401
#define LANE_BITS_HI	0x4010

/* Core algorithm.

   The string is consumed in 32-byte aligned blocks.  For every block a
   64-bit syndrome is formed that holds two bits per byte (the first
   byte of the block always lands in bits 0 and 1, on both big- and
   little-endian systems).  Bit 0 of a tuple is set iff the
   corresponding byte equals the requested character or is nul.  As the
   syndrome bits appear in exactly the order of the bytes in memory, a
   count_trailing_zeros() of the syndrome identifies the byte that ends
   the search.

   In:       x0 = string, w1 = character.
   Out:      x0 = address of the first byte that equals the character
	     or is nul.
   Clobbers: x2-x5, v0-v7, v16 and the condition flags.

   Since whole aligned blocks are loaded, bytes in front of the string
   and behind the terminating byte (inside the same 32-byte block) are
   read as well; they never affect the result.  */

ENTRY (__strchrnul_aarch64)
	/* Build the lane-bit constant and replicate the inputs.  */
	mov	wlanebits, #LANE_BITS_LO
	movk	wlanebits, #LANE_BITS_HI, lsl #16
	dup	vlanebits.4s, wlanebits
	dup	vchr.16b, chrin

	/* Round the start address down to a 32-byte boundary.  If it was
	   aligned already there is no padding to deal with.  */
	bic	chunk, srcin, #31
	ands	synd, srcin, #31	/* synd = number of padding bytes.  */
	b.eq	L(aligned_loop)

	/* Unaligned start.  The padding bytes in front of the string are
	   not forced to a harmless value; the syndrome is computed for
	   the complete block and the bits that belong to the padding are
	   discarded afterwards.  */
	ld1	{vdata_lo.16b, vdata_hi.16b}, [chunk], #32
	cmeq	vnul_lo.16b, vdata_lo.16b, #0
	cmeq	vnul_hi.16b, vdata_hi.16b, #0
	cmeq	vhit_lo.16b, vdata_lo.16b, vchr.16b
	cmeq	vhit_hi.16b, vdata_hi.16b, vchr.16b
	orr	vhit_lo.16b, vhit_lo.16b, vnul_lo.16b
	orr	vhit_hi.16b, vhit_hi.16b, vnul_hi.16b
	and	vhit_lo.16b, vhit_lo.16b, vlanebits.16b
	and	vhit_hi.16b, vhit_hi.16b, vlanebits.16b
	addp	vsynd.16b, vhit_lo.16b, vhit_hi.16b	/* 256->128 */
	addp	vsynd.16b, vsynd.16b, vsynd.16b		/* 128->64 */

	/* Each byte owns two syndrome bits, so the padding occupies the
	   low 2 * padding bits (2..62, the padding being 1..31 here).  */
	lsl	synd, synd, #1
	mov	keep, #~0
	lsl	keep, keep, synd	/* Ones from the first real byte up.  */
	mov	synd, vsynd.d[0]
	and	synd, synd, keep	/* Drop the padding bits.  */
	cbnz	synd, L(locate)

L(aligned_loop):
	ld1	{vdata_lo.16b, vdata_hi.16b}, [chunk], #32
	cmeq	vnul_lo.16b, vdata_lo.16b, #0
	cmeq	vnul_hi.16b, vdata_hi.16b, #0
	cmeq	vhit_lo.16b, vdata_lo.16b, vchr.16b
	cmeq	vhit_hi.16b, vdata_hi.16b, vchr.16b
	/* Cheap "anything found?" test: merge all compare results and
	   add the two 64-bit halves.  Every byte is 0x00 or 0xff, so the
	   sum is zero only if no byte matched.  */
	orr	vhit_lo.16b, vhit_lo.16b, vnul_lo.16b
	orr	vhit_hi.16b, vhit_hi.16b, vnul_hi.16b
	orr	vsynd.16b, vhit_lo.16b, vhit_hi.16b
	addp	vsynd.2d, vsynd.2d, vsynd.2d
	mov	synd, vsynd.d[0]
	cbz	synd, L(aligned_loop)

	/* Something in this block ends the search.  Compute the exact
	   syndrome to find out which byte it was.  */
	and	vhit_lo.16b, vhit_lo.16b, vlanebits.16b
	and	vhit_hi.16b, vhit_hi.16b, vlanebits.16b
	addp	vsynd.16b, vhit_lo.16b, vhit_hi.16b	/* 256->128 */
	addp	vsynd.16b, vsynd.16b, vsynd.16b		/* 128->64 */
	mov	synd, vsynd.d[0]

L(locate):
	/* The load post-incremented the cursor; step back to the start
	   of the block that produced the syndrome.  */
	sub	chunk, chunk, #32
	/* Trailing-zero count = leading-zero count of the bit-reversed
	   value.  */
	rbit	synd, synd
	clz	synd, synd
	/* The count is twice the byte offset inside the block.  */
	add	result, chunk, synd, lsr #1
	ret

END (__strchrnul_aarch64)