1/* $OpenBSD: strlen.S,v 1.8 2018/07/03 23:14:05 mortimer Exp $ */ 2/* $NetBSD: strlen.S,v 1.6 2014/03/22 19:16:34 jakllsch Exp $ */ 3 4/*- 5 * Copyright (c) 2009 The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by David Laight. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33/* 34 * Inspired by a version written by J.T. Conklin <jtc@acorntoolworks.com> 35 * (Only the long comment really remains his work!) 36 */ 37 38#include "DEFS.h" 39 40/* 41 * There are many well known branch-free sequences which are used 42 * for determining whether a zero-byte is contained within a word. 43 * These sequences are generally much more efficent than loading 44 * and comparing each byte individually. 45 * 46 * The expression [1,2]: 47 * 48 * (1) ~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f)) 49 * 50 * evaluates to a non-zero value if any of the bytes in the 51 * original word is zero. 52 * 53 * It also has the useful property that bytes in the result word 54 * that correspond to non-zero bytes in the original word have 55 * the value 0x00, while bytes corresponding to zero bytes have 56 * the value 0x80. This allows calculation of the first (and 57 * last) occurrence of a zero byte within the word (useful for C's 58 * str* primitives) by counting the number of leading (or 59 * trailing) zeros and dividing the result by 8. On machines 60 * without (or with slow) clz() / ctz() instructions, testing 61 * each byte in the result word for zero is necessary. 62 * 63 * This typically takes 4 instructions (5 on machines without 64 * "not-or") not including those needed to load the constant. 65 * 66 * 67 * The expression: 68 * 69 * (2) ((x - 0x01....01) & 0x80....80 & ~x) 70 * 71 * evaluates to a non-zero value if any of the bytes in the 72 * original word is zero. 73 * 74 * On little endian machines, the first byte in the result word 75 * that corresponds to a zero byte in the original byte is 0x80, 76 * so clz() can be used as above. On big endian machines, and 77 * little endian machines without (or with a slow) clz() insn, 78 * testing each byte in the original for zero is necessary. 79 * 80 * This typically takes 3 instructions (4 on machines without 81 * "and with complement") not including those needed to load 82 * constants. 83 * 84 * 85 * The expression: 86 * 87 * (3) ((x - 0x01....01) & 0x80....80) 88 * 89 * always evaluates to a non-zero value if any of the bytes in 90 * the original word is zero or has the top bit set. 91 * For strings that are likely to only contain 7-bit ascii these 92 * false positives will be rare. 93 * 94 * To account for possible false positives, each byte of the 95 * original word must be checked when the expression evaluates to 96 * a non-zero value. However, because it is simpler than those 97 * presented above, code that uses it will be faster as long as 98 * the rate of false positives is low. 99 * 100 * This is likely, because the false positive can only occur 101 * if the most siginificant bit of a byte within the word is set. 102 * The expression will never fail for typical 7-bit ASCII strings. 103 * 104 * This typically takes 2 instructions not including those needed 105 * to load constants. 106 * 107 * 108 * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003 109 * 110 * [2] International Business Machines, "The PowerPC Compiler Writer's 111 * Guide", Warthman Associates, 1996 112 */ 113 114ENTRY(strlen) 115 RETGUARD_SETUP(strlen, r11) 116 movabsq $0x0101010101010101,%r8 117 118 test $7,%dil 119 movq %rdi,%rax /* Buffer, %rdi unchanged */ 120 movabsq $0x8080808080808080,%r9 121 jnz 10f /* Jump if misaligned */ 122 123 _ALIGN_TEXT 1241: 125 movq (%rax),%rdx /* get bytes to check */ 1262: 127 addq $8,%rax 128 mov %rdx,%rcx /* save for later check */ 129 subq %r8,%rdx /* alg (3) above first */ 130 not %rcx /* Invert of data */ 131 andq %r9,%rdx 132 je 1b /* jump if all 0x01-0x80 */ 133 134 /* Do check from alg (2) above - loops for 0x81..0xff bytes */ 135 andq %rcx,%rdx 136 je 1b 137 138 /* Since we are LE, use bit scan for first 0x80 byte */ 139 sub %rdi,%rax /* length to next word */ 140 bsf %rdx,%rdx /* 7, 15, 23 ... 63 */ 141 shr $3,%rdx /* 0, 1, 2 ... 7 */ 142 lea -8(%rax,%rdx),%rax 143 RETGUARD_CHECK(strlen, r11) 144 ret 145 146/* Misaligned, read aligned word and make low bytes non-zero */ 147 _ALIGN_TEXT 14810: 149 mov %al,%cl 150 mov $1,%rsi 151 and $7,%cl /* offset into word 1..7 */ 152 and $~7,%al /* start of word with buffer */ 153 shl $3,%cl /* bit count 8, 16 .. 56 */ 154 movq (%rax),%rdx /* first data in high bytes */ 155 shl %cl,%rsi 156 dec %rsi 157 or %rsi,%rdx /* low bytes now non-zero */ 158 jmp 2b 159END_STRONG(strlen) 160