1/* 2 * Written by J.T. Conklin <jtc@acorntoolworks.com> 3 * Public domain. 4 */ 5 6#include <machine/asm.h> 7 8#if defined(LIBC_SCCS) 9 RCSID("$NetBSD: strlen.S,v 1.1 2005/12/20 19:28:49 christos Exp $") 10#endif 11 12ENTRY(strlen) 13 movl 4(%esp),%eax 14 15.Lalign: 16 /* Consider unrolling loop? */ 17 testb $3,%al 18 je .Lword_aligned 19 cmpb $0,(%eax) 20 je .Ldone 21 incl %eax 22 jmp .Lalign 23 24 /* 25 * There are many well known branch-free sequences which are used 26 * for determining whether a zero-byte is contained within a word. 27 * These sequences are generally much more efficent than loading 28 * and comparing each byte individually. 29 * 30 * The expression [1,2]: 31 * 32 * (1) ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | (x | 0x7f7f7f7f)) 33 * 34 * evaluates to a non-zero value if any of the bytes in the 35 * original word is zero. 36 * 37 * It also has the useful property that bytes in the result word 38 * that correspond to non-zero bytes in the original word have 39 * the value 0x00, while bytes corresponding to zero bytes have 40 * the value 0x80. This allows calculation of the first (and 41 * last) occurrence of a zero byte within the word (useful for C's 42 * str* primitives) by counting the number of leading (or 43 * trailing) zeros and dividing the result by 8. On machines 44 * without (or with slow) clz() / ctz() instructions, testing 45 * each byte in the result word for zero is necessary. 46 * 47 * This typically takes 4 instructions (5 on machines without 48 * "not-or") not including those needed to load the constant. 49 * 50 * 51 * The expression: 52 * 53 * (2) ((x - 0x01010101) & ~x & 0x80808080) 54 * 55 * evaluates to a non-zero value if any of the bytes in the 56 * original word is zero. 57 * 58 * On little endian machines, the first byte in the result word 59 * that corresponds to a zero byte in the original byte is 0x80, 60 * so clz() can be used as above. On big endian machines, and 61 * little endian machines without (or with a slow) clz() insn, 62 * testing each byte in the original for zero is necessary. 63 * 64 * This typically takes 3 instructions (4 on machines without 65 * "and with complement") not including those needed to load 66 * constants. 67 * 68 * 69 * The expression: 70 * 71 * (3) ((x - 0x01010101) & 0x80808080) 72 * 73 * always evaluates to a non-zero value if any of the bytes in 74 * the original word is zero. However, in rare cases, it also 75 * evaluates to a non-zero value when none of the bytes in the 76 * original word is zero. 77 * 78 * To account for possible false positives, each byte of the 79 * original word must be checked when the expression evaluates to 80 * a non-zero value. However, because it is simpler than those 81 * presented above, code that uses it will be faster as long as 82 * the rate of false positives is low. 83 * 84 * This is likely, because the the false positive can only occur 85 * if the most siginificant bit of a byte within the word is set. 86 * The expression will never fail for typical 7-bit ASCII strings. 87 * 88 * This typically takes 2 instructions not including those needed 89 * to load constants. 90 * 91 * 92 * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003 93 * 94 * [2] International Business Machines, "The PowerPC Compiler Writer's 95 * Guide", Warthman Associates, 1996 96 */ 97 98 _ALIGN_TEXT 99.Lword_aligned: 100.Lloop: 101 movl (%eax),%ecx 102 addl $4,%eax 103 leal -0x01010101(%ecx),%edx 104 testl $0x80808080,%edx 105 je .Lloop 106 107 /* 108 * In rare cases, the above loop may exit prematurely. We must 109 * return to the loop if none of the bytes in the word equal 0. 110 */ 111 112 /* 113 * The optimal code for determining whether each byte is zero 114 * differs by processor. This space-optimized code should be 115 * acceptable on all, especially since we don't expect it to 116 * be run frequently, 117 */ 118 119 testb %cl,%cl /* 1st byte == 0? */ 120 jne 1f 121 subl $4,%eax 122 jmp .Ldone 123 1241: testb %ch,%ch /* 2nd byte == 0? */ 125 jne 1f 126 subl $3,%eax 127 jmp .Ldone 128 1291: shrl $16,%ecx 130 testb %cl,%cl /* 3rd byte == 0? */ 131 jne 1f 132 subl $2,%eax 133 jmp .Ldone 134 1351: testb %ch,%ch /* 4th byte == 0? */ 136 jne .Lloop 137 decl %eax 138 139.Ldone: 140 subl 4(%esp),%eax 141 ret 142