;
; strlen.asm
;
; Copyright (c) Microsoft Corporation. All rights reserved.
;
; Optimized strlen and strnlen implementations for ARM64.
;
#include "ksarm64.h"

        ; size_t strlen(const char *str);                           // AV's when str == NULL
        ; size_t strnlen(const char *str, size_t numberOfElements); // AV's when str == NULL

        ; This file could also define strnlen_s. strnlen_s is currently defined in the header string.h in C
        ; using a check for null and a call to strnlen. This avoids making the call in the case where the string is null,
        ; which should be infrequent. However it makes code larger by inlining that check everywhere strnlen_s is called.
        ; A better alternative would be to modify the standard headers and define strnlen_s here. It would be just one
        ; instruction because the required return value in x0 is already 0 when null is passed for the first parameter.
        ;
        ; EXPORT strnlen_s [FUNC]
        ; LEAF_ENTRY strnlen_s
        ;     cbz x0, AnyRet            ; add the label AnyRet in front of any ret instruction you want
        ;                               ; fallthrough into strnlen code
        ; ALTERNATE_ENTRY strnlen       ; change LEAF_ENTRY for strnlen to ALTERNATE_ENTRY
        ;     ...                       ; current body of strnlen
        ; LEAF_END                      ; change strnlen leaf_end to strnlen_s.

#if !defined(_M_ARM64EC)

        EXPORT  A64NAME(strlen) [FUNC]
        EXPORT  A64NAME(strnlen) [FUNC]

#endif

        SET_COMDAT_ALIGNMENT 5

        ; With strlen we will usually read some bytes past the end of the string. To avoid getting an AV
        ; when a byte-by-byte implementation would not, we must ensure that we never cross a page boundary with a
        ; vector load, so we must align the vector loads to 16-byte-aligned boundaries.
        ;
        ; For strnlen we know the buffer length and so we won't read any bytes beyond the end of the buffer. This means
        ; we have a choice whether to arrange our vector loads to be 16-byte aligned. (Note that on arm64 a vector load
        ; only produces an alignment fault when the vector *elements* are misaligned, so a "16B" vector load will never
        ; give an alignment fault for user memory). Aligning the vector loads on 16-byte boundaries saves one cycle
        ; per vector load instruction. The cost of forcing 16-byte aligned loads is the 10 instructions preceding the
        ; 'NoNeedToAlign' label below. On Cortex-A57, the execution latency of those 10 instructions is 27 cycles,
        ; assuming no branch mispredict on the 'beq'. To account for the cost of an occasional mispredict we guess a
        ; mispredict rate of 2% and a mispredict cost of 50 cycles, or 1 cycle per call amortized, 28 total. 28 * 16 = 448.
        ; In this analysis we are ignoring the chance of extra cache misses due to loads crossing cache lines when
        ; they are not 16-byte aligned. When the vector loads span cache line boundaries each cache line is referenced
        ; one more time than it is when the loads are aligned. But we assume that the cache line stays loaded for the
        ; short time we need to do all the references to it, and so one extra reference won't matter.
        ; It is expected that the number of cycles (28) will stay about the same for future processor models. If it
        ; changes radically, it will be worth converting the EQU to a global, using ldr to load it instead of a
        ; mov-immediate, and dynamically setting the global during CRT startup based on processor model.
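
        ; As a rough C-level summary of the approach taken below (an illustrative sketch only; the function name
        ; is made up, it is not part of the build, and it omits the page-end handling and the unaligned first
        ; probe that the assembly performs):
        ;
        ;   #include <stddef.h>
        ;   #include <stdint.h>
        ;
        ;   size_t strlen_sketch(const char *str)
        ;   {
        ;       const char *p = str;
        ;       if (*p == '\0')
        ;           return 0;
        ;       while (((uintptr_t)p & 15) != 0) {      /* byte-by-byte up to a 16-byte boundary */
        ;           if (*p == '\0')
        ;               return (size_t)(p - str);
        ;           ++p;
        ;       }
        ;       for (;;) {                              /* the vector loop tests 16 bytes per load */
        ;           for (int i = 0; i < 16; ++i)
        ;               if (p[i] == '\0')
        ;                   return (size_t)(p + i - str);
        ;           p += 16;
        ;       }
        ;   }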

__strnlen_forceAlignThreshold EQU 448   ; the code below assumes this value is >= 32

        ; If a strlen is performed on an unterminated buffer, strlen may try to access an invalid address
        ; and generate an access violation. The prime imperative is for us not to generate an AV by loading
        ; characters beyond the end of a valid string. But also, in the case of an invalid string that should
        ; generate an AV, we need to have the AV report the proper bad address. If we only perform 16-byte aligned
        ; vector loads then the first time we touch an invalid page we will be loading from offset 0 in that
        ; page, which is the correct address for the AV to report. Even though we will usually load some bytes
        ; from beyond the end of the string, we won't load bytes beyond the end of the page unless the string
        ; extends into the next page. This is the primary purpose for forcing the vector loads to be
        ; 16-byte aligned.

        ; There is a slight performance boost for using aligned vector loads vs. unaligned ones but
        ; it is only worth the cost of aligning for longer strings (at least 512 chars).

        ; For strings less than 16 characters long the byte-by-byte loop will be about as fast as the
        ; compiler could produce, so we're not losing any performance vs. compiled C in any case.

        ARM64EC_ENTRY_THUNK A64NAME(strlen),1,0
        LEAF_ENTRY_COMDAT A64NAME(strlen)

        ; check for an empty string to avoid a huge perf degradation in this case.
        ldrb    w2, [x0], #0
        cbz     w2, EmptyStr

        mov     x5, x0                  ; keep original x0 value for the final 'sub'
        ; calculate number of bytes until first 16-byte alignment point

        ands    x1, x5, #15             ; x1 = (addr mod 16)
        beq     StrlenMainLoop          ; no need to force alignment if already aligned

        ; we need to align, check whether we are within 16 bytes of the end of the page

        ands    x2, x5, #4095           ; x2 = addr mod PAGE_SIZE
        cmp     x2, #4080
        bgt     AlignByteByByte         ; too close to end of page, must align byte-by-byte

        ; safe to do one unaligned 16-byte vector load to force alignment

        ld1     v0.16b, [x5]            ; don't post-increment x5
        uminv   b1, v0.16b
        fmov    w2, s1                  ; fmov is sometimes 1 cycle faster than 'umov w2, v1.b[0]'
        cbz     w2, FindNullInVector    ; jump when string <= 15 bytes long & not near end of page
        add     x5, x5, #16             ; move x5 forward only to aligned address
        and     x5, x5, 0xFFFFFFFFFFFFFFF0 ; first iter of StrlenMainLoop will retest some bytes we already tested

StrlenMainLoop                          ; test 16 bytes at a time until we find a null byte
        ld1     v0.16b, [x5], #16
        uminv   b1, v0.16b              ; use unsigned min to look for a zero byte; too bad it doesn't set CC
        fmov    w2, s1                  ; need to move min byte into gpr to test it
        cbnz    w2, StrlenMainLoop      ; fall through when any one of the bytes in v0 is zero

        sub     x5, x5, #16             ; undo the last #16 post-increment of x5

FindNullInVector                        ; also jumped to from the unaligned alignment probe above
        ldr     q1, ReverseBytePos      ; load the position indicator mask

        cmeq    v0.16b, v0.16b, #0      ; +----
        and     v0.16b, v0.16b, v1.16b  ; |
        umaxv   b0, v0.16b              ; | see big comment below
        fmov    w2, s0                  ; |
        eor     w2, w2, #15             ; +----

        add     x5, x5, x2              ; x2 is the offset of the null byte within the 16 loaded; point x5 at it
        sub     x0, x5, x0              ; subtract ptr to first char from ptr to null char to get the strlen
        ret

ByteByByteFoundIt                       ; jumped to from the byte-by-byte alignment loop below
        sub     x5, x5, #1              ; Undo the final post-increment that happened on the load of the null char.
        sub     x0, x5, x0              ; With x5 pointing at the null char, x5-x0 is the strlen
        ret

AlignByteByByte
        sub     x1, x1, #16             ; x1 = (addr mod 16) - 16
        neg     x1, x1                  ; x1 = 16 - (addr mod 16) = count for byte-by-byte loop
ByteByByteLoop                          ; test one byte at a time until we are 16-byte aligned
        ldrb    w2, [x5], #1
        cbz     w2, ByteByByteFoundIt   ; branch if byte-at-a-time testing finds the null
        subs    x1, x1, #1
        bgt     ByteByByteLoop          ; fall through when not found and 16-byte aligned
        b       StrlenMainLoop

EmptyStr
        mov     x0, 0
        ret

        ; The challenge is to find a way to efficiently determine which of the 16 bytes we loaded is the end of the string.
        ; The trick is to load a position indicator mask and generate the position of the rightmost null from that.
        ; Little-endian order means when we load the mask below v1.16b[0] has 0x0F, and v0.16b[0] is the byte of the string
        ; that comes first of the 16 we loaded. We do a cmeq, mapping all the characters we loaded to either 0xFF (for nulls)
        ; or 0x00 for non-nulls. Then we and with the mask below. SIMD lanes corresponding to a non-null character will be 0,
        ; and SIMD lanes corresponding to null bytes will have a byte from the mask. We take the max across the bytes of the
        ; vector to find the highest position that corresponds to a null character. The numbering order means we find the
        ; rightmost null in the vector, which is the null that occurred first in memory due to little endian loading.
        ; Exclusive oring the position indicator byte with 15 inverts the order, which gives us the offset of the null
        ; counting from the first character we loaded into the v0 SIMD reg.

ReverseBytePos \
        dcb     15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

        LEAF_END


        ARM64EC_ENTRY_THUNK A64NAME(strnlen),1,0
        LEAF_ENTRY_COMDAT A64NAME(strnlen)      ; start here for strnlen

        mov     x5, x0
        cmp     x1, #16                 ; x1 has length. When < 16 we have to go byte-by-byte
        blo     ShortStrNLen            ; only do byte-by-byte for 0 to 15 bytes.

        ; Do an aligned strnlen

        ands    x3, x5, #15             ; x3 = start address mod 16
        beq     NoNeedToAlign           ; branch on x3 == 0 because it's already aligned

        ands    x2, x5, #4095           ; x2 = start address mod PAGE_SIZE
        cmp     x2, #4080
        bgt     AlignByteByByte_Strnlen ; too close to end of page, must align byte-by-byte

        sub     x3, x3, #16             ; x3 = (start address mod 16) - 16
        neg     x3, x3                  ; x3 = 16 - (start address mod 16) = number of bytes to advance to get aligned
        ld1     v0.16b, [x5]            ; don't post-increment
        uminv   b1, v0.16b
        fmov    w2, s1                  ; fmov is sometimes 1 cycle faster than 'umov w2, v1.b[0]'
        cbz     w2, FindNullInVector_Strnlen ; jump out when null found in first 16 bytes
        sub     x1, x1, x3              ; reduce length remaining by number of bytes needed to get aligned
        add     x5, x5, x3              ; move x5 forward only to aligned address
ResumeAfterAlignByteByByte
        cmp     x1, #16                 ; check for size < 16 after alignment adjustment
        blo     ShortStrNLen
NoNeedToAlign
        asr     x3, x1, #4              ; set up number of iterations remaining after alignment point reached
        ; no need to check here for x3 == 0 because:
        ; - if we didn't align it, it is at least 16 bytes long
        ; - if we did align it, we checked for <16 before coming here
StrNlenMainLoop                         ; test 16 bytes at a time until we find a null byte
        ld1     v0.16b, [x5], #16
        uminv   b1, v0.16b              ; use unsigned min to look for a zero byte; too bad it doesn't set CC
        fmov    w2, s1                  ; need to move min byte into gpr to test it
        cbz     w2, UndoPI_FindNullInVector ; jump out when any one of the bytes in v0 is zero
        subs    x3, x3, #1
        bne     StrNlenMainLoop

        ands    x1, x1, #15             ; check for remainder
        beq     StrNLenOverrun          ; orig buffer size was a multiple of 16 bytes so there is no remainder; go to the overrun case

        ; We're within 16 bytes of the end of the buffer and haven't found a '\0' yet. We know we were originally longer than
        ; 16 bytes so we can do an unaligned vector compare of the last 16 bytes of the buffer, overlapping with some bytes
        ; we already know are non-zero, without fear of underrunning the original front of the buffer. This avoids a more costly
        ; byte-by-byte comparison for the remainder (which would average 32 instructions executed and two branch mispredicts).
        ; At this point:
        ;   x5 points at one of the last 15 chars of the buffer
        ;   x1 has the number of chars remaining in the buffer. 1 <= x1 <= 15
        ;   16 - x1 is the number of characters we have to 'back up'
FastRemainderHandling
        sub     x1, x1, #16             ; x1 = remainder - 16
        neg     x1, x1                  ; x1 = 16 - remainder = number of bytes to back up
        sub     x5, x5, x1              ; back x5 up so the final 16-byte load ends exactly at the end of the buffer
        ld1     v0.16b, [x5], #16
        uminv   b1, v0.16b
        fmov    w2, s1                  ; fmov is sometimes 1 cycle faster than 'umov w2, v1.b[0]'
        cbz     w2, UndoPI_FindNullInVector ; found a '\0'
        b       StrNLenOverrun          ; x5 points one past end of buffer, we're all set for the overrun exit.
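
        ; Worked example (illustrative): with a 16-byte-aligned buffer of 20 bytes and no '\0' in the first
        ; 16 bytes, the main loop leaves x5 = base+16 and x1 & 15 = 4. The code above computes 16 - 4 = 12,
        ; backs x5 up to base+4, and the final unaligned load covers bytes 4..19 -- the last 16 bytes of the
        ; buffer, 12 of which were already known to be non-zero.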

ShortStrNLen
        cbz     x1, StrNLenOverrun      ; no bytes remain to examine; take the overrun exit (returns 0 when the original length was 0)

ShortStrNLenLoop
        ldrb    w2, [x5], #1
        cbz     w2, ByteByByteFoundIt_Strnlen ; branch when byte-at-a-time testing finds the null
        subs    x1, x1, #1
        bhi     ShortStrNLenLoop

StrNLenOverrun
        sub     x0, x5, x0              ; x5 points one past the end of the buffer, x5-x0 is original numberOfElements
        ret

AlignByteByByte_Strnlen
        sub     x3, x3, #16             ; x3 = (addr mod 16) - 16
        neg     x3, x3                  ; x3 = 16 - (addr mod 16) = count for byte-by-byte loop
ByteByByteLoop_Strnlen                  ; test one byte at a time until we are 16-byte aligned
        ldrb    w2, [x5], #1
        cbz     w2, ByteByByteFoundIt_Strnlen ; branch if byte-at-a-time testing finds the null
        subs    x1, x1, #1              ; check for remaining length = 0
        beq     ByteByByteReachedMax_Strnlen ; branch if byte-at-a-time testing reached end of buffer count
        subs    x3, x3, #1
        bgt     ByteByByteLoop_Strnlen  ; fall through when not found and 16-byte aligned
        b       ResumeAfterAlignByteByByte

UndoPI_FindNullInVector                 ; jumped to when the main loop or the remainder load finds a null
        sub     x5, x5, #16             ; undo the last #16 post-increment of x5

FindNullInVector_Strnlen                ; also jumped to from the unaligned alignment probe above
        ldr     q1, ReverseBytePos_Strnlen ; load the position indicator mask

        cmeq    v0.16b, v0.16b, #0      ; +----
        and     v0.16b, v0.16b, v1.16b  ; |
        umaxv   b0, v0.16b              ; | see the big comment above the ReverseBytePos data in strlen
        fmov    w2, s0                  ; |
        eor     w2, w2, #15             ; +----

        add     x5, x5, x2              ; x2 is the offset of the null byte within the 16 loaded; point x5 at it
        sub     x0, x5, x0              ; subtract ptr to first char from ptr to null char to get the string length
        ret

ByteByByteFoundIt_Strnlen               ; jumped to when byte-by-byte scanning loads the null
        sub     x5, x5, #1              ; Undo the final post-increment that happened on the load of the null char.
ByteByByteReachedMax_Strnlen
        sub     x0, x5, x0              ; With x5 pointing at the null char (or one past the end of the buffer), x5-x0 is the result
        ret

ReverseBytePos_Strnlen \
        dcb     15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

        LEAF_END

        END
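
        ; For reference only (the assembler stops at END, so nothing below is assembled): a rough C/NEON-intrinsics
        ; restatement of the FindNullInVector / ReverseBytePos trick used above. The helper name is made up and
        ; the caller must guarantee at least one zero byte among the 16 bytes at p.
        ;
        ;   #include <arm_neon.h>
        ;   #include <stddef.h>
        ;   #include <stdint.h>
        ;
        ;   static size_t first_zero_offset(const uint8_t *p)
        ;   {
        ;       static const uint8_t reverse_pos[16] =
        ;           { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
        ;       uint8x16_t bytes  = vld1q_u8(p);              /* ld1   v0.16b, [x5]           */
        ;       uint8x16_t mask   = vld1q_u8(reverse_pos);    /* ldr   q1, ReverseBytePos     */
        ;       uint8x16_t isnull = vceqzq_u8(bytes);         /* cmeq  v0.16b, v0.16b, #0     */
        ;       uint8x16_t tagged = vandq_u8(isnull, mask);   /* and   v0.16b, v0.16b, v1.16b */
        ;       /* max of (15 - lane) over the null lanes is 15 - (first null's lane);        */
        ;       /* xor with 15 recovers the offset of the first null in memory order          */
        ;       return (size_t)(vmaxvq_u8(tagged) ^ 15);      /* umaxv b0, v0.16b / eor w2, w2, #15 */
        ;   }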