xref: /reactos/sdk/lib/ucrt/string/arm64/strlen.s (revision 4fec953e)
1;
2; strlen.asm
3;
4;      Copyright (c) Microsoft Corporation.  All rights reserved.
5;
6; Optimized strlen and strnlen implementations for ARM64.
7;
8#include "ksarm64.h"
9
10        ; size_t strlen(const char *str);                                // AV's when str == NULL
11        ; size_t strnlen(const char *str, size_t numberOfElements);      // AV's when str == NULL
12
13        ; This file could also define strnlen_s. strnlen_s is currently defined in the header string.h in C
14        ; using a check for null and a call to strnlen. This avoids making the call in the case where the string is null,
15        ; which should be infrequent. However it makes code larger by inlining that check everywhere strnlen_s is called.
16        ; A better alternative would be to modify the standard headers and define strnlen_s here. It would be just one
17        ; instruction because the required return value in x0 is already 0 when null is passed for the first parameter.
18        ;
19        ; EXPORT strnlen_s [FUNC]
20        ; LEAF_ENTRY strnlen_s
21        ; cbz     x0, AnyRet         ; add the label AnyRet in front of any ret instruction you want
22        ;                            ; fallthrough into strnlen code
23        ; ALTERNATE_ENTRY strnlen    ; change LEAF_ENTRY for strnlen to ALTERNATE_ENTRY
24        ; ...                        ; current body of strnlen
25        ; LEAF_END                   ; change strnlen leaf_end to strnlen_s.
26
27#if !defined(_M_ARM64EC)
28
29        EXPORT A64NAME(strlen)    [FUNC]
30        EXPORT A64NAME(strnlen)   [FUNC]
31
32#endif
33
34        SET_COMDAT_ALIGNMENT 5
35
36        ; With strlen we will usually read some bytes past the end of the string. To avoid getting an AV
37        ; when a byte-by-byte implementation would not, we must ensure that we never cross a page boundary with a
38        ; vector load, so we must align the vector loads to 16-byte-aligned boundaries.
39        ;
40        ; For strnlen we know the buffer length and so we won't read any bytes beyond the end of the buffer. This means
41        ; we have a choice whether to arrange our vector loads to be 16-byte aligned. (Note that on arm64 a vector load
42        ; only produces an alignment fault when the vector *elements* are misaligned, so a "16B" vector load will never
43        ; give an alignment fault for user memory). Aligning the vector loads on 16-byte boundaries saves one cycle
44        ; per vector load instruction. The cost of forcing 16-byte aligned loads is the 10 instructions preceding the
45        ; 'NoNeedToAlign' label below. On Cortex-A57, the execution latency of those 10 instructions is 27 cycles,
46        ; assuming no branch mispredict on the 'beq'. To account for the cost of an occasional mispredict we guess a
47        ; mispredict rate of 2% and a mispredict cost of 50 cycles, or 1 cycle per call amortized, 28 total. 28 * 16 = 448.
48        ; In this analysis we are ignoring the chance of extra cache misses due to loads crossing cache lines when
49        ; they are not 16-byte aligned. When the vector loads span cache line boundaries each cache line is referenced
50        ; one more time than it is when the loads are aligned. But we assume that the cache line stays loaded for the
51        ; short time we need to do all the references to it, and so one extra reference won't matter.
52        ; It is expected that the number of cycles (28) will stay about the same for future processor models. If it
53        ; changes radically, it will be worth converting the EQU to a global, using ldr to load it instead of a
54        ; mov-immediate, and dynamically setting the global during CRT startup based on processor model.
55
56__strnlen_forceAlignThreshold  EQU 448                     ; code below assumes must be >= 32
57
58        ; If a strlen is performed on an unterminated buffer, strlen may try to access an invalid address
59        ; and generate an access violation. The prime imperative is for us not to generate an AV by loading
60        ; characters beyond the end of a valid string. But also, in the case of an invalid string that should
61        ; generate an AV, we need to have the AV report the proper bad address. If we only perform 16-byte aligned
62        ; vector loads then the first time we touch an invalid page we will be loading from offset 0 in that
63        ; page, which is the correct address for the AV to report. Even though we will usually load some bytes
64        ; from beyond the end of the string, we won't load bytes beyond the end of the page unless the string
65        ; extends into the next page. This is the primary purpose for forcing the vector loads to be
66        ; 16-byte aligned.
67
68        ; There is a slight performance boost for using aligned vector loads vs. unaligned ones but
69        ; it is only worth the cost of aligning for longer strings (at least 512 chars).
70
71        ; For strings less than 16 characters long the byte-by-byte loop will be about as fast as the
72        ; compiler could produce, so we're not losing any performance vs. compiled C in any case.
73
74        ARM64EC_ENTRY_THUNK A64NAME(strlen),1,0
75        LEAF_ENTRY_COMDAT A64NAME(strlen)
76
77        ; check for empty string to avoid huge perf degradation in this case.
78        ldrb    w2, [x0], #0
79        cbz     w2, EmptyStr
80
81        mov     x5, x0                                      ; keep original x0 value for the final 'sub'
82        ; calculate number of bytes until first 16-byte alignment point
83
84        ands    x1, x5, #15                                 ; x1 = (addr mod 16)
85        beq     StrlenMainLoop                              ; no need to force alignment if already aligned
86
87        ; we need to align, check whether we are within 16 bytes of the end of the page
88
89        ands    x2, x5, #4095
90        cmp     x2, #4080
91        bgt     AlignByteByByte                             ; too close to end of page, must align byte-by-byte
92
93        ; safe to do one unaligned 16-byte vector load to force alignment
94
95        ld1     v0.16b, [x5]                                ; don't post-increment x5
96        uminv   b1, v0.16b
97        fmov    w2, s1                                      ; fmov is sometimes 1 cycle faster than 'umov w2, v1.b[0]'
98        cbz     w2, FindNullInVector                        ; jump when string <= 15 bytes long & not near end of page
99        add     x5, x5, #16                                 ; move x5 forward only to aligned address
100        and     x5, x5, 0xFFFFFFFFFFFFFFF0                  ; first iter of StrlenMainLoop will retest some bytes we already tested
101
102StrlenMainLoop                                              ; test 16 bytes at a time until we find it
103        ld1     v0.16b, [x5], #16
104        uminv   b1, v0.16b                                  ; use unsigned min to look for a zero byte; too bad it doesn't set CC
105        fmov    w2, s1                                      ; need to move min byte into gpr to test it
106        cbnz    w2, StrlenMainLoop                          ; fall through when any one of the bytes in v0 is zero
107
108        sub     x5, x5, #16                                 ; undo the last #16 post-increment of x5
109
110FindNullInVector                                            ; this label is also the target of a jump from strnlen
111        ldr     q1, ReverseBytePos                          ; load the position indicator mask
112
113        cmeq    v0.16b, v0.16b, #0                          ; +----
114        and     v0.16b, v0.16b, v1.16b                      ; |
115        umaxv   b0, v0.16b                                  ; | see big comment below
116        fmov    w2, s0                                      ; |
117        eor     w2, w2, #15                                 ; +----
118
119        add     x5, x5, x2                                  ; which is the offset we need to add to x5 to point at the null byte
120        sub     x0, x5, x0                                  ; subtract ptr to null char from ptr to first char to get the strlen
121        ret
122
123ByteByByteFoundIt                                           ; this label is also the target of a jump from strnlen
124        sub     x5, x5, #1                                  ; Undo the final post-increment that happened on the load of the null char.
125        sub     x0, x5, x0                                  ; With x5 pointing at the null char, x5-x0 is the strlen
126        ret
127
128AlignByteByByte
129        sub     x1, x1, #16                                 ; x1 = (addr mod 16) - 16
130        neg     x1, x1                                      ; x1 = 16 - (addr mod 16) = count for byte-by-byte loop
131ByteByByteLoop                                              ; test one byte at a time until we are 16-byte aligned
132        ldrb    w2, [x5], #1
133        cbz     w2, ByteByByteFoundIt                       ; branch if byte-at-a-time testing finds the null
134        subs    x1, x1, #1
135        bgt     ByteByByteLoop                              ; fall through when not found and 16-byte aligned
136        b       StrlenMainLoop
137
138EmptyStr
139        mov     x0, 0
140        ret
141
142        ; The challenge is to find a way to efficiently determine which of the 16 bytes we loaded is the end of the string.
143        ; The trick is to load a position indicator mask and generate the position of the rightmost null from that.
144        ; Little-endian order means when we load the mask below v1.16b[0] has 0x0F, and v0.16b[0] is the byte of the string
145        ; that comes first of the 16 we loaded. We do a cmeq, mapping all the characters we loaded to either 0xFF (for nulls)
146        ; or 0x00 for non-nulls. Then we and with the mask below. SIMD lanes corresponding to a non-null character will be 0,
147        ; and SIMD lanes corresponding to null bytes will have a byte from the mask. We take the max across the bytes of the
148        ; vector to find the highest position that corresponds to a null character. The numbering order means we find the
149        ; rightmost null in the vector, which is the null that occurred first in memory due to little endian loading.
150        ; Exclusive oring the position indicator byte with 15 inverts the order, which gives us the offset of the null
151        ; counting from the first character we loaded into the v0 SIMD reg.
152
153ReverseBytePos \
154        dcb     15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
155
156        LEAF_END
157
158
159        ARM64EC_ENTRY_THUNK A64NAME(strnlen),1,0
160        LEAF_ENTRY_COMDAT A64NAME(strnlen)                  ; start here for strnlen
161
162        mov     x5, x0
163        cmp     x1, #16                                     ; x1 has length. When < 16 we have to go byte-by-byte
164        blo     ShortStrNLen                                ; only do byte-by-byte for 0 to 15 bytes.
165
166        ; Do an aligned strnlen
167
168        ands    x3, x5, #15                                 ; x3 = start address mod 16
169        beq     NoNeedToAlign                               ; branch on x3 == 0 because its already aligned
170
171        ands    x2, x5, #4095                               ; x2 = start address mod (PAGE_SIZE - 1)
172        cmp     x2, #4080
173        bgt     AlignByteByByte_Strnlen                     ; too close to end of page, must align byte-by-byte
174
175        sub     x3, x3, #16                                 ; x3 = (start address mod 16) - 16
176        neg     x3, x3                                      ; x3 = 16 - (start address mod 16) = number of bytes to advance to get aligned
177        ld1     v0.16b, [x5]                                ; don't post-increment
178        uminv   b1, v0.16b
179        fmov    w2, s1                                      ; fmov is sometimes 1 cycle faster than 'umov w2, v1.b[0]'
180        cbz     w2, FindNullInVector_Strnlen                ; jump out when null found in first 16 bytes
181        sub     x1, x1, x3                                  ; reduce length remaining by number of bytes needed to get aligned
182        add     x5, x5, x3                                  ; move x5 forward only to aligned address
183ResumeAfterAlignByteByByte
184        cmp     x1, #16                                     ; check for size < 16 after alignment adjustment
185        blo     ShortStrNLen
186NoNeedToAlign
187        asr     x3, x1, #4                                  ; set up number of interations remaining after alignment point reached
188                                                            ; no need to check here for x3 == 0 because:
189                                                            ; - if we didn't align it, it is at least 16 bytes long
190                                                            ; - if we did align it, we checked for <16 before coming here
191StrNlenMainLoop                                             ; test 16 bytes at a time until we find it
192        ld1     v0.16b, [x5], #16
193        uminv   b1, v0.16b                                  ; use unsigned min to look for a zero byte; too bad it doesn't set CC
194        fmov    w2, s1                                      ; need to move min byte into gpr to test it
195        cbz     w2, UndoPI_FindNullInVector                 ; jump out to when any one of the bytes in v0 is zero
196        subs    x3, x3, #1
197        bne     StrNlenMainLoop
198
199        ands    x1, x1, #15                                 ; check for remainder
200        beq     StrNLenOverrun                              ; orig buffer size was multiple of 16 bytes so no remainder; goto overrun case
201
202        ; We're within 16 bytes of the end of the buffer and haven't found a '\0' yet. We know we were originally longer than
203        ; 16 bytes so we can do an unaligned vector compare of the last 16 bytes of the buffer, overlapping with some bytes
204        ; we already know are non-zero, without fear of underrunning the original front of the buffer. This avoids a more costly
205        ; byte-by-byte comparison for the remainder (which would average 32 instructions executed and two branch mispredicts).
206        ; At this point:
207        ; x5 points at one of the last 15 chars of the buffer
208        ; x1 has the number of chars remaining in the buffer. 1 <= x1 <= 15
209        ; 16 - x1 is the number of characters we have to 'back up'
210FastRemainderHandling
211        sub     x1, x1, #16
212        neg     x1, x1
213        sub     x5, x5, x1
214        ld1     v0.16b, [x5], #16
215        uminv   b1, v0.16b
216        fmov    w2, s1                                      ; fmov is sometimes 1 cycle faster than 'umov w2, v1.b[0]'
217        cbz     w2, UndoPI_FindNullInVector                 ; found a '\0'
218        b       StrNLenOverrun                              ; x5 points one past end of buffer, we're all set for the overrun exit.
219
220ShortStrNLen
221        cbz     x1, StrNLenOverrun                          ; if original length was zero, we must return 0 without touching the buffer
222
223ShortStrNLenLoop
224        ldrb    w2, [x5], #1
225        cbz     w2, ByteByByteFoundIt_Strnlen               ; jump into other function to avoid code duplication
226        subs    x1, x1, #1
227        bhi     ShortStrNLenLoop
228
229StrNLenOverrun
230        sub     x0, x5, x0                                  ; x5 points one past the end of the buffer, x5-x0 is original numberOfElements
231        ret
232
233AlignByteByByte_Strnlen
234        sub     x3, x3, #16                                 ; x3 = (addr mod 16) - 16
235        neg     x3, x3                                      ; x3 = 16 - (addr mod 16) = count for byte-by-byte loop
236ByteByByteLoop_Strnlen                                      ; test one byte at a time until we are 16-byte aligned
237        ldrb    w2, [x5], #1
238        cbz     w2, ByteByByteFoundIt_Strnlen               ; branch if byte-at-a-time testing finds the null
239        subs    x1, x1, #1                                  ; check remaining length = 0
240        beq     ByteByByteReachedMax_Strnlen                ; branch if byte-at-a-time testing reached end of buffer count
241        subs    x3, x3, #1
242        bgt     ByteByByteLoop_Strnlen                      ; fall through when not found and 16-byte aligned
243        b       ResumeAfterAlignByteByByte
244
245UndoPI_FindNullInVector                                     ; this label is the target of a jump from strnlen
246        sub     x5, x5, #16                                 ; undo the last #16 post-increment of x5
247
248FindNullInVector_Strnlen                                    ; this label is also the target of a jump from strnlen
249        ldr     q1, ReverseBytePos_Strnlen                  ; load the position indicator mask
250
251        cmeq    v0.16b, v0.16b, #0                          ; +----
252        and     v0.16b, v0.16b, v1.16b                      ; |
253        umaxv   b0, v0.16b                                  ; | see big comment below
254        fmov    w2, s0                                      ; |
255        eor     w2, w2, #15                                 ; +----
256
257        add     x5, x5, x2                                  ; which is the offset we need to add to x5 to point at the null byte
258        sub     x0, x5, x0                                  ; subtract ptr to null char from ptr to first char to get the strlen
259        ret
260
261ByteByByteFoundIt_Strnlen                                   ; this label is also the target of a jump from strnlen
262        sub     x5, x5, #1                                  ; Undo the final post-increment that happened on the load of the null char.
263ByteByByteReachedMax_Strnlen
264        sub     x0, x5, x0                                  ; With x5 pointing at the null char, x5-x0 is the strlen
265        ret
266
267ReverseBytePos_Strnlen \
268        dcb     15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
269
270        LEAF_END
271
272        END
273