/*
 * strcmp for ARMv7
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */

#include "../asmdefs.h"

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string.  If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case.  However,
   if strings are likely to be identical (eg because we're
   verifying a hit in a hash table), then this check is largely
   redundant.  */

#define STRCMP_NO_PRECHECK	0

	/* This version uses Thumb-2 code.  */
	.thumb
	.syntax unified

#ifdef __ARM_BIG_ENDIAN
#define S2LO	lsl
#define S2LOEQ	lsleq
#define S2HI	lsr
#define MSB	0x000000ff
#define LSB	0xff000000
#define BYTE0_OFFSET	24
#define BYTE1_OFFSET	16
#define BYTE2_OFFSET	8
#define BYTE3_OFFSET	0
#else /* not __ARM_BIG_ENDIAN */
#define S2LO	lsr
#define S2LOEQ	lsreq
#define S2HI	lsl
#define BYTE0_OFFSET	0
#define BYTE1_OFFSET	8
#define BYTE2_OFFSET	16
#define BYTE3_OFFSET	24
#define MSB	0xff000000
#define LSB	0x000000ff
#endif /* not __ARM_BIG_ENDIAN */

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2


	/* Macro to compute and return the result value for word-aligned
	   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1, lsr #24
	bx	lr
#else
	/* To use the big-endian trick we'd have to reverse all three words.
	   That's slower than this approach.  */
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1

	bx	lr
#endif
	.endm
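
	/* A hedged C sketch (illustration only, never assembled) of what
	   the little-endian path of the macro above computes.  The names
	   mirror the macro arguments; synd is known to be non-zero here.
	   REV makes the first string byte most significant, and CLZ
	   rounded down to a byte boundary (the BIC #7) then gives the
	   shift that discards the bytes already known to be equal:

	     int epilogue (uint32_t synd, uint32_t d1, uint32_t d2)
	     {
	       unsigned sh = __builtin_clz (__builtin_bswap32 (synd)) & ~7u;
	       return (int) ((d1 >> sh) & 255) - (int) ((d2 >> sh) & 255);
	     }
	 */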
	.text
	.p2align	5
L(strcmp_start_addr):
#if STRCMP_NO_PRECHECK == 0
L(fastpath_exit):
	sub	r0, r2, r3
	bx	lr
	nop
#endif
ENTRY_ALIGN (__strcmp_arm, 0)
#if STRCMP_NO_PRECHECK == 0
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	L(fastpath_exit)
#endif
	strd	r4, r5, [sp, #-16]!
	.cfi_def_cfa_offset 16
	.cfi_offset 4, -16
	.cfi_offset 5, -12
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_offset 6, -8
	.cfi_offset 7, -4
	mvn	const_m1, #0
	lsl	r2, tmp1, #29
	cbz	r2, L(loop_aligned8)

L(not_aligned):
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	L(misaligned8)

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	L(start_realigned8)
	orn	data1b, data1b, tmp1
	mov	data1a, const_m1
	orn	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	L(start_realigned8)

	/* Unroll the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12	/* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
L(loop_aligned8):
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
L(start_realigned8):
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, L(diff_in_a)
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, L(diff_in_b)

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a	/* Only need if s_a == 0.  */
	beq	L(loop_aligned8)

L(diff_found):
	cbnz	syndrome_a, L(diff_in_a)

L(diff_in_b):
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

L(diff_in_a):
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
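
	/* A hedged C sketch (illustration only, never assembled) of the
	   UADD8/SEL trick used above.  UADD8 of a byte with 0xff carries
	   out, and so sets that byte's GE flag, exactly when the byte is
	   non-zero; SEL then keeps the XOR difference for those bytes and
	   forces 0xff where data1 held a NUL.  The syndrome is therefore
	   zero iff the words match and contain no terminator:

	     uint32_t syndrome (uint32_t data1, uint32_t data2)
	     {
	       uint32_t synd = 0;
	       for (int i = 0; i < 32; i += 8)
	         {
	           uint32_t b1 = (data1 >> i) & 0xff;
	           uint32_t b2 = (data2 >> i) & 0xff;
	           synd |= (b1 != 0 ? (b1 ^ b2) : 0xffu) << i;
	         }
	       return synd;
	     }
	 */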
	.cfi_restore_state
L(misaligned8):
	tst	tmp1, #3
	bne	L(misaligned4)
	ands	tmp1, src1, #3
	bne	L(mutual_align4)

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
L(loop_aligned4):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned4):
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, L(aligned4_done)
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	L(loop_aligned4)

L(aligned4_done):
	strcmp_epilogue_aligned syndrome, data1, data2, 0

L(mutual_align4):
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	b	L(start_realigned4)

L(misaligned4):
	ands	tmp1, src1, #3
	beq	L(src1_aligned)
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	L(aligned_m2)
	bcs	L(aligned_m1)

#if STRCMP_NO_PRECHECK == 1
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m1):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	add	src2, src2, #4
	cbnz	data2, L(src1_aligned)
#else /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbnz	data2, L(aligned_m1)
#endif

L(misaligned_exit):
	.cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16
	.cfi_restore 4
	bx	lr

#if STRCMP_NO_PRECHECK == 0
L(aligned_m1):
	add	src2, src2, #4
#endif
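
	/* A hedged C sketch (illustration only, never assembled) of the
	   overlap cases below.  With src1 word-aligned and src2 ofs bytes
	   past a word boundary (ofs = 1, 2 or 3), a full word of string 2
	   would be built from two aligned loads; on a little-endian build:

	     uint32_t combine (uint32_t lo, uint32_t hi, unsigned ofs)
	     {
	       return (lo >> (8 * ofs)) | (hi << (32 - 8 * ofs));
	     }

	   The code below never materialises this combined word; it
	   compares the partial words directly so the inner loops stay
	   short.  */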
L(src1_aligned):
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	L(overlap1)		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	L(overlap2)		/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
L(overlap3):
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap3)
4:
	S2LO	data2, data2, #8
	b	L(strcmp_tail)

5:
	bics	syndrome, syndrome, #MSB
	bne	L(strcmp_done_equal)

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	neg	result, result
	bx	lr

6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	L(strcmp_tail)

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
L(overlap2):
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap2)
4:
	S2LO	data2, data2, #16
	b	L(strcmp_tail)
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	L(strcmp_done_equal)

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	L(strcmp_tail)

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	L(strcmp_tail)

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
L(overlap1):
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap1)
4:
	S2LO	data2, data2, #24
	b	L(strcmp_tail)
5:
	tst	syndrome, #LSB
	bne	L(strcmp_done_equal)
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	L(strcmp_tail)

L(strcmp_done_equal):
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	bx	lr

L(strcmp_tail):
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	sub	result, result, data2, lsr #24
	bx	lr

END (__strcmp_arm)

#endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 */