/*
 * Copyright (c) 2012-2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string.  If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case.  However,
   if strings are likely to be identical (eg because we're
   verifying a hit in a hash table), then this check is largely
   redundant.  */

/* This version uses Thumb-2 code.  */
        .thumb
        .syntax unified

/* Parameters and result.  */
#define src1            r0
#define src2            r1
#define result          r0      /* Overlaps src1.  */

/* Internal variables.  */
#define tmp1            r4
#define tmp2            r5
#define const_m1        r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a          r2
#define data1b          r3
#define data2a          r6
#define data2b          r7
#define syndrome_a      tmp1
#define syndrome_b      tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1           r2
#define data2           r3
#define syndrome        tmp2


        /* Macro to compute and return the result value for word-aligned
           cases.  */
        .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
        /* If data1 contains a zero byte, then syndrome will contain a 1 in
           bit 7 of that byte.  Otherwise, the highest set bit in the
           syndrome will highlight the first different bit.  It is therefore
           sufficient to extract the eight bits starting with the syndrome
           bit.  */
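        /* A worked example (values assumed for illustration): comparing
           "ABC\0" against "ABCD" gives, in big-endian order,
           data1 = 0x41424300 and data2 = 0x41424344, so the syndrome is
           0x000000ff (the NUL byte marks itself with 0xff).  CLZ yields
           24, both data words are shifted left by 24, and the subtract
           below computes 0x00 - 0x44 < 0, the correct ordering.  */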
        clz     tmp1, \synd
        lsl     r1, \d2, tmp1
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        .cfi_restore 6
        .cfi_restore 7
        lsl     \d1, \d1, tmp1
        .cfi_remember_state
        lsr     result, \d1, #24
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        sub     result, result, r1, lsr #24
        bx      lr
#else
        /* To use the big-endian trick we'd have to reverse all three words.
           That's slower than this approach.  */
        rev     \synd, \synd
        clz     tmp1, \synd
        bic     tmp1, tmp1, #7
        lsr     r1, \d2, tmp1
        .cfi_remember_state
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        .cfi_restore 6
        .cfi_restore 7
        lsr     \d1, \d1, tmp1
        and     result, \d1, #255
        and     r1, r1, #255
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        sub     result, result, r1

        bx      lr
#endif
        .endm

        .text
        .p2align        5
.Lstrcmp_start_addr:
#ifndef STRCMP_NO_PRECHECK
.Lfastpath_exit:
        sub     r0, r2, r3
        bx      lr
        nop
#endif
def_fn  strcmp
#ifndef STRCMP_NO_PRECHECK
        ldrb    r2, [src1]
        ldrb    r3, [src2]
        cmp     r2, #1
        it      cs
        cmpcs   r2, r3
        bne     .Lfastpath_exit
#endif
        .cfi_sections .debug_frame
        .cfi_startproc
        strd    r4, r5, [sp, #-16]!
        .cfi_def_cfa_offset 16
        .cfi_offset 4, -16
        .cfi_offset 5, -12
        orr     tmp1, src1, src2
        strd    r6, r7, [sp, #8]
        .cfi_offset 6, -8
        .cfi_offset 7, -4
        mvn     const_m1, #0
        lsl     r2, tmp1, #29
        cbz     r2, .Lloop_aligned8

.Lnot_aligned:
        eor     tmp1, src1, src2
        tst     tmp1, #7
        bne     .Lmisaligned8

        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        and     tmp1, src1, #7
        bic     src1, src1, #7
        and     tmp2, tmp1, #3
        bic     src2, src2, #7
        lsl     tmp2, tmp2, #3  /* Bytes -> bits.  */
        ldrd    data1a, data1b, [src1], #16
        tst     tmp1, #4
        ldrd    data2a, data2b, [src2], #16
        /* In thumb code we can't use MVN with a register shift, but
           we do have ORN.  */
        S2HI    tmp1, const_m1, tmp2
        orn     data1a, data1a, tmp1
        orn     data2a, data2a, tmp1
        beq     .Lstart_realigned8
        orn     data1b, data1b, tmp1
        mov     data1a, const_m1
        orn     data2b, data2b, tmp1
        mov     data2a, const_m1
        b       .Lstart_realigned8

        /* Unwind the inner loop by a factor of 2, giving 16 bytes per
           pass.  */
        .p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
        .p2align 2      /* Always word aligned.  */
.Lloop_aligned8:
        ldrd    data1a, data1b, [src1], #16
        ldrd    data2a, data2b, [src2], #16
.Lstart_realigned8:
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits.  */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        cbnz    syndrome_a, .Ldiff_in_a
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits.  */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
        cbnz    syndrome_b, .Ldiff_in_b

        ldrd    data1a, data1b, [src1, #-8]
        ldrd    data2a, data2b, [src2, #-8]
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits.  */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits.  */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
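        /* The uadd8/sel pairs above rely on a flag side-effect: adding
           0xff to each byte of data1 carries out, and so sets that byte's
           GE flag, exactly when the byte is non-zero.  SEL then writes
           0xff into the syndrome wherever data1 held a NUL byte and the
           EOR difference elsewhere, so a non-zero syndrome means "bytes
           differ or the string ended".  The uadd8 destination register
           is only a scratch; nothing reads it.  */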
        /* Can't use CBZ for backwards branch.  */
        orrs    syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
        beq     .Lloop_aligned8

.Ldiff_found:
        cbnz    syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
        strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
        .cfi_restore_state
        strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

        .cfi_restore_state
.Lmisaligned8:
        tst     tmp1, #3
        bne     .Lmisaligned4
        ands    tmp1, src1, #3
        bne     .Lmutual_align4

        /* Unrolled by a factor of 2, to reduce the number of post-increment
           operations.  */
.Lloop_aligned4:
        ldr     data1, [src1], #8
        ldr     data2, [src2], #8
.Lstart_realigned4:
        uadd8   syndrome, data1, const_m1       /* Only need GE bits.  */
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cbnz    syndrome, .Laligned4_done
        ldr     data1, [src1, #-4]
        ldr     data2, [src2, #-4]
        uadd8   syndrome, data1, const_m1
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cmp     syndrome, #0
        beq     .Lloop_aligned4

.Laligned4_done:
        strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
        .cfi_restore_state
        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        lsl     tmp1, tmp1, #3  /* Bytes -> bits.  */
        bic     src1, src1, #3
        ldr     data1, [src1], #8
        bic     src2, src2, #3
        ldr     data2, [src2], #8

        /* In thumb code we can't use MVN with a register shift, but
           we do have ORN.  */
        S2HI    tmp1, const_m1, tmp1
        orn     data1, data1, tmp1
        orn     data2, data2, tmp1
        b       .Lstart_realigned4

.Lmisaligned4:
        ands    tmp1, src1, #3
        beq     .Lsrc1_aligned
        sub     src2, src2, tmp1
        bic     src1, src1, #3
        lsls    tmp1, tmp1, #31
        ldr     data1, [src1], #4
        beq     .Laligned_m2
        bcs     .Laligned_m1

#ifdef STRCMP_NO_PRECHECK
        ldrb    data2, [src2, #1]
        uxtb    tmp1, data1, ror #BYTE1_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m1:
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        add     src2, src2, #4
        cbnz    data2, .Lsrc1_aligned
#else  /* STRCMP_NO_PRECHECK */
        /* If we've done the pre-check, then we don't need to check the
           first byte again here.  */
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbnz    data2, .Laligned_m1
#endif

.Lmisaligned_exit:
        .cfi_remember_state
        mov     result, tmp1
        ldr     r4, [sp], #16
        .cfi_restore 4
        bx      lr

#ifndef STRCMP_NO_PRECHECK
.Laligned_m1:
        add     src2, src2, #4
#endif
.Lsrc1_aligned:
        .cfi_restore_state
        /* src1 is word aligned, but src2 has no common alignment
           with it.  */
        ldr     data1, [src1], #4
        lsls    tmp1, src2, #31         /* C=src2[1], Z=src2[0].  */

        bic     src2, src2, #3
        ldr     data2, [src2], #4
        bhi     .Loverlap1      /* C=1, Z=0 => src2[1:0] = 0b11.  */
        bcs     .Loverlap2      /* C=1, Z=1 => src2[1:0] = 0b10.  */
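        /* In the .LoverlapN cases below, src2's word loads are offset
           from src1's, so each word of src1 must be checked against
           bytes drawn from two consecutive words of src2, stitched
           together with shifted compares.  S2LO/S2HI (expected, along
           with MSB, LSB and the BYTEn_OFFSETs, from the file that
           includes this one) shift a word towards the start/end of the
           string in memory order, so one sequence serves both
           endiannesses.  */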
        /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
        bic     tmp1, data1, #MSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #8
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #24
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap3
4:
        S2LO    data2, data2, #8
        b       .Lstrcmp_tail

5:
        bics    syndrome, syndrome, #MSB
        bne     .Lstrcmp_done_equal

        /* We can only get here if the MSB of data1 contains 0, so
           fast-path the exit.  */
        ldrb    result, [src2]
        .cfi_remember_state
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        neg     result, result
        bx      lr

6:
        .cfi_restore_state
        S2LO    data1, data1, #24
        and     data2, data2, #LSB
        b       .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
        and     tmp1, data1, const_m1, S2LO #16
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #16
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #16
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap2
4:
        S2LO    data2, data2, #16
        b       .Lstrcmp_tail
5:
        ands    syndrome, syndrome, const_m1, S2LO #16
        bne     .Lstrcmp_done_equal

        ldrh    data2, [src2]
        S2LO    data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
        lsl     data2, data2, #16
#endif
        b       .Lstrcmp_tail

6:
        S2LO    data1, data1, #16
        and     data2, data2, const_m1, S2LO #16
        b       .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
        and     tmp1, data1, #LSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #24
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #8
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap1
4:
        S2LO    data2, data2, #24
        b       .Lstrcmp_tail
5:
        tst     syndrome, #LSB
        bne     .Lstrcmp_done_equal
        ldr     data2, [src2]
6:
        S2LO    data1, data1, #8
        bic     data2, data2, #MSB
        b       .Lstrcmp_tail

.Lstrcmp_done_equal:
        mov     result, #0
        .cfi_remember_state
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        bx      lr

.Lstrcmp_tail:
        .cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
        rev     data1, data1
        rev     data2, data2
        /* Now everything looks big-endian...  */
#endif
        uadd8   tmp1, data1, const_m1
        eor     tmp1, data1, data2
        sel     syndrome, tmp1, const_m1
        clz     tmp1, syndrome
        lsl     data1, data1, tmp1
        lsl     data2, data2, tmp1
        lsr     result, data1, #24
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        sub     result, result, data2, lsr #24
        bx      lr
        .cfi_endproc
        .size   strcmp, . - .Lstrcmp_start_addr
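/* Note: this file is not self-contained.  def_fn (which emits the
   label and linkage directives for strcmp) and the endian-dependent
   helpers used above (S2LO, S2HI, MSB, LSB, BYTE1_OFFSET..BYTE3_OFFSET)
   must be provided by the file that includes this one.  A minimal
   sketch of plausible little-endian definitions (assumed, for
   illustration only; a real wrapper must also cover big-endian):

       #define S2LO lsr          -- shift towards the string's start
       #define S2HI lsl          -- shift towards the string's end
       #define MSB  0xff000000   -- last byte of a word, in string order
       #define LSB  0x000000ff   -- first byte of a word, in string order
       #define BYTE1_OFFSET 8
       #define BYTE2_OFFSET 16
       #define BYTE3_OFFSET 24  */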