/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memchr.S
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Finds characters in a memory area.  Optimized for the Alpha:
 *
 *   - memory accessed as aligned quadwords only
 *   - uses cmpbge to compare 8 bytes in parallel
 *   - does binary search to find 0 byte in last
 *     quadword (HAKMEM needed 12 instructions to
 *     do this instead of the 9 instructions that
 *     binary search needs).
 *
 * For correctness consider that:
 *
 *   - only minimum number of quadwords may be accessed
 *   - the third argument is an unsigned long
 *
 * Register usage, as established by the code below:
 *
 *   $16  in:  start address of the area (may be unaligned)
 *   $17  in:  character to find; only the low byte is used, and it is
 *             replicated into all eight bytes of $17
 *   $18  in:  length in bytes; later reused for the address of the last
 *             quadword and for the number of bytes left
 *   $0   out: address of the first matching byte, or 0 when there is
 *             none; while searching it holds the address of the
 *             quadword under examination
 *   $5        one past the last byte to search ($16 + bounded length)
 *   $3        all ones (mask source); scratch once a match is found
 *   $1, $4    quadword data, loaded alternately one quad ahead
 *   $2        cmpbge result: bit i set means byte i matched
 *   $6, $7    scratch
 *
 * Matching works by XORing the data with the replicated character, so
 * that matching bytes become zero, and then using "cmpbge $31, x" to
 * flag the zero bytes.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */
#include <linux/export.h>
	.set noreorder
	.set noat

	.align	4
	.globl memchr
	.ent memchr
memchr:
	.frame $30,0,$26,0
	.prologue 0

	# Hack -- if someone passes in (size_t)-1, hoping to just
	# search till the end of the address space, we will overflow
	# below when we find the address of the last byte.  Given
	# that we will never have a 56-bit address space, cropping
	# the length is the easiest way to avoid trouble.
	zap	$18, 0x80, $5	# U : Bound length (clear the top byte)
	beq	$18, $not_found	# U : zero length: nothing to search
	ldq_u	$1, 0($16)	# L : load first quadword Latency=3
	and	$17, 0xff, $17	# E : L L U U : 00000000000000ch

	insbl	$17, 1, $2	# U : 000000000000ch00
	cmpult	$18, 9, $4	# E : short area (<= 8 bytes)?
	or	$2, $17, $17	# E : 000000000000chch
	lda	$3, -1($31)	# E : U L L U : $3 = all ones

	sll	$17, 16, $2	# U : 00000000chch0000
	addq	$16, $5, $5	# E : Max search address
	or	$2, $17, $17	# E : 00000000chchchch
	sll	$17, 32, $2	# U : U L L U : chchchch00000000

	or	$2, $17, $17	# E : chchchchchchchch
	extql	$1, $16, $7	# U : $7 is upper bits
	beq	$4, $first_quad	# U : more than 8 bytes: take the long path
	ldq_u	$6, -1($5)	# L : L U U L : eight or less bytes to search Latency=3

	extqh	$6, $16, $6	# U : 2 cycle stall for $6
	mov	$16, $0		# E : byte 0 of $1 will correspond to $16
	nop			# E :
	or	$7, $6, $1	# E : L U L U $1 = quadword starting at $16

	# Deal with the case where at most 8 bytes remain to be searched
	# in $1.  E.g.:
	#	$18 = 6
	#	$1 = ????c6c5c4c3c2c1
	# On entry $0 is the address that byte 0 of $1 came from.
$last_quad:
	negq	$18, $6		# E :
	xor	$17, $1, $1	# E : matching bytes become zero
	srl	$3, $6, $6	# U : $6 = mask of $18 bits set
	cmpbge	$31, $1, $2	# E : L U L U : one flag bit per zero byte

	nop
	nop
	and	$2, $6, $2	# E : ignore matches beyond the last byte
	beq	$2, $not_found	# U : U L U L

	/*
	 * On entry $2 has one bit set per matching byte and $0 is the
	 * address of byte 0; the result is $0 plus the index of the
	 * lowest set bit of $2.
	 */
$found_it:
#ifdef CONFIG_ALPHA_EV67
	/*
	 * Since we are guaranteed to have set one of the bits, we don't
	 * have to worry about coming back with a 0x40 out of cttz...
	 */
	cttz	$2, $3		# U0 : index of the lowest set bit
	addq	$0, $3, $0	# E : All done
	nop			# E :
	ret			# L0 : L U L U
#else
	/*
	 * Slow and clunky.  It can probably be improved.
	 * An exercise left for others.
	 *
	 * Isolate the lowest set bit of $2, then binary-search its
	 * position: add 4, 2 and 1 to $0 whenever the bit is not in
	 * the half selected by the masks 0x0f, 0x33 and 0x55.
	 */
	negq	$2, $3		# E :
	and	$2, $3, $2	# E : $2 = lowest set bit only
	and	$2, 0x0f, $1	# E :
	addq	$0, 4, $3	# E :

	cmoveq	$1, $3, $0	# E : Latency 2, extra map cycle
	nop			# E : keep with cmov
	and	$2, 0x33, $1	# E :
	addq	$0, 2, $3	# E : U L U L : 2 cycle stall on $0

	cmoveq	$1, $3, $0	# E : Latency 2, extra map cycle
	nop			# E : keep with cmov
	and	$2, 0x55, $1	# E :
	addq	$0, 1, $3	# E : U L U L : 2 cycle stall on $0

	cmoveq	$1, $3, $0	# E : Latency 2, extra map cycle
	nop
	nop
	ret			# L0 : L U L U
#endif

	# Deal with the case where $18 > 8 bytes remain to be
	# searched.  $16 may not be aligned.
	.align	4
$first_quad:
	andnot	$16, 0x7, $0	# E : $0 = address of the aligned first quad
	insqh	$3, $16, $2	# U : $2 = 0000ffffffffffff ($16<0:2> ff)
	xor	$1, $17, $1	# E : matching bytes become zero
	or	$1, $2, $1	# E : U L U L $1 = ====ffffffffffff

	/* Bytes in front of $16 were forced to 0xff above and cannot match. */
	cmpbge	$31, $1, $2	# E :
	bne	$2, $found_it	# U :
	# At least one byte left to process.
	ldq	$1, 8($0)	# L :
	subq	$5, 1, $18	# E : U L U L : $18 = address of the last byte

	addq	$0, 8, $0	# E :
	# Make $18 point to last quad to be accessed (the
	# last quad may or may not be partial).
	andnot	$18, 0x7, $18	# E :
	cmpult	$0, $18, $2	# E :
	beq	$2, $final	# U : U L U L : already at the last quad

	# At least two quads remain to be accessed.

	subq	$18, $0, $4	# E : $4 <- byte distance to the last quad
	and	$4, 8, $4	# E : odd number of quads?
	bne	$4, $odd_quad_count  # U :
	# At least three quads remain to be accessed
	mov	$1, $4		# E : L U L U : move prefetched value to correct reg

	/*
	 * Two quads per iteration.  The first half tests $4 while loading
	 * the next quad into $1, the second half tests $1 while loading
	 * the next quad into $4.  $0 is advanced only after a quad has
	 * been found not to match, so $found_it always sees the address
	 * of the quad that produced $2.
	 */
	.align	4
$unrolled_loop:
	ldq	$1, 8($0)	# L : prefetch $1
	xor	$17, $4, $2	# E :
	cmpbge	$31, $2, $2	# E :
	bne	$2, $found_it	# U : U L U L

	addq	$0, 8, $0	# E :
	nop			# E :
	nop			# E :
	nop			# E :

$odd_quad_count:
	xor	$17, $1, $2	# E :
	ldq	$4, 8($0)	# L : prefetch $4
	cmpbge	$31, $2, $2	# E :
	addq	$0, 8, $6	# E :

	bne	$2, $found_it	# U :
	cmpult	$6, $18, $6	# E : is the next quad still before the last one?
	addq	$0, 8, $0	# E :
	nop			# E :

	bne	$6, $unrolled_loop	# U :
	mov	$4, $1		# E : move prefetched value into $1
	nop			# E :
	nop			# E :

	/* $0 is now the last quad to access and $1 holds its contents. */
$final:	subq	$5, $0, $18	# E : $18 <- number of bytes left to do
	nop			# E :
	nop			# E :
	bne	$18, $last_quad	# U :

$not_found:
	mov	$31, $0		# E : return 0
	nop			# E :
	nop			# E :
	ret			# L0 :

	.end memchr
	EXPORT_SYMBOL(memchr)