// +build !appengine
// +build gc
// +build !purego

#include "textflag.h"

// Register usage shared by both routines in this file:
//
//	AX   h, the running hash (writeBlocks keeps the x pointer here instead)
//	BX   loop bound compared against the cursor in CX
//	CX   read cursor advancing through b
//	DX   n, the length of b
//	R8   v1; reused for k1 in the tail of Sum64
//	R9   v2
//	R10  v3
//	R11  v4
//	R12  scratch
//	R13  prime1v
//	R14  prime2v
//	R15  prime4v

// round loads the 8-byte word at (CX), steps CX past it, and folds the word
// into the accumulator register lane:
//
//	lane = rol64(lane + word*prime2v, 31) * prime1v
//
// R13 must hold prime1v and R14 must hold prime2v. R12 is overwritten.
#define round(lane) \
	MOVQ  (CX), R12 \
	ADDQ  $8, CX \
	IMULQ R14, R12 \
	ADDQ  R12, lane \
	ROLQ  $31, lane \
	IMULQ R13, lane

// mergeRound scrambles lane and merges it into the hash register h:
//
//	lane = rol64(lane*prime2v, 31) * prime1v
//	h    = (h ^ lane)*prime1v + prime4v
//
// R13 must hold prime1v, R14 prime2v and R15 prime4v. lane is overwritten.
#define mergeRound(h, lane) \
	IMULQ R14, lane \
	ROLQ  $31, lane \
	IMULQ R13, lane \
	XORQ  lane, h \
	IMULQ R13, h \
	ADDQ  R15, h

// func Sum64(b []byte) uint64
TEXT ·Sum64(SB), NOSPLIT, $0-32
	// Fetch the slice header: CX = start of b, DX = len(b).
	MOVQ b_base+0(FP), CX
	MOVQ b_len+8(FP), DX

	// Keep the multiplier constants in registers for the whole function.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14
	MOVQ ·prime4v(SB), R15

	// BX = start + len(b) - 32: the last address at which a full 32-byte
	// block can begin.
	LEAQ -32(CX)(DX*1), BX

	// Inputs shorter than one block skip the four-lane stage entirely.
	CMPQ DX, $32
	JLT  shortInput

	// Seed the lanes:
	//   v1 = prime1v + prime2v, v2 = prime2v, v3 = 0, v4 = -prime1v.
	MOVQ R14, R8
	ADDQ R13, R8
	MOVQ R14, R9
	XORQ R10, R10
	MOVQ R13, R11
	NEGQ R11

stripeLoop:
	// Each pass consumes 32 bytes, 8 per lane.
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	// Keep going while CX <= BX, i.e. another full block is available.
	CMPQ CX, BX
	JLE  stripeLoop

	// h = rol64(v1, 1) + rol64(v2, 7) + rol64(v3, 12) + rol64(v4, 18)
	MOVQ R8, AX
	ROLQ $1, AX
	MOVQ R9, R12
	ROLQ $7, R12
	ADDQ R12, AX
	MOVQ R10, R12
	ROLQ $12, R12
	ADDQ R12, AX
	MOVQ R11, R12
	ROLQ $18, R12
	ADDQ R12, AX

	// Fold every lane into h.
	mergeRound(AX, R8)
	mergeRound(AX, R9)
	mergeRound(AX, R10)
	mergeRound(AX, R11)

	JMP tail

shortInput:
	// Without a full block the hash starts from prime5v.
	MOVQ ·prime5v(SB), AX

tail:
	// Mix in the total input length.
	ADDQ DX, AX

	// BX currently marks len(b)-32; move it to len(b)-8 so the next loop
	// runs while a whole 8-byte word is still available (CX <= BX).
	ADDQ $24, BX

	CMPQ CX, BX
	JG   tail4

tail8Loop:
	// k1 = rol64(word*prime2v, 31) * prime1v
	MOVQ  (CX), R8
	IMULQ R14, R8
	ROLQ  $31, R8
	IMULQ R13, R8

	// h = rol64(h ^ k1, 27)*prime1v + prime4v
	XORQ  R8, AX
	ROLQ  $27, AX
	IMULQ R13, AX
	ADDQ  R15, AX

	ADDQ $8, CX
	CMPQ CX, BX
	JLE  tail8Loop

tail4:
	// BX now marks len(b)-4; skip ahead unless a whole 4-byte word remains.
	ADDQ $4, BX
	CMPQ CX, BX
	JG   tail1

	// h = rol64(h ^ word32*prime1v, 23)*prime2v + prime3v
	MOVL  (CX), R8
	IMULQ R13, R8
	XORQ  R8, AX
	ROLQ  $23, AX
	IMULQ R14, AX
	ADDQ  ·prime3v(SB), AX
	ADDQ  $4, CX

tail1:
	// BX now marks the end of b; nothing to do if the cursor is already there.
	ADDQ $4, BX
	CMPQ CX, BX
	JGE  avalanche

tail1Loop:
	// h = rol64(h ^ byte*prime5v, 11) * prime1v
	MOVBQZX (CX), R12
	IMULQ   ·prime5v(SB), R12
	XORQ    R12, AX
	ROLQ    $11, AX
	IMULQ   R13, AX

	ADDQ $1, CX
	CMPQ CX, BX
	JL   tail1Loop

avalanche:
	// h ^= h >> 33; h *= prime2v
	MOVQ  AX, R12
	SHRQ  $33, R12
	XORQ  R12, AX
	IMULQ R14, AX

	// h ^= h >> 29; h *= prime3v
	MOVQ  AX, R12
	SHRQ  $29, R12
	XORQ  R12, AX
	IMULQ ·prime3v(SB), AX

	// h ^= h >> 32
	MOVQ AX, R12
	SHRQ $32, R12
	XORQ R12, AX

	MOVQ AX, ret+24(FP)
	RET

// writeBlocks follows the register plan at the top of the file with one
// change: AX carries the x pointer rather than a hash value.

// func writeBlocks(x *xxh, b []byte) []byte
TEXT ·writeBlocks(SB), NOSPLIT, $0-56
	// CX = start of b. Provisionally return that same base pointer; it is
	// replaced at the end only if the returned slice is non-empty.
	MOVQ b_base+8(FP), CX
	MOVQ CX, ret_base+32(FP)

	// DX = len(b); BX = start + len(b) - 32, the last address at which a
	// full 32-byte block can begin.
	MOVQ b_len+16(FP), DX
	LEAQ -32(CX)(DX*1), BX

	// round only needs prime1v and prime2v.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14

	// Bring in the four lanes stored at the start of *x.
	MOVQ x+0(FP), AX
	MOVQ 0(AX), R8   // v1
	MOVQ 8(AX), R9   // v2
	MOVQ 16(AX), R10 // v3
	MOVQ 24(AX), R11 // v4

	// The loop is entered without a length check: callers are expected to
	// pass at least one full block.
stripeLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ CX, BX
	JLE  stripeLoop

	// Persist the updated lanes back into *x.
	MOVQ R8, 0(AX)
	MOVQ R9, 8(AX)
	MOVQ R10, 16(AX)
	MOVQ R11, 24(AX)

	// Build the returned slice from whatever was not consumed.
	// NOTE: the base pointer must never point past the end of the original
	// slice; with Go 1.7+ that can crash the runtime (see, for example,
	// https://github.com/golang/go/issues/16772). So the length/cap is
	// computed first, and when it is zero the original base is kept. The
	// compiler does the same for code such as
	//   b = b[len(b):]

	// Remaining length = (BX + 32) - CX, where BX + 32 is the end of b.
	ADDQ $32, BX
	SUBQ CX, BX
	JZ   storeLen

	MOVQ CX, ret_base+32(FP)

storeLen:
	MOVQ BX, ret_len+40(FP)
	MOVQ BX, ret_cap+48(FP) // cap == len

	RET