// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".

// func decode(dst, src []byte) int
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The register allocation:
//	- AX	scratch
//	- BX	scratch
//	- CX	length or x
//	- DX	offset
//	- SI	&src[s]
//	- DI	&dst[d]
//	+ R8	dst_base
//	+ R9	dst_len
//	+ R10	dst_base + dst_len
//	+ R11	src_base
//	+ R12	src_len
//	+ R13	src_base + src_len
//	- R14	used by doCopy
//	- R15	used by doCopy
//
// The registers R8-R13 (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly DI - R8, and len(dst)-d is R10 - DI.
// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
TEXT ·decode(SB), NOSPLIT, $48-56
	// Initialize SI, DI and R8-R13 from the dst/src slice headers on the
	// argument frame (base at +0/+24, len at +8/+32; caps are unused).
	MOVQ dst_base+0(FP), R8
	MOVQ dst_len+8(FP), R9
	MOVQ R8, DI
	MOVQ R8, R10
	ADDQ R9, R10
	MOVQ src_base+24(FP), R11
	MOVQ src_len+32(FP), R12
	MOVQ R11, SI
	MOVQ R11, R13
	ADDQ R12, R13

loop:
	// for s < len(src)
	//
	// NOTE(review): a JEQ (rather than JAE) suffices here, presumably
	// because every path that advances SI past R13 jumps to errCorrupt
	// before returning to loop — confirm against the tag handlers below.
	CMPQ SI, R13
	JEQ  end

	// CX = uint32(src[s])
	//
	// switch src[s] & 0x03
	MOVBLZX (SI), CX
	MOVL    CX, BX
	ANDL    $3, BX
	CMPL    BX, $1
	JAE     tagCopy

	// ----------------------------------------
	// The code below handles literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	// switch
	SHRL $2, CX
	CMPL CX, $60
	JAE  tagLit60Plus

	// case x < 60:
	// s++
	INCQ SI

doLit:
	// This is the end of the inner "switch", when we have a literal tag.
	//
	// We assume that CX == x and x fits in a uint32, where x is the variable
	// used in the pure Go decode_other.go code.

	// length = int(x) + 1
	//
	// Unlike the pure Go code, we don't need to check if length <= 0 because
	// CX can hold 64 bits, so the increment cannot overflow.
	INCQ CX

	// Prepare to check if copying length bytes will run past the end of dst or
	// src.
	//
	// AX = len(dst) - d
	// BX = len(src) - s
	MOVQ R10, AX
	SUBQ DI, AX
	MOVQ R13, BX
	SUBQ SI, BX

	// !!! Try a faster technique for short (16 or fewer bytes) copies.
	//
	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
	//   goto callMemmove // Fall back on calling runtime·memmove.
	// }
	//
	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
	// against 21 instead of 16, because it cannot assume that all of its input
	// is contiguous in memory and so it needs to leave enough source bytes to
	// read the next tag without refilling buffers, but Go's Decode assumes
	// contiguousness (the src argument is a []byte).
	CMPQ CX, $16
	JGT  callMemmove
	CMPQ AX, $16
	JLT  callMemmove
	CMPQ BX, $16
	JLT  callMemmove

	// !!! Implement the copy from src to dst as a 16-byte load and store.
	// (Decode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only length bytes, but that's
	// OK. If the input is a valid Snappy encoding then subsequent iterations
	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
	// non-nil error), so the overrun will be ignored.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(SI), X0
	MOVOU X0, 0(DI)

	// d += length
	// s += length
	ADDQ CX, DI
	ADDQ CX, SI
	JMP  loop

callMemmove:
	// if length > len(dst)-d || length > len(src)-s { etc }
	CMPQ CX, AX
	JGT  errCorrupt
	CMPQ CX, BX
	JGT  errCorrupt

	// copy(dst[d:], src[s:s+length])
	//
	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
	// DI, SI and CX as arguments (stack-based calling convention; the three
	// words at 0/8/16(SP) are memmove's dst/src/len). Coincidentally, we also
	// need to spill those three registers to the stack, to save local
	// variables across the CALL.
	MOVQ DI, 0(SP)
	MOVQ SI, 8(SP)
	MOVQ CX, 16(SP)
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP)
	CALL runtime·memmove(SB)

	// Restore local variables: unspill registers from the stack and
	// re-calculate R8-R13 (the CALL may have clobbered them, so they are
	// rebuilt from the argument frame, per the register contract above).
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX
	MOVQ dst_base+0(FP), R8
	MOVQ dst_len+8(FP), R9
	MOVQ R8, R10
	ADDQ R9, R10
	MOVQ src_base+24(FP), R11
	MOVQ src_len+32(FP), R12
	MOVQ R11, R13
	ADDQ R12, R13

	// d += length
	// s += length
	ADDQ CX, DI
	ADDQ CX, SI
	JMP  loop

tagLit60Plus:
	// !!! This fragment does the
	//
	// s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// checks. In the asm version, we code it once instead of once per switch case.
	ADDQ CX, SI
	SUBQ $58, SI
	CMPQ SI, R13
	JA   errCorrupt

	// case x == 60:
	CMPL CX, $61
	JEQ  tagLit61
	JA   tagLit62Plus

	// x = uint32(src[s-1])
	MOVBLZX -1(SI), CX
	JMP     doLit

tagLit61:
	// case x == 61:
	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
	MOVWLZX -2(SI), CX
	JMP     doLit

tagLit62Plus:
	CMPL CX, $62
	JA   tagLit63

	// case x == 62:
	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
	MOVWLZX -3(SI), CX
	MOVBLZX -1(SI), BX
	SHLL    $16, BX
	ORL     BX, CX
	JMP     doLit

tagLit63:
	// case x == 63:
	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
	MOVL -4(SI), CX
	JMP  doLit

// The code above handles literal tags.
// ----------------------------------------
// The code below handles copy tags.

tagCopy4:
	// case tagCopy4:
	// s += 5
	ADDQ $5, SI

	// if uint(s) > uint(len(src)) { etc }
	CMPQ SI, R13
	JA   errCorrupt

	// length = 1 + int(src[s-5])>>2
	//
	// (CX still holds src[s-5], the tag byte, on entry from tagCopy.)
	SHRQ $2, CX
	INCQ CX

	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
	MOVLQZX -4(SI), DX
	JMP     doCopy

tagCopy2:
	// case tagCopy2:
	// s += 3
	ADDQ $3, SI

	// if uint(s) > uint(len(src)) { etc }
	CMPQ SI, R13
	JA   errCorrupt

	// length = 1 + int(src[s-3])>>2
	SHRQ $2, CX
	INCQ CX

	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
	MOVWQZX -2(SI), DX
	JMP     doCopy

tagCopy:
	// We have a copy tag. We assume that:
	//	- BX == src[s] & 0x03
	//	- CX == src[s]
	CMPQ BX, $2
	JEQ  tagCopy2
	JA   tagCopy4

	// case tagCopy1:
	// s += 2
	ADDQ $2, SI

	// if uint(s) > uint(len(src)) { etc }
	CMPQ SI, R13
	JA   errCorrupt

	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
	MOVQ    CX, DX
	ANDQ    $0xe0, DX
	SHLQ    $3, DX
	MOVBQZX -1(SI), BX
	ORQ     BX, DX

	// length = 4 + int(src[s-2])>>2&0x7
	SHRQ $2, CX
	ANDQ $7, CX
	ADDQ $4, CX

doCopy:
	// This is the end of the outer "switch", when we have a copy tag.
	//
	// We assume that:
	//	- CX == length && CX > 0
	//	- DX == offset

	// if offset <= 0 { etc }
	CMPQ DX, $0
	JLE  errCorrupt

	// if d < offset { etc }
	MOVQ DI, BX
	SUBQ R8, BX
	CMPQ BX, DX
	JLT  errCorrupt

	// if length > len(dst)-d { etc }
	MOVQ R10, BX
	SUBQ DI, BX
	CMPQ CX, BX
	JGT  errCorrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// Set:
	//	- R14 = len(dst)-d
	//	- R15 = &dst[d-offset]
	MOVQ R10, R14
	SUBQ DI, R14
	MOVQ DI, R15
	SUBQ DX, R15

	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
	//
	// First, try using two 8-byte load/stores, similar to the doLit technique
	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
	// and not one 16-byte load/store, and the first store has to be before the
	// second load, due to the overlap if offset is in the range [8, 16).
	//
	// if length > 16 || offset < 8 || len(dst)-d < 16 {
	//   goto slowForwardCopy
	// }
	// copy 16 bytes
	// d += length
	CMPQ CX, $16
	JGT  slowForwardCopy
	CMPQ DX, $8
	JLT  slowForwardCopy
	CMPQ R14, $16
	JLT  slowForwardCopy
	MOVQ 0(R15), AX
	MOVQ AX, 0(DI)
	MOVQ 8(R15), BX
	MOVQ BX, 8(DI)
	ADDQ CX, DI
	JMP  loop

slowForwardCopy:
	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
	// can still try 8-byte load stores, provided we can overrun up to 10 extra
	// bytes. As above, the overrun will be fixed up by subsequent iterations
	// of the outermost loop.
	//
	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
	// commentary says:
	//
	// ----
	//
	// The main part of this loop is a simple copy of eight bytes at a time
	// until we've copied (at least) the requested amount of bytes. However,
	// if d and d-offset are less than eight bytes apart (indicating a
	// repeating pattern of length < 8), we first need to expand the pattern in
	// order to get the correct results. For instance, if the buffer looks like
	// this, with the eight-byte <d-offset> and <d> patterns marked as
	// intervals:
	//
	//	abxxxxxxxxxxxx
	//	[------]           d-offset
	//	  [------]         d
	//
	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
	// once, after which we can move <d> two bytes without moving <d-offset>:
	//
	//	ababxxxxxxxxxx
	//	[------]           d-offset
	//	    [------]       d
	//
	// and repeat the exercise until the two no longer overlap.
	//
	// This allows us to do very well in the special case of one single byte
	// repeated many times, without taking a big hit for more general cases.
	//
	// The worst case of extra writing past the end of the match occurs when
	// offset == 1 and length == 1; the last copy will read from byte positions
	// [0..7] and write to [4..11], whereas it was only supposed to write to
	// position 1. Thus, ten excess bytes.
	//
	// ----
	//
	// That "10 byte overrun" worst case is confirmed by Go's
	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
	// and finishSlowForwardCopy algorithm.
	//
	// if length > len(dst)-d-10 {
	//   goto verySlowForwardCopy
	// }
	SUBQ $10, R14
	CMPQ CX, R14
	JGT  verySlowForwardCopy

makeOffsetAtLeast8:
	// !!! As above, expand the pattern so that offset >= 8 and we can use
	// 8-byte load/stores.
	//
	// for offset < 8 {
	//   copy 8 bytes from dst[d-offset:] to dst[d:]
	//   length -= offset
	//   d      += offset
	//   offset += offset
	//   // The two previous lines together means that d-offset, and therefore
	//   // R15, is unchanged.
	// }
	CMPQ DX, $8
	JGE  fixUpSlowForwardCopy
	MOVQ (R15), BX
	MOVQ BX, (DI)
	SUBQ DX, CX
	ADDQ DX, DI
	ADDQ DX, DX
	JMP  makeOffsetAtLeast8

fixUpSlowForwardCopy:
	// !!! Add length (which might be negative now) to d (implied by DI being
	// &dst[d]) so that d ends up at the right place when we jump back to the
	// top of the loop. Before we do that, though, we save DI to AX so that, if
	// length is positive, copying the remaining length bytes will write to the
	// right place.
	MOVQ DI, AX
	ADDQ CX, DI

finishSlowForwardCopy:
	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
	// length means that we overrun, but as above, that will be fixed up by
	// subsequent iterations of the outermost loop.
	CMPQ CX, $0
	JLE  loop
	MOVQ (R15), BX
	MOVQ BX, (AX)
	ADDQ $8, R15
	ADDQ $8, AX
	SUBQ $8, CX
	JMP  finishSlowForwardCopy

verySlowForwardCopy:
	// verySlowForwardCopy is a simple implementation of forward copy. In C
	// parlance, this is a do/while loop instead of a while loop, since we know
	// that length > 0. In Go syntax:
	//
	// for {
	//   dst[d] = dst[d - offset]
	//   d++
	//   length--
	//   if length == 0 {
	//     break
	//   }
	// }
	MOVB (R15), BX
	MOVB BX, (DI)
	INCQ R15
	INCQ DI
	DECQ CX
	JNZ  verySlowForwardCopy
	JMP  loop

// The code above handles copy tags.
// ----------------------------------------

end:
	// This is the end of the "for s < len(src)".
	//
	// if d != len(dst) { etc }
	CMPQ DI, R10
	JNE  errCorrupt

	// return 0
	MOVQ $0, ret+48(FP)
	RET

errCorrupt:
	// return decodeErrCodeCorrupt
	MOVQ $1, ret+48(FP)
	RET