1// Copyright 2016 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// +build !appengine 6// +build gc 7// +build !noasm 8 9#include "textflag.h" 10 11// The asm code generally follows the pure Go code in decode_other.go, except 12// where marked with a "!!!". 13 14// func decode(dst, src []byte) int 15// 16// All local variables fit into registers. The non-zero stack size is only to 17// spill registers and push args when issuing a CALL. The register allocation: 18// - AX scratch 19// - BX scratch 20// - CX length or x 21// - DX offset 22// - SI &src[s] 23// - DI &dst[d] 24// + R8 dst_base 25// + R9 dst_len 26// + R10 dst_base + dst_len 27// + R11 src_base 28// + R12 src_len 29// + R13 src_base + src_len 30// - R14 used by doCopy 31// - R15 used by doCopy 32// 33// The registers R8-R13 (marked with a "+") are set at the start of the 34// function, and after a CALL returns, and are not otherwise modified. 35// 36// The d variable is implicitly DI - R8, and len(dst)-d is R10 - DI. 37// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI. 38TEXT ·decode(SB), NOSPLIT, $48-56 39 // Initialize SI, DI and R8-R13. 40 MOVQ dst_base+0(FP), R8 41 MOVQ dst_len+8(FP), R9 42 MOVQ R8, DI 43 MOVQ R8, R10 44 ADDQ R9, R10 45 MOVQ src_base+24(FP), R11 46 MOVQ src_len+32(FP), R12 47 MOVQ R11, SI 48 MOVQ R11, R13 49 ADDQ R12, R13 50 51loop: 52 // for s < len(src) 53 CMPQ SI, R13 54 JEQ end 55 56 // CX = uint32(src[s]) 57 // 58 // switch src[s] & 0x03 59 MOVBLZX (SI), CX 60 MOVL CX, BX 61 ANDL $3, BX 62 CMPL BX, $1 63 JAE tagCopy 64 65 // ---------------------------------------- 66 // The code below handles literal tags. 67 68 // case tagLiteral: 69 // x := uint32(src[s] >> 2) 70 // switch 71 SHRL $2, CX 72 CMPL CX, $60 73 JAE tagLit60Plus 74 75 // case x < 60: 76 // s++ 77 INCQ SI 78 79doLit: 80 // This is the end of the inner "switch", when we have a literal tag. 81 // 82 // We assume that CX == x and x fits in a uint32, where x is the variable 83 // used in the pure Go decode_other.go code. 84 85 // length = int(x) + 1 86 // 87 // Unlike the pure Go code, we don't need to check if length <= 0 because 88 // CX can hold 64 bits, so the increment cannot overflow. 89 INCQ CX 90 91 // Prepare to check if copying length bytes will run past the end of dst or 92 // src. 93 // 94 // AX = len(dst) - d 95 // BX = len(src) - s 96 MOVQ R10, AX 97 SUBQ DI, AX 98 MOVQ R13, BX 99 SUBQ SI, BX 100 101 // !!! Try a faster technique for short (16 or fewer bytes) copies. 102 // 103 // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { 104 // goto callMemmove // Fall back on calling runtime·memmove. 105 // } 106 // 107 // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s 108 // against 21 instead of 16, because it cannot assume that all of its input 109 // is contiguous in memory and so it needs to leave enough source bytes to 110 // read the next tag without refilling buffers, but Go's Decode assumes 111 // contiguousness (the src argument is a []byte). 112 CMPQ CX, $16 113 JGT callMemmove 114 CMPQ AX, $16 115 JLT callMemmove 116 CMPQ BX, $16 117 JLT callMemmove 118 119 // !!! Implement the copy from src to dst as a 16-byte load and store. 120 // (Decode's documentation says that dst and src must not overlap.) 121 // 122 // This always copies 16 bytes, instead of only length bytes, but that's 123 // OK. If the input is a valid Snappy encoding then subsequent iterations 124 // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a 125 // non-nil error), so the overrun will be ignored. 126 // 127 // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or 128 // 16-byte loads and stores. This technique probably wouldn't be as 129 // effective on architectures that are fussier about alignment. 130 MOVOU 0(SI), X0 131 MOVOU X0, 0(DI) 132 133 // d += length 134 // s += length 135 ADDQ CX, DI 136 ADDQ CX, SI 137 JMP loop 138 139callMemmove: 140 // if length > len(dst)-d || length > len(src)-s { etc } 141 CMPQ CX, AX 142 JGT errCorrupt 143 CMPQ CX, BX 144 JGT errCorrupt 145 146 // copy(dst[d:], src[s:s+length]) 147 // 148 // This means calling runtime·memmove(&dst[d], &src[s], length), so we push 149 // DI, SI and CX as arguments. Coincidentally, we also need to spill those 150 // three registers to the stack, to save local variables across the CALL. 151 MOVQ DI, 0(SP) 152 MOVQ SI, 8(SP) 153 MOVQ CX, 16(SP) 154 MOVQ DI, 24(SP) 155 MOVQ SI, 32(SP) 156 MOVQ CX, 40(SP) 157 CALL runtime·memmove(SB) 158 159 // Restore local variables: unspill registers from the stack and 160 // re-calculate R8-R13. 161 MOVQ 24(SP), DI 162 MOVQ 32(SP), SI 163 MOVQ 40(SP), CX 164 MOVQ dst_base+0(FP), R8 165 MOVQ dst_len+8(FP), R9 166 MOVQ R8, R10 167 ADDQ R9, R10 168 MOVQ src_base+24(FP), R11 169 MOVQ src_len+32(FP), R12 170 MOVQ R11, R13 171 ADDQ R12, R13 172 173 // d += length 174 // s += length 175 ADDQ CX, DI 176 ADDQ CX, SI 177 JMP loop 178 179tagLit60Plus: 180 // !!! This fragment does the 181 // 182 // s += x - 58; if uint(s) > uint(len(src)) { etc } 183 // 184 // checks. In the asm version, we code it once instead of once per switch case. 185 ADDQ CX, SI 186 SUBQ $58, SI 187 MOVQ SI, BX 188 SUBQ R11, BX 189 CMPQ BX, R12 190 JA errCorrupt 191 192 // case x == 60: 193 CMPL CX, $61 194 JEQ tagLit61 195 JA tagLit62Plus 196 197 // x = uint32(src[s-1]) 198 MOVBLZX -1(SI), CX 199 JMP doLit 200 201tagLit61: 202 // case x == 61: 203 // x = uint32(src[s-2]) | uint32(src[s-1])<<8 204 MOVWLZX -2(SI), CX 205 JMP doLit 206 207tagLit62Plus: 208 CMPL CX, $62 209 JA tagLit63 210 211 // case x == 62: 212 // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 213 MOVWLZX -3(SI), CX 214 MOVBLZX -1(SI), BX 215 SHLL $16, BX 216 ORL BX, CX 217 JMP doLit 218 219tagLit63: 220 // case x == 63: 221 // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 222 MOVL -4(SI), CX 223 JMP doLit 224 225// The code above handles literal tags. 226// ---------------------------------------- 227// The code below handles copy tags. 228 229tagCopy4: 230 // case tagCopy4: 231 // s += 5 232 ADDQ $5, SI 233 234 // if uint(s) > uint(len(src)) { etc } 235 MOVQ SI, BX 236 SUBQ R11, BX 237 CMPQ BX, R12 238 JA errCorrupt 239 240 // length = 1 + int(src[s-5])>>2 241 SHRQ $2, CX 242 INCQ CX 243 244 // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) 245 MOVLQZX -4(SI), DX 246 JMP doCopy 247 248tagCopy2: 249 // case tagCopy2: 250 // s += 3 251 ADDQ $3, SI 252 253 // if uint(s) > uint(len(src)) { etc } 254 MOVQ SI, BX 255 SUBQ R11, BX 256 CMPQ BX, R12 257 JA errCorrupt 258 259 // length = 1 + int(src[s-3])>>2 260 SHRQ $2, CX 261 INCQ CX 262 263 // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) 264 MOVWQZX -2(SI), DX 265 JMP doCopy 266 267tagCopy: 268 // We have a copy tag. We assume that: 269 // - BX == src[s] & 0x03 270 // - CX == src[s] 271 CMPQ BX, $2 272 JEQ tagCopy2 273 JA tagCopy4 274 275 // case tagCopy1: 276 // s += 2 277 ADDQ $2, SI 278 279 // if uint(s) > uint(len(src)) { etc } 280 MOVQ SI, BX 281 SUBQ R11, BX 282 CMPQ BX, R12 283 JA errCorrupt 284 285 // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) 286 MOVQ CX, DX 287 ANDQ $0xe0, DX 288 SHLQ $3, DX 289 MOVBQZX -1(SI), BX 290 ORQ BX, DX 291 292 // length = 4 + int(src[s-2])>>2&0x7 293 SHRQ $2, CX 294 ANDQ $7, CX 295 ADDQ $4, CX 296 297doCopy: 298 // This is the end of the outer "switch", when we have a copy tag. 299 // 300 // We assume that: 301 // - CX == length && CX > 0 302 // - DX == offset 303 304 // if offset <= 0 { etc } 305 CMPQ DX, $0 306 JLE errCorrupt 307 308 // if d < offset { etc } 309 MOVQ DI, BX 310 SUBQ R8, BX 311 CMPQ BX, DX 312 JLT errCorrupt 313 314 // if length > len(dst)-d { etc } 315 MOVQ R10, BX 316 SUBQ DI, BX 317 CMPQ CX, BX 318 JGT errCorrupt 319 320 // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length 321 // 322 // Set: 323 // - R14 = len(dst)-d 324 // - R15 = &dst[d-offset] 325 MOVQ R10, R14 326 SUBQ DI, R14 327 MOVQ DI, R15 328 SUBQ DX, R15 329 330 // !!! Try a faster technique for short (16 or fewer bytes) forward copies. 331 // 332 // First, try using two 8-byte load/stores, similar to the doLit technique 333 // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is 334 // still OK if offset >= 8. Note that this has to be two 8-byte load/stores 335 // and not one 16-byte load/store, and the first store has to be before the 336 // second load, due to the overlap if offset is in the range [8, 16). 337 // 338 // if length > 16 || offset < 8 || len(dst)-d < 16 { 339 // goto slowForwardCopy 340 // } 341 // copy 16 bytes 342 // d += length 343 CMPQ CX, $16 344 JGT slowForwardCopy 345 CMPQ DX, $8 346 JLT slowForwardCopy 347 CMPQ R14, $16 348 JLT slowForwardCopy 349 MOVQ 0(R15), AX 350 MOVQ AX, 0(DI) 351 MOVQ 8(R15), BX 352 MOVQ BX, 8(DI) 353 ADDQ CX, DI 354 JMP loop 355 356slowForwardCopy: 357 // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we 358 // can still try 8-byte load stores, provided we can overrun up to 10 extra 359 // bytes. As above, the overrun will be fixed up by subsequent iterations 360 // of the outermost loop. 361 // 362 // The C++ snappy code calls this technique IncrementalCopyFastPath. Its 363 // commentary says: 364 // 365 // ---- 366 // 367 // The main part of this loop is a simple copy of eight bytes at a time 368 // until we've copied (at least) the requested amount of bytes. However, 369 // if d and d-offset are less than eight bytes apart (indicating a 370 // repeating pattern of length < 8), we first need to expand the pattern in 371 // order to get the correct results. For instance, if the buffer looks like 372 // this, with the eight-byte <d-offset> and <d> patterns marked as 373 // intervals: 374 // 375 // abxxxxxxxxxxxx 376 // [------] d-offset 377 // [------] d 378 // 379 // a single eight-byte copy from <d-offset> to <d> will repeat the pattern 380 // once, after which we can move <d> two bytes without moving <d-offset>: 381 // 382 // ababxxxxxxxxxx 383 // [------] d-offset 384 // [------] d 385 // 386 // and repeat the exercise until the two no longer overlap. 387 // 388 // This allows us to do very well in the special case of one single byte 389 // repeated many times, without taking a big hit for more general cases. 390 // 391 // The worst case of extra writing past the end of the match occurs when 392 // offset == 1 and length == 1; the last copy will read from byte positions 393 // [0..7] and write to [4..11], whereas it was only supposed to write to 394 // position 1. Thus, ten excess bytes. 395 // 396 // ---- 397 // 398 // That "10 byte overrun" worst case is confirmed by Go's 399 // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy 400 // and finishSlowForwardCopy algorithm. 401 // 402 // if length > len(dst)-d-10 { 403 // goto verySlowForwardCopy 404 // } 405 SUBQ $10, R14 406 CMPQ CX, R14 407 JGT verySlowForwardCopy 408 409makeOffsetAtLeast8: 410 // !!! As above, expand the pattern so that offset >= 8 and we can use 411 // 8-byte load/stores. 412 // 413 // for offset < 8 { 414 // copy 8 bytes from dst[d-offset:] to dst[d:] 415 // length -= offset 416 // d += offset 417 // offset += offset 418 // // The two previous lines together means that d-offset, and therefore 419 // // R15, is unchanged. 420 // } 421 CMPQ DX, $8 422 JGE fixUpSlowForwardCopy 423 MOVQ (R15), BX 424 MOVQ BX, (DI) 425 SUBQ DX, CX 426 ADDQ DX, DI 427 ADDQ DX, DX 428 JMP makeOffsetAtLeast8 429 430fixUpSlowForwardCopy: 431 // !!! Add length (which might be negative now) to d (implied by DI being 432 // &dst[d]) so that d ends up at the right place when we jump back to the 433 // top of the loop. Before we do that, though, we save DI to AX so that, if 434 // length is positive, copying the remaining length bytes will write to the 435 // right place. 436 MOVQ DI, AX 437 ADDQ CX, DI 438 439finishSlowForwardCopy: 440 // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative 441 // length means that we overrun, but as above, that will be fixed up by 442 // subsequent iterations of the outermost loop. 443 CMPQ CX, $0 444 JLE loop 445 MOVQ (R15), BX 446 MOVQ BX, (AX) 447 ADDQ $8, R15 448 ADDQ $8, AX 449 SUBQ $8, CX 450 JMP finishSlowForwardCopy 451 452verySlowForwardCopy: 453 // verySlowForwardCopy is a simple implementation of forward copy. In C 454 // parlance, this is a do/while loop instead of a while loop, since we know 455 // that length > 0. In Go syntax: 456 // 457 // for { 458 // dst[d] = dst[d - offset] 459 // d++ 460 // length-- 461 // if length == 0 { 462 // break 463 // } 464 // } 465 MOVB (R15), BX 466 MOVB BX, (DI) 467 INCQ R15 468 INCQ DI 469 DECQ CX 470 JNZ verySlowForwardCopy 471 JMP loop 472 473// The code above handles copy tags. 474// ---------------------------------------- 475 476end: 477 // This is the end of the "for s < len(src)". 478 // 479 // if d != len(dst) { etc } 480 CMPQ DI, R10 481 JNE errCorrupt 482 483 // return 0 484 MOVQ $0, ret+48(FP) 485 RET 486 487errCorrupt: 488 // return decodeErrCodeCorrupt 489 MOVQ $1, ret+48(FP) 490 RET 491