1 /* 2 * Copyright (c) Facebook, Inc. and its affiliates. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #pragma once 18 19 #include <cstdint> 20 #include <limits> 21 22 #include <glog/logging.h> 23 24 #include <folly/Portability.h> 25 #include <folly/Range.h> 26 #include <folly/detail/GroupVarintDetail.h> 27 #include <folly/lang/Bits.h> 28 #include <folly/portability/Builtins.h> 29 30 #if !defined(__GNUC__) && !defined(_MSC_VER) 31 #error GroupVarint.h requires GCC or MSVC 32 #endif 33 34 #if FOLLY_X64 || defined(__i386__) || FOLLY_PPC64 || FOLLY_AARCH64 35 #define FOLLY_HAVE_GROUP_VARINT 1 36 #else 37 #define FOLLY_HAVE_GROUP_VARINT 0 38 #endif 39 40 #if FOLLY_HAVE_GROUP_VARINT 41 42 #if FOLLY_SSE >= 4 43 #include <nmmintrin.h> 44 namespace folly { 45 namespace detail { 46 extern const std::array<std::array<std::uint32_t, 4>, 256> groupVarintSSEMasks; 47 } // namespace detail 48 } // namespace folly 49 #endif 50 51 namespace folly { 52 namespace detail { 53 extern const std::array<std::uint8_t, 256> groupVarintLengths; 54 } // namespace detail 55 } // namespace folly 56 57 namespace folly { 58 59 template <typename T> 60 class GroupVarint; 61 62 /** 63 * GroupVarint encoding for 32-bit values. 64 * 65 * Encodes 4 32-bit integers at once, each using 1-4 bytes depending on size. 66 * There is one byte of overhead. (The first byte contains the lengths of 67 * the four integers encoded as two bits each; 00=1 byte .. 11=4 bytes) 68 * 69 * This implementation assumes little-endian and does unaligned 32-bit 70 * accesses, so it's basically not portable outside of the x86[_64] world. 71 */ 72 template <> 73 class GroupVarint<uint32_t> : public detail::GroupVarintBase<uint32_t> { 74 public: 75 /** 76 * Return the number of bytes used to encode these four values. 77 */ size(uint32_t a,uint32_t b,uint32_t c,uint32_t d)78 static size_t size(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { 79 return kHeaderSize + kGroupSize + key(a) + key(b) + key(c) + key(d); 80 } 81 82 /** 83 * Return the number of bytes used to encode four uint32_t values stored 84 * at consecutive positions in an array. 85 */ size(const uint32_t * p)86 static size_t size(const uint32_t* p) { return size(p[0], p[1], p[2], p[3]); } 87 88 /** 89 * Return the number of bytes used to encode count (<= 4) values. 90 * If you clip a buffer after these many bytes, you can still decode 91 * the first "count" values correctly (if the remaining size() - 92 * partialSize() bytes are filled with garbage). 93 */ partialSize(const type * p,size_t count)94 static size_t partialSize(const type* p, size_t count) { 95 DCHECK_LE(count, kGroupSize); 96 size_t s = kHeaderSize + count; 97 for (; count; --count, ++p) { 98 s += key(*p); 99 } 100 return s; 101 } 102 103 /** 104 * Return the number of values from *p that are valid from an encoded 105 * buffer of size bytes. 106 */ partialCount(const char * p,size_t size)107 static size_t partialCount(const char* p, size_t size) { 108 uint8_t v = uint8_t(*p); 109 size_t s = kHeaderSize; 110 s += 1 + b0key(v); 111 if (s > size) { 112 return 0; 113 } 114 s += 1 + b1key(v); 115 if (s > size) { 116 return 1; 117 } 118 s += 1 + b2key(v); 119 if (s > size) { 120 return 2; 121 } 122 s += 1 + b3key(v); 123 if (s > size) { 124 return 3; 125 } 126 return 4; 127 } 128 129 /** 130 * Given a pointer to the beginning of an GroupVarint32-encoded block, 131 * return the number of bytes used by the encoding. 132 */ encodedSize(const char * p)133 static size_t encodedSize(const char* p) { 134 return kHeaderSize + kGroupSize + b0key(uint8_t(*p)) + b1key(uint8_t(*p)) + 135 b2key(uint8_t(*p)) + b3key(uint8_t(*p)); 136 } 137 138 /** 139 * Encode four uint32_t values into the buffer pointed-to by p, and return 140 * the next position in the buffer (that is, one character past the last 141 * encoded byte). p needs to have at least size()+4 bytes available. 142 */ encode(char * p,uint32_t a,uint32_t b,uint32_t c,uint32_t d)143 static char* encode(char* p, uint32_t a, uint32_t b, uint32_t c, uint32_t d) { 144 uint8_t b0key = key(a); 145 uint8_t b1key = key(b); 146 uint8_t b2key = key(c); 147 uint8_t b3key = key(d); 148 *p++ = (b3key << 6) | (b2key << 4) | (b1key << 2) | b0key; 149 storeUnaligned(p, a); 150 p += b0key + 1; 151 storeUnaligned(p, b); 152 p += b1key + 1; 153 storeUnaligned(p, c); 154 p += b2key + 1; 155 storeUnaligned(p, d); 156 p += b3key + 1; 157 return p; 158 } 159 160 /** 161 * Encode four uint32_t values from the array pointed-to by src into the 162 * buffer pointed-to by p, similar to encode(p,a,b,c,d) above. 163 */ encode(char * p,const uint32_t * src)164 static char* encode(char* p, const uint32_t* src) { 165 return encode(p, src[0], src[1], src[2], src[3]); 166 } 167 168 /** 169 * Decode four uint32_t values from a buffer, and return the next position 170 * in the buffer (that is, one character past the last encoded byte). 171 * The buffer needs to have at least 3 extra bytes available (they 172 * may be read but ignored). 173 */ decode_simple(const char * p,uint32_t * a,uint32_t * b,uint32_t * c,uint32_t * d)174 static const char* decode_simple( 175 const char* p, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) { 176 size_t k = loadUnaligned<uint8_t>(p); 177 const char* end = p + detail::groupVarintLengths[k]; 178 ++p; 179 size_t k0 = b0key(k); 180 *a = loadUnaligned<uint32_t>(p) & kMask[k0]; 181 p += k0 + 1; 182 size_t k1 = b1key(k); 183 *b = loadUnaligned<uint32_t>(p) & kMask[k1]; 184 p += k1 + 1; 185 size_t k2 = b2key(k); 186 *c = loadUnaligned<uint32_t>(p) & kMask[k2]; 187 p += k2 + 1; 188 size_t k3 = b3key(k); 189 *d = loadUnaligned<uint32_t>(p) & kMask[k3]; 190 // p += k3+1; 191 return end; 192 } 193 194 /** 195 * Decode four uint32_t values from a buffer and store them in the array 196 * pointed-to by dest, similar to decode(p,a,b,c,d) above. 197 */ decode_simple(const char * p,uint32_t * dest)198 static const char* decode_simple(const char* p, uint32_t* dest) { 199 return decode_simple(p, dest, dest + 1, dest + 2, dest + 3); 200 } 201 202 #if FOLLY_SSE >= 4 203 /** 204 * Just like the non-SSSE3 decode below, but with the additional constraint 205 * that we must be able to read at least 17 bytes from the input pointer, p. 206 */ decode(const char * p,uint32_t * dest)207 static const char* decode(const char* p, uint32_t* dest) { 208 uint8_t key = uint8_t(p[0]); 209 __m128i val = _mm_loadu_si128((const __m128i*)(p + 1)); 210 __m128i mask = 211 _mm_load_si128((const __m128i*)detail::groupVarintSSEMasks[key].data()); 212 __m128i r = _mm_shuffle_epi8(val, mask); 213 _mm_storeu_si128((__m128i*)dest, r); 214 return p + detail::groupVarintLengths[key]; 215 } 216 217 /** 218 * Just like decode_simple, but with the additional constraint that 219 * we must be able to read at least 17 bytes from the input pointer, p. 220 */ decode(const char * p,uint32_t * a,uint32_t * b,uint32_t * c,uint32_t * d)221 static const char* decode( 222 const char* p, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) { 223 uint8_t key = uint8_t(p[0]); 224 __m128i val = _mm_loadu_si128((const __m128i*)(p + 1)); 225 __m128i mask = 226 _mm_load_si128((const __m128i*)detail::groupVarintSSEMasks[key].data()); 227 __m128i r = _mm_shuffle_epi8(val, mask); 228 229 *a = uint32_t(_mm_extract_epi32(r, 0)); 230 *b = uint32_t(_mm_extract_epi32(r, 1)); 231 *c = uint32_t(_mm_extract_epi32(r, 2)); 232 *d = uint32_t(_mm_extract_epi32(r, 3)); 233 234 return p + detail::groupVarintLengths[key]; 235 } 236 237 #else // FOLLY_SSE >= 4 decode(const char * p,uint32_t * a,uint32_t * b,uint32_t * c,uint32_t * d)238 static const char* decode( 239 const char* p, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) { 240 return decode_simple(p, a, b, c, d); 241 } 242 decode(const char * p,uint32_t * dest)243 static const char* decode(const char* p, uint32_t* dest) { 244 return decode_simple(p, dest); 245 } 246 #endif // FOLLY_SSE >= 4 247 248 private: key(uint32_t x)249 static uint8_t key(uint32_t x) { 250 // __builtin_clz is undefined for the x==0 case 251 return uint8_t(3 - (__builtin_clz(x | 1) / 8)); 252 } b0key(size_t x)253 static size_t b0key(size_t x) { return x & 3; } b1key(size_t x)254 static size_t b1key(size_t x) { return (x >> 2) & 3; } b2key(size_t x)255 static size_t b2key(size_t x) { return (x >> 4) & 3; } b3key(size_t x)256 static size_t b3key(size_t x) { return (x >> 6) & 3; } 257 258 static const uint32_t kMask[]; 259 }; 260 261 /** 262 * GroupVarint encoding for 64-bit values. 263 * 264 * Encodes 5 64-bit integers at once, each using 1-8 bytes depending on size. 265 * There are two bytes of overhead. (The first two bytes contain the lengths 266 * of the five integers encoded as three bits each; 000=1 byte .. 111 = 8 bytes) 267 * 268 * This implementation assumes little-endian and does unaligned 64-bit 269 * accesses, so it's basically not portable outside of the x86[_64] world. 270 */ 271 template <> 272 class GroupVarint<uint64_t> : public detail::GroupVarintBase<uint64_t> { 273 public: 274 /** 275 * Return the number of bytes used to encode these five values. 276 */ size(uint64_t a,uint64_t b,uint64_t c,uint64_t d,uint64_t e)277 static size_t size( 278 uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e) { 279 return kHeaderSize + kGroupSize + key(a) + key(b) + key(c) + key(d) + 280 key(e); 281 } 282 283 /** 284 * Return the number of bytes used to encode five uint64_t values stored 285 * at consecutive positions in an array. 286 */ size(const uint64_t * p)287 static size_t size(const uint64_t* p) { 288 return size(p[0], p[1], p[2], p[3], p[4]); 289 } 290 291 /** 292 * Return the number of bytes used to encode count (<= 4) values. 293 * If you clip a buffer after these many bytes, you can still decode 294 * the first "count" values correctly (if the remaining size() - 295 * partialSize() bytes are filled with garbage). 296 */ partialSize(const type * p,size_t count)297 static size_t partialSize(const type* p, size_t count) { 298 DCHECK_LE(count, kGroupSize); 299 size_t s = kHeaderSize + count; 300 for (; count; --count, ++p) { 301 s += key(*p); 302 } 303 return s; 304 } 305 306 /** 307 * Return the number of values from *p that are valid from an encoded 308 * buffer of size bytes. 309 */ partialCount(const char * p,size_t size)310 static size_t partialCount(const char* p, size_t size) { 311 uint16_t v = loadUnaligned<uint16_t>(p); 312 size_t s = kHeaderSize; 313 s += 1 + b0key(v); 314 if (s > size) { 315 return 0; 316 } 317 s += 1 + b1key(v); 318 if (s > size) { 319 return 1; 320 } 321 s += 1 + b2key(v); 322 if (s > size) { 323 return 2; 324 } 325 s += 1 + b3key(v); 326 if (s > size) { 327 return 3; 328 } 329 s += 1 + b4key(v); 330 if (s > size) { 331 return 4; 332 } 333 return 5; 334 } 335 336 /** 337 * Given a pointer to the beginning of an GroupVarint64-encoded block, 338 * return the number of bytes used by the encoding. 339 */ encodedSize(const char * p)340 static size_t encodedSize(const char* p) { 341 uint16_t n = loadUnaligned<uint16_t>(p); 342 return kHeaderSize + kGroupSize + b0key(n) + b1key(n) + b2key(n) + 343 b3key(n) + b4key(n); 344 } 345 346 /** 347 * Encode five uint64_t values into the buffer pointed-to by p, and return 348 * the next position in the buffer (that is, one character past the last 349 * encoded byte). p needs to have at least size()+8 bytes available. 350 */ encode(char * p,uint64_t a,uint64_t b,uint64_t c,uint64_t d,uint64_t e)351 static char* encode( 352 char* p, uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e) { 353 uint16_t b0key = key(a); 354 uint16_t b1key = key(b); 355 uint16_t b2key = key(c); 356 uint16_t b3key = key(d); 357 uint16_t b4key = key(e); 358 storeUnaligned<uint16_t>( 359 p, 360 uint16_t( 361 (b4key << 12) | (b3key << 9) | (b2key << 6) | (b1key << 3) | 362 b0key)); 363 p += 2; 364 storeUnaligned(p, a); 365 p += b0key + 1; 366 storeUnaligned(p, b); 367 p += b1key + 1; 368 storeUnaligned(p, c); 369 p += b2key + 1; 370 storeUnaligned(p, d); 371 p += b3key + 1; 372 storeUnaligned(p, e); 373 p += b4key + 1; 374 return p; 375 } 376 377 /** 378 * Encode five uint64_t values from the array pointed-to by src into the 379 * buffer pointed-to by p, similar to encode(p,a,b,c,d,e) above. 380 */ encode(char * p,const uint64_t * src)381 static char* encode(char* p, const uint64_t* src) { 382 return encode(p, src[0], src[1], src[2], src[3], src[4]); 383 } 384 385 /** 386 * Decode five uint64_t values from a buffer, and return the next position 387 * in the buffer (that is, one character past the last encoded byte). 388 * The buffer needs to have at least 7 bytes available (they may be read 389 * but ignored). 390 */ decode(const char * p,uint64_t * a,uint64_t * b,uint64_t * c,uint64_t * d,uint64_t * e)391 static const char* decode( 392 const char* p, 393 uint64_t* a, 394 uint64_t* b, 395 uint64_t* c, 396 uint64_t* d, 397 uint64_t* e) { 398 uint16_t k = loadUnaligned<uint16_t>(p); 399 p += 2; 400 uint8_t k0 = b0key(k); 401 *a = loadUnaligned<uint64_t>(p) & kMask[k0]; 402 p += k0 + 1; 403 uint8_t k1 = b1key(k); 404 *b = loadUnaligned<uint64_t>(p) & kMask[k1]; 405 p += k1 + 1; 406 uint8_t k2 = b2key(k); 407 *c = loadUnaligned<uint64_t>(p) & kMask[k2]; 408 p += k2 + 1; 409 uint8_t k3 = b3key(k); 410 *d = loadUnaligned<uint64_t>(p) & kMask[k3]; 411 p += k3 + 1; 412 uint8_t k4 = b4key(k); 413 *e = loadUnaligned<uint64_t>(p) & kMask[k4]; 414 p += k4 + 1; 415 return p; 416 } 417 418 /** 419 * Decode five uint64_t values from a buffer and store them in the array 420 * pointed-to by dest, similar to decode(p,a,b,c,d,e) above. 421 */ decode(const char * p,uint64_t * dest)422 static const char* decode(const char* p, uint64_t* dest) { 423 return decode(p, dest, dest + 1, dest + 2, dest + 3, dest + 4); 424 } 425 426 private: 427 enum { kHeaderBytes = 2 }; 428 key(uint64_t x)429 static uint8_t key(uint64_t x) { 430 // __builtin_clzll is undefined for the x==0 case 431 return uint8_t(7 - (__builtin_clzll(x | 1) / 8)); 432 } 433 b0key(uint16_t x)434 static uint8_t b0key(uint16_t x) { return x & 7u; } b1key(uint16_t x)435 static uint8_t b1key(uint16_t x) { return (x >> 3) & 7u; } b2key(uint16_t x)436 static uint8_t b2key(uint16_t x) { return (x >> 6) & 7u; } b3key(uint16_t x)437 static uint8_t b3key(uint16_t x) { return (x >> 9) & 7u; } b4key(uint16_t x)438 static uint8_t b4key(uint16_t x) { return (x >> 12) & 7u; } 439 440 static const uint64_t kMask[]; 441 }; 442 443 typedef GroupVarint<uint32_t> GroupVarint32; 444 typedef GroupVarint<uint64_t> GroupVarint64; 445 446 /** 447 * Simplify use of GroupVarint* for the case where data is available one 448 * entry at a time (instead of one group at a time). Handles buffering 449 * and an incomplete last chunk. 450 * 451 * Output is a function object that accepts character ranges: 452 * out(StringPiece) appends the given character range to the output. 453 */ 454 template <class T, class Output> 455 class GroupVarintEncoder { 456 public: 457 typedef GroupVarint<T> Base; 458 typedef T type; 459 GroupVarintEncoder(Output out)460 explicit GroupVarintEncoder(Output out) : out_(out), count_(0) {} 461 ~GroupVarintEncoder()462 ~GroupVarintEncoder() { finish(); } 463 464 /** 465 * Add a value to the encoder. 466 */ add(type val)467 void add(type val) { 468 buf_[count_++] = val; 469 if (count_ == Base::kGroupSize) { 470 char* p = Base::encode(tmp_, buf_); 471 out_(StringPiece(tmp_, p)); 472 count_ = 0; 473 } 474 } 475 476 /** 477 * Finish encoding, flushing any buffered values if necessary. 478 * After finish(), the encoder is immediately ready to encode more data 479 * to the same output. 480 */ finish()481 void finish() { 482 if (count_) { 483 // This is not strictly necessary, but it makes testing easy; 484 // uninitialized bytes are guaranteed to be recorded as taking one byte 485 // (not more). 486 for (size_t i = count_; i < Base::kGroupSize; i++) { 487 buf_[i] = 0; 488 } 489 Base::encode(tmp_, buf_); 490 out_(StringPiece(tmp_, Base::partialSize(buf_, count_))); 491 count_ = 0; 492 } 493 } 494 495 /** 496 * Return the appender that was used. 497 */ output()498 Output& output() { return out_; } output()499 const Output& output() const { return out_; } 500 501 /** 502 * Reset the encoder, disregarding any state (except what was already 503 * flushed to the output, of course). 504 */ clear()505 void clear() { count_ = 0; } 506 507 private: 508 Output out_; 509 char tmp_[Base::kMaxSize]; 510 type buf_[Base::kGroupSize]; 511 size_t count_; 512 }; 513 514 /** 515 * Simplify use of GroupVarint* for the case where the last group in the 516 * input may be incomplete (but the exact size of the input is known). 517 * Allows for extracting values one at a time. 518 */ 519 template <typename T> 520 class GroupVarintDecoder { 521 public: 522 typedef GroupVarint<T> Base; 523 typedef T type; 524 525 GroupVarintDecoder() = default; 526 527 explicit GroupVarintDecoder(StringPiece data, size_t maxCount = (size_t)-1) 528 : rrest_(data.end()), 529 p_(data.data()), 530 end_(data.end()), 531 limit_(end_), 532 pos_(0), 533 count_(0), 534 remaining_(maxCount) {} 535 536 void reset(StringPiece data, size_t maxCount = (size_t)-1) { 537 rrest_ = data.end(); 538 p_ = data.data(); 539 end_ = data.end(); 540 limit_ = end_; 541 pos_ = 0; 542 count_ = 0; 543 remaining_ = maxCount; 544 } 545 546 /** 547 * Read and return the next value. 548 */ next(type * val)549 bool next(type* val) { 550 if (pos_ == count_) { 551 // refill 552 size_t rem = size_t(end_ - p_); 553 if (rem == 0 || remaining_ == 0) { 554 return false; 555 } 556 // next() attempts to read one full group at a time, and so we must have 557 // at least enough bytes readable after its end to handle the case if the 558 // last group is full. 559 // 560 // The best way to ensure this is to ensure that data has at least 561 // Base::kMaxSize - 1 bytes readable *after* the end, otherwise we'll copy 562 // into a temporary buffer. 563 if (limit_ - p_ < Base::kMaxSize) { 564 memcpy(tmp_, p_, rem); 565 p_ = tmp_; 566 end_ = p_ + rem; 567 limit_ = tmp_ + sizeof(tmp_); 568 } 569 pos_ = 0; 570 const char* n = Base::decode(p_, buf_); 571 if (n <= end_) { 572 // Full group could be decoded 573 if (remaining_ >= Base::kGroupSize) { 574 remaining_ -= Base::kGroupSize; 575 count_ = Base::kGroupSize; 576 p_ = n; 577 } else { 578 count_ = remaining_; 579 remaining_ = 0; 580 p_ += Base::partialSize(buf_, count_); 581 } 582 } else { 583 // Can't decode a full group 584 count_ = Base::partialCount(p_, size_t(end_ - p_)); 585 if (remaining_ >= count_) { 586 remaining_ -= count_; 587 p_ = end_; 588 } else { 589 count_ = remaining_; 590 remaining_ = 0; 591 p_ += Base::partialSize(buf_, count_); 592 } 593 if (count_ == 0) { 594 return false; 595 } 596 } 597 } 598 *val = buf_[pos_++]; 599 return true; 600 } 601 rest()602 StringPiece rest() const { 603 // This is only valid after next() returned false 604 CHECK(pos_ == count_ && (p_ == end_ || remaining_ == 0)); 605 // p_ may point to the internal buffer (tmp_), but we want 606 // to return subpiece of the original data 607 size_t size = size_t(end_ - p_); 608 return StringPiece(rrest_ - size, rrest_); 609 } 610 611 private: 612 const char* rrest_; 613 const char* p_; 614 const char* end_; 615 const char* limit_; 616 char tmp_[2 * Base::kMaxSize]; 617 type buf_[Base::kGroupSize]; 618 size_t pos_; 619 size_t count_; 620 size_t remaining_; 621 }; 622 623 typedef GroupVarintDecoder<uint32_t> GroupVarint32Decoder; 624 typedef GroupVarintDecoder<uint64_t> GroupVarint64Decoder; 625 626 } // namespace folly 627 628 #endif // FOLLY_HAVE_GROUP_VARINT 629