1 /*
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cstdint>
20 #include <limits>
21 
22 #include <glog/logging.h>
23 
24 #include <folly/Portability.h>
25 #include <folly/Range.h>
26 #include <folly/detail/GroupVarintDetail.h>
27 #include <folly/lang/Bits.h>
28 #include <folly/portability/Builtins.h>
29 
30 #if !defined(__GNUC__) && !defined(_MSC_VER)
31 #error GroupVarint.h requires GCC or MSVC
32 #endif
33 
34 #if FOLLY_X64 || defined(__i386__) || FOLLY_PPC64 || FOLLY_AARCH64
35 #define FOLLY_HAVE_GROUP_VARINT 1
36 #else
37 #define FOLLY_HAVE_GROUP_VARINT 0
38 #endif
39 
40 #if FOLLY_HAVE_GROUP_VARINT
41 
42 #if FOLLY_SSE >= 4
43 #include <nmmintrin.h>
44 namespace folly {
45 namespace detail {
46 extern const std::array<std::array<std::uint32_t, 4>, 256> groupVarintSSEMasks;
47 } // namespace detail
48 } // namespace folly
49 #endif
50 
51 namespace folly {
52 namespace detail {
53 extern const std::array<std::uint8_t, 256> groupVarintLengths;
54 } // namespace detail
55 } // namespace folly
56 
57 namespace folly {
58 
59 template <typename T>
60 class GroupVarint;
61 
62 /**
63  * GroupVarint encoding for 32-bit values.
64  *
65  * Encodes 4 32-bit integers at once, each using 1-4 bytes depending on size.
66  * There is one byte of overhead.  (The first byte contains the lengths of
67  * the four integers encoded as two bits each; 00=1 byte .. 11=4 bytes)
68  *
69  * This implementation assumes little-endian and does unaligned 32-bit
70  * accesses, so it's basically not portable outside of the x86[_64] world.
71  */
72 template <>
73 class GroupVarint<uint32_t> : public detail::GroupVarintBase<uint32_t> {
74  public:
75   /**
76    * Return the number of bytes used to encode these four values.
77    */
size(uint32_t a,uint32_t b,uint32_t c,uint32_t d)78   static size_t size(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
79     return kHeaderSize + kGroupSize + key(a) + key(b) + key(c) + key(d);
80   }
81 
82   /**
83    * Return the number of bytes used to encode four uint32_t values stored
84    * at consecutive positions in an array.
85    */
size(const uint32_t * p)86   static size_t size(const uint32_t* p) { return size(p[0], p[1], p[2], p[3]); }
87 
88   /**
89    * Return the number of bytes used to encode count (<= 4) values.
90    * If you clip a buffer after these many bytes, you can still decode
91    * the first "count" values correctly (if the remaining size() -
92    * partialSize() bytes are filled with garbage).
93    */
partialSize(const type * p,size_t count)94   static size_t partialSize(const type* p, size_t count) {
95     DCHECK_LE(count, kGroupSize);
96     size_t s = kHeaderSize + count;
97     for (; count; --count, ++p) {
98       s += key(*p);
99     }
100     return s;
101   }
102 
103   /**
104    * Return the number of values from *p that are valid from an encoded
105    * buffer of size bytes.
106    */
partialCount(const char * p,size_t size)107   static size_t partialCount(const char* p, size_t size) {
108     uint8_t v = uint8_t(*p);
109     size_t s = kHeaderSize;
110     s += 1 + b0key(v);
111     if (s > size) {
112       return 0;
113     }
114     s += 1 + b1key(v);
115     if (s > size) {
116       return 1;
117     }
118     s += 1 + b2key(v);
119     if (s > size) {
120       return 2;
121     }
122     s += 1 + b3key(v);
123     if (s > size) {
124       return 3;
125     }
126     return 4;
127   }
128 
129   /**
130    * Given a pointer to the beginning of an GroupVarint32-encoded block,
131    * return the number of bytes used by the encoding.
132    */
encodedSize(const char * p)133   static size_t encodedSize(const char* p) {
134     return kHeaderSize + kGroupSize + b0key(uint8_t(*p)) + b1key(uint8_t(*p)) +
135         b2key(uint8_t(*p)) + b3key(uint8_t(*p));
136   }
137 
138   /**
139    * Encode four uint32_t values into the buffer pointed-to by p, and return
140    * the next position in the buffer (that is, one character past the last
141    * encoded byte).  p needs to have at least size()+4 bytes available.
142    */
encode(char * p,uint32_t a,uint32_t b,uint32_t c,uint32_t d)143   static char* encode(char* p, uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
144     uint8_t b0key = key(a);
145     uint8_t b1key = key(b);
146     uint8_t b2key = key(c);
147     uint8_t b3key = key(d);
148     *p++ = (b3key << 6) | (b2key << 4) | (b1key << 2) | b0key;
149     storeUnaligned(p, a);
150     p += b0key + 1;
151     storeUnaligned(p, b);
152     p += b1key + 1;
153     storeUnaligned(p, c);
154     p += b2key + 1;
155     storeUnaligned(p, d);
156     p += b3key + 1;
157     return p;
158   }
159 
160   /**
161    * Encode four uint32_t values from the array pointed-to by src into the
162    * buffer pointed-to by p, similar to encode(p,a,b,c,d) above.
163    */
encode(char * p,const uint32_t * src)164   static char* encode(char* p, const uint32_t* src) {
165     return encode(p, src[0], src[1], src[2], src[3]);
166   }
167 
168   /**
169    * Decode four uint32_t values from a buffer, and return the next position
170    * in the buffer (that is, one character past the last encoded byte).
171    * The buffer needs to have at least 3 extra bytes available (they
172    * may be read but ignored).
173    */
decode_simple(const char * p,uint32_t * a,uint32_t * b,uint32_t * c,uint32_t * d)174   static const char* decode_simple(
175       const char* p, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) {
176     size_t k = loadUnaligned<uint8_t>(p);
177     const char* end = p + detail::groupVarintLengths[k];
178     ++p;
179     size_t k0 = b0key(k);
180     *a = loadUnaligned<uint32_t>(p) & kMask[k0];
181     p += k0 + 1;
182     size_t k1 = b1key(k);
183     *b = loadUnaligned<uint32_t>(p) & kMask[k1];
184     p += k1 + 1;
185     size_t k2 = b2key(k);
186     *c = loadUnaligned<uint32_t>(p) & kMask[k2];
187     p += k2 + 1;
188     size_t k3 = b3key(k);
189     *d = loadUnaligned<uint32_t>(p) & kMask[k3];
190     // p += k3+1;
191     return end;
192   }
193 
194   /**
195    * Decode four uint32_t values from a buffer and store them in the array
196    * pointed-to by dest, similar to decode(p,a,b,c,d) above.
197    */
decode_simple(const char * p,uint32_t * dest)198   static const char* decode_simple(const char* p, uint32_t* dest) {
199     return decode_simple(p, dest, dest + 1, dest + 2, dest + 3);
200   }
201 
202 #if FOLLY_SSE >= 4
203   /**
204    * Just like the non-SSSE3 decode below, but with the additional constraint
205    * that we must be able to read at least 17 bytes from the input pointer, p.
206    */
decode(const char * p,uint32_t * dest)207   static const char* decode(const char* p, uint32_t* dest) {
208     uint8_t key = uint8_t(p[0]);
209     __m128i val = _mm_loadu_si128((const __m128i*)(p + 1));
210     __m128i mask =
211         _mm_load_si128((const __m128i*)detail::groupVarintSSEMasks[key].data());
212     __m128i r = _mm_shuffle_epi8(val, mask);
213     _mm_storeu_si128((__m128i*)dest, r);
214     return p + detail::groupVarintLengths[key];
215   }
216 
217   /**
218    * Just like decode_simple, but with the additional constraint that
219    * we must be able to read at least 17 bytes from the input pointer, p.
220    */
decode(const char * p,uint32_t * a,uint32_t * b,uint32_t * c,uint32_t * d)221   static const char* decode(
222       const char* p, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) {
223     uint8_t key = uint8_t(p[0]);
224     __m128i val = _mm_loadu_si128((const __m128i*)(p + 1));
225     __m128i mask =
226         _mm_load_si128((const __m128i*)detail::groupVarintSSEMasks[key].data());
227     __m128i r = _mm_shuffle_epi8(val, mask);
228 
229     *a = uint32_t(_mm_extract_epi32(r, 0));
230     *b = uint32_t(_mm_extract_epi32(r, 1));
231     *c = uint32_t(_mm_extract_epi32(r, 2));
232     *d = uint32_t(_mm_extract_epi32(r, 3));
233 
234     return p + detail::groupVarintLengths[key];
235   }
236 
237 #else // FOLLY_SSE >= 4
decode(const char * p,uint32_t * a,uint32_t * b,uint32_t * c,uint32_t * d)238   static const char* decode(
239       const char* p, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) {
240     return decode_simple(p, a, b, c, d);
241   }
242 
decode(const char * p,uint32_t * dest)243   static const char* decode(const char* p, uint32_t* dest) {
244     return decode_simple(p, dest);
245   }
246 #endif // FOLLY_SSE >= 4
247 
248  private:
key(uint32_t x)249   static uint8_t key(uint32_t x) {
250     // __builtin_clz is undefined for the x==0 case
251     return uint8_t(3 - (__builtin_clz(x | 1) / 8));
252   }
b0key(size_t x)253   static size_t b0key(size_t x) { return x & 3; }
b1key(size_t x)254   static size_t b1key(size_t x) { return (x >> 2) & 3; }
b2key(size_t x)255   static size_t b2key(size_t x) { return (x >> 4) & 3; }
b3key(size_t x)256   static size_t b3key(size_t x) { return (x >> 6) & 3; }
257 
258   static const uint32_t kMask[];
259 };
260 
261 /**
262  * GroupVarint encoding for 64-bit values.
263  *
264  * Encodes 5 64-bit integers at once, each using 1-8 bytes depending on size.
265  * There are two bytes of overhead.  (The first two bytes contain the lengths
266  * of the five integers encoded as three bits each; 000=1 byte .. 111 = 8 bytes)
267  *
268  * This implementation assumes little-endian and does unaligned 64-bit
269  * accesses, so it's basically not portable outside of the x86[_64] world.
270  */
271 template <>
272 class GroupVarint<uint64_t> : public detail::GroupVarintBase<uint64_t> {
273  public:
274   /**
275    * Return the number of bytes used to encode these five values.
276    */
size(uint64_t a,uint64_t b,uint64_t c,uint64_t d,uint64_t e)277   static size_t size(
278       uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e) {
279     return kHeaderSize + kGroupSize + key(a) + key(b) + key(c) + key(d) +
280         key(e);
281   }
282 
283   /**
284    * Return the number of bytes used to encode five uint64_t values stored
285    * at consecutive positions in an array.
286    */
size(const uint64_t * p)287   static size_t size(const uint64_t* p) {
288     return size(p[0], p[1], p[2], p[3], p[4]);
289   }
290 
291   /**
292    * Return the number of bytes used to encode count (<= 4) values.
293    * If you clip a buffer after these many bytes, you can still decode
294    * the first "count" values correctly (if the remaining size() -
295    * partialSize() bytes are filled with garbage).
296    */
partialSize(const type * p,size_t count)297   static size_t partialSize(const type* p, size_t count) {
298     DCHECK_LE(count, kGroupSize);
299     size_t s = kHeaderSize + count;
300     for (; count; --count, ++p) {
301       s += key(*p);
302     }
303     return s;
304   }
305 
306   /**
307    * Return the number of values from *p that are valid from an encoded
308    * buffer of size bytes.
309    */
partialCount(const char * p,size_t size)310   static size_t partialCount(const char* p, size_t size) {
311     uint16_t v = loadUnaligned<uint16_t>(p);
312     size_t s = kHeaderSize;
313     s += 1 + b0key(v);
314     if (s > size) {
315       return 0;
316     }
317     s += 1 + b1key(v);
318     if (s > size) {
319       return 1;
320     }
321     s += 1 + b2key(v);
322     if (s > size) {
323       return 2;
324     }
325     s += 1 + b3key(v);
326     if (s > size) {
327       return 3;
328     }
329     s += 1 + b4key(v);
330     if (s > size) {
331       return 4;
332     }
333     return 5;
334   }
335 
336   /**
337    * Given a pointer to the beginning of an GroupVarint64-encoded block,
338    * return the number of bytes used by the encoding.
339    */
encodedSize(const char * p)340   static size_t encodedSize(const char* p) {
341     uint16_t n = loadUnaligned<uint16_t>(p);
342     return kHeaderSize + kGroupSize + b0key(n) + b1key(n) + b2key(n) +
343         b3key(n) + b4key(n);
344   }
345 
346   /**
347    * Encode five uint64_t values into the buffer pointed-to by p, and return
348    * the next position in the buffer (that is, one character past the last
349    * encoded byte).  p needs to have at least size()+8 bytes available.
350    */
encode(char * p,uint64_t a,uint64_t b,uint64_t c,uint64_t d,uint64_t e)351   static char* encode(
352       char* p, uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e) {
353     uint16_t b0key = key(a);
354     uint16_t b1key = key(b);
355     uint16_t b2key = key(c);
356     uint16_t b3key = key(d);
357     uint16_t b4key = key(e);
358     storeUnaligned<uint16_t>(
359         p,
360         uint16_t(
361             (b4key << 12) | (b3key << 9) | (b2key << 6) | (b1key << 3) |
362             b0key));
363     p += 2;
364     storeUnaligned(p, a);
365     p += b0key + 1;
366     storeUnaligned(p, b);
367     p += b1key + 1;
368     storeUnaligned(p, c);
369     p += b2key + 1;
370     storeUnaligned(p, d);
371     p += b3key + 1;
372     storeUnaligned(p, e);
373     p += b4key + 1;
374     return p;
375   }
376 
377   /**
378    * Encode five uint64_t values from the array pointed-to by src into the
379    * buffer pointed-to by p, similar to encode(p,a,b,c,d,e) above.
380    */
encode(char * p,const uint64_t * src)381   static char* encode(char* p, const uint64_t* src) {
382     return encode(p, src[0], src[1], src[2], src[3], src[4]);
383   }
384 
385   /**
386    * Decode five uint64_t values from a buffer, and return the next position
387    * in the buffer (that is, one character past the last encoded byte).
388    * The buffer needs to have at least 7 bytes available (they may be read
389    * but ignored).
390    */
decode(const char * p,uint64_t * a,uint64_t * b,uint64_t * c,uint64_t * d,uint64_t * e)391   static const char* decode(
392       const char* p,
393       uint64_t* a,
394       uint64_t* b,
395       uint64_t* c,
396       uint64_t* d,
397       uint64_t* e) {
398     uint16_t k = loadUnaligned<uint16_t>(p);
399     p += 2;
400     uint8_t k0 = b0key(k);
401     *a = loadUnaligned<uint64_t>(p) & kMask[k0];
402     p += k0 + 1;
403     uint8_t k1 = b1key(k);
404     *b = loadUnaligned<uint64_t>(p) & kMask[k1];
405     p += k1 + 1;
406     uint8_t k2 = b2key(k);
407     *c = loadUnaligned<uint64_t>(p) & kMask[k2];
408     p += k2 + 1;
409     uint8_t k3 = b3key(k);
410     *d = loadUnaligned<uint64_t>(p) & kMask[k3];
411     p += k3 + 1;
412     uint8_t k4 = b4key(k);
413     *e = loadUnaligned<uint64_t>(p) & kMask[k4];
414     p += k4 + 1;
415     return p;
416   }
417 
418   /**
419    * Decode five uint64_t values from a buffer and store them in the array
420    * pointed-to by dest, similar to decode(p,a,b,c,d,e) above.
421    */
decode(const char * p,uint64_t * dest)422   static const char* decode(const char* p, uint64_t* dest) {
423     return decode(p, dest, dest + 1, dest + 2, dest + 3, dest + 4);
424   }
425 
426  private:
427   enum { kHeaderBytes = 2 };
428 
key(uint64_t x)429   static uint8_t key(uint64_t x) {
430     // __builtin_clzll is undefined for the x==0 case
431     return uint8_t(7 - (__builtin_clzll(x | 1) / 8));
432   }
433 
b0key(uint16_t x)434   static uint8_t b0key(uint16_t x) { return x & 7u; }
b1key(uint16_t x)435   static uint8_t b1key(uint16_t x) { return (x >> 3) & 7u; }
b2key(uint16_t x)436   static uint8_t b2key(uint16_t x) { return (x >> 6) & 7u; }
b3key(uint16_t x)437   static uint8_t b3key(uint16_t x) { return (x >> 9) & 7u; }
b4key(uint16_t x)438   static uint8_t b4key(uint16_t x) { return (x >> 12) & 7u; }
439 
440   static const uint64_t kMask[];
441 };
442 
443 typedef GroupVarint<uint32_t> GroupVarint32;
444 typedef GroupVarint<uint64_t> GroupVarint64;
445 
446 /**
447  * Simplify use of GroupVarint* for the case where data is available one
448  * entry at a time (instead of one group at a time).  Handles buffering
449  * and an incomplete last chunk.
450  *
451  * Output is a function object that accepts character ranges:
452  * out(StringPiece) appends the given character range to the output.
453  */
454 template <class T, class Output>
455 class GroupVarintEncoder {
456  public:
457   typedef GroupVarint<T> Base;
458   typedef T type;
459 
GroupVarintEncoder(Output out)460   explicit GroupVarintEncoder(Output out) : out_(out), count_(0) {}
461 
~GroupVarintEncoder()462   ~GroupVarintEncoder() { finish(); }
463 
464   /**
465    * Add a value to the encoder.
466    */
add(type val)467   void add(type val) {
468     buf_[count_++] = val;
469     if (count_ == Base::kGroupSize) {
470       char* p = Base::encode(tmp_, buf_);
471       out_(StringPiece(tmp_, p));
472       count_ = 0;
473     }
474   }
475 
476   /**
477    * Finish encoding, flushing any buffered values if necessary.
478    * After finish(), the encoder is immediately ready to encode more data
479    * to the same output.
480    */
finish()481   void finish() {
482     if (count_) {
483       // This is not strictly necessary, but it makes testing easy;
484       // uninitialized bytes are guaranteed to be recorded as taking one byte
485       // (not more).
486       for (size_t i = count_; i < Base::kGroupSize; i++) {
487         buf_[i] = 0;
488       }
489       Base::encode(tmp_, buf_);
490       out_(StringPiece(tmp_, Base::partialSize(buf_, count_)));
491       count_ = 0;
492     }
493   }
494 
495   /**
496    * Return the appender that was used.
497    */
output()498   Output& output() { return out_; }
output()499   const Output& output() const { return out_; }
500 
501   /**
502    * Reset the encoder, disregarding any state (except what was already
503    * flushed to the output, of course).
504    */
clear()505   void clear() { count_ = 0; }
506 
507  private:
508   Output out_;
509   char tmp_[Base::kMaxSize];
510   type buf_[Base::kGroupSize];
511   size_t count_;
512 };
513 
514 /**
515  * Simplify use of GroupVarint* for the case where the last group in the
516  * input may be incomplete (but the exact size of the input is known).
517  * Allows for extracting values one at a time.
518  */
519 template <typename T>
520 class GroupVarintDecoder {
521  public:
522   typedef GroupVarint<T> Base;
523   typedef T type;
524 
525   GroupVarintDecoder() = default;
526 
527   explicit GroupVarintDecoder(StringPiece data, size_t maxCount = (size_t)-1)
528       : rrest_(data.end()),
529         p_(data.data()),
530         end_(data.end()),
531         limit_(end_),
532         pos_(0),
533         count_(0),
534         remaining_(maxCount) {}
535 
536   void reset(StringPiece data, size_t maxCount = (size_t)-1) {
537     rrest_ = data.end();
538     p_ = data.data();
539     end_ = data.end();
540     limit_ = end_;
541     pos_ = 0;
542     count_ = 0;
543     remaining_ = maxCount;
544   }
545 
546   /**
547    * Read and return the next value.
548    */
next(type * val)549   bool next(type* val) {
550     if (pos_ == count_) {
551       // refill
552       size_t rem = size_t(end_ - p_);
553       if (rem == 0 || remaining_ == 0) {
554         return false;
555       }
556       // next() attempts to read one full group at a time, and so we must have
557       // at least enough bytes readable after its end to handle the case if the
558       // last group is full.
559       //
560       // The best way to ensure this is to ensure that data has at least
561       // Base::kMaxSize - 1 bytes readable *after* the end, otherwise we'll copy
562       // into a temporary buffer.
563       if (limit_ - p_ < Base::kMaxSize) {
564         memcpy(tmp_, p_, rem);
565         p_ = tmp_;
566         end_ = p_ + rem;
567         limit_ = tmp_ + sizeof(tmp_);
568       }
569       pos_ = 0;
570       const char* n = Base::decode(p_, buf_);
571       if (n <= end_) {
572         // Full group could be decoded
573         if (remaining_ >= Base::kGroupSize) {
574           remaining_ -= Base::kGroupSize;
575           count_ = Base::kGroupSize;
576           p_ = n;
577         } else {
578           count_ = remaining_;
579           remaining_ = 0;
580           p_ += Base::partialSize(buf_, count_);
581         }
582       } else {
583         // Can't decode a full group
584         count_ = Base::partialCount(p_, size_t(end_ - p_));
585         if (remaining_ >= count_) {
586           remaining_ -= count_;
587           p_ = end_;
588         } else {
589           count_ = remaining_;
590           remaining_ = 0;
591           p_ += Base::partialSize(buf_, count_);
592         }
593         if (count_ == 0) {
594           return false;
595         }
596       }
597     }
598     *val = buf_[pos_++];
599     return true;
600   }
601 
rest()602   StringPiece rest() const {
603     // This is only valid after next() returned false
604     CHECK(pos_ == count_ && (p_ == end_ || remaining_ == 0));
605     // p_ may point to the internal buffer (tmp_), but we want
606     // to return subpiece of the original data
607     size_t size = size_t(end_ - p_);
608     return StringPiece(rrest_ - size, rrest_);
609   }
610 
611  private:
612   const char* rrest_;
613   const char* p_;
614   const char* end_;
615   const char* limit_;
616   char tmp_[2 * Base::kMaxSize];
617   type buf_[Base::kGroupSize];
618   size_t pos_;
619   size_t count_;
620   size_t remaining_;
621 };
622 
623 typedef GroupVarintDecoder<uint32_t> GroupVarint32Decoder;
624 typedef GroupVarintDecoder<uint64_t> GroupVarint64Decoder;
625 
626 } // namespace folly
627 
628 #endif // FOLLY_HAVE_GROUP_VARINT
629