1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Rice-Golomb decoder for blacklist updates.
6 // Details at: https://en.wikipedia.org/wiki/Golomb_coding
7 
8 #ifndef COMPONENTS_SAFE_BROWSING_CORE_DB_V4_RICE_H_
9 #define COMPONENTS_SAFE_BROWSING_CORE_DB_V4_RICE_H_
10 
11 #include <ostream>
12 #include <string>
13 #include <vector>
14 #include "base/gtest_prod_util.h"
15 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"
16 
17 namespace safe_browsing {
18 
19 // Enumerate different failure events while decoding the Rice-encoded string
20 // sent by the server for histogramming purposes. DO NOT CHANGE THE ORDERING OF
21 // THESE VALUES.
22 enum V4DecodeResult {
23   // No error.
24   DECODE_SUCCESS = 0,
25 
26   // Exceeded the number of entries to expect.
27   DECODE_NO_MORE_ENTRIES_FAILURE = 1,
28 
29   // Requested to decode >32 bits.
30   DECODE_REQUESTED_TOO_MANY_BITS_FAILURE = 2,
31 
32   // All bits had already been read and interpreted in the encoded string.
33   DECODE_RAN_OUT_OF_BITS_FAILURE = 3,
34 
35   // The num_entries argument to DecodePrefixes or DecodeIntegers was negative.
36   NUM_ENTRIES_NEGATIVE_FAILURE = 4,
37 
38   // Rice-encoding parameter was non-positive when the number of encoded entries
39   // was > 0.
40   RICE_PARAMETER_NON_POSITIVE_FAILURE = 5,
41 
42   // |encoded_data| was empty when the number of encoded entries was > 0.
43   ENCODED_DATA_UNEXPECTED_EMPTY_FAILURE = 6,
44 
45   // decoded value had an integer overflow, which is unexpected.
46   DECODED_INTEGER_OVERFLOW_FAILURE = 7,
47 
48   // Memory space for histograms is determined by the max.  ALWAYS
49   // ADD NEW VALUES BEFORE THIS ONE.
50   DECODE_RESULT_MAX
51 };
52 
53 class V4RiceDecoder {
54  public:
55   // Decodes the Rice-encoded string in |encoded_data| as a list of integers
56   // and stores them in |out|. |rice_parameter| is the exponent of 2 for
57   // calculating 'M', |first_value| is the first value in the output sequence,
58   // |num_entries| is the number of subsequent encoded entries. Each decoded
59   // value is a positive offset from the previous value.
60   // So, for instance, if the unencoded sequence is: [3, 7, 25], then
61   // |first_value| = 3, |num_entries| = 2 and decoding the |encoded_data| will
62   // produce the offsets: [4, 18].
63   static V4DecodeResult DecodeIntegers(
64       const ::google::protobuf::int64 first_value,
65       const ::google::protobuf::int32 rice_parameter,
66       const ::google::protobuf::int32 num_entries,
67       const std::string& encoded_data,
68       ::google::protobuf::RepeatedField<::google::protobuf::int32>* out);
69 
70   // Decodes the Rice-encoded string in |encoded_data| as a string of 4-byte
71   // hash prefixes and stores them in |out|. The rest of the arguments are the
72   // same as for |DecodeIntegers|.
73   // Important: |out| is only meant to be used as a concatenated list of sorted
74   // 4-byte hash prefixes, not as a vector of uint32_t values.
75   // This method does the following:
76   // 1. Rice-decode the |encoded_data| as a list of uint32_t values.
77   // 2. Flip the endianness (on little-endian machines) of each of these
78   //    values. This is done because when a hash prefix is represented as a
79   //    uint32_t, the bytes get reordered. This generates the hash prefix that
80   //    the server would have sent in the absence of Rice-encoding.
81   // 3. Sort the resulting list of uint32_t values.
82   // 4. Flip the endianness once again since the uint32_t are expected to be
83   //    consumed as a concatenated list of 4-byte hash prefixes, when merging
84   //    the
85   //    update with the existing state.
86   static V4DecodeResult DecodePrefixes(
87       const ::google::protobuf::int64 first_value,
88       const ::google::protobuf::int32 rice_parameter,
89       const ::google::protobuf::int32 num_entries,
90       const std::string& encoded_data,
91       std::vector<uint32_t>* out);
92 
93   virtual ~V4RiceDecoder();
94 
95   std::string DebugString() const;
96 
97  private:
98   FRIEND_TEST_ALL_PREFIXES(V4RiceTest, TestDecoderGetNextWordWithNoData);
99   FRIEND_TEST_ALL_PREFIXES(V4RiceTest, TestDecoderGetNextBitsWithNoData);
100   FRIEND_TEST_ALL_PREFIXES(V4RiceTest, TestDecoderGetNextValueWithNoData);
101   FRIEND_TEST_ALL_PREFIXES(V4RiceTest, TestDecoderGetNextValueWithNoEntries);
102   friend class V4RiceTest;
103 
104   // Validate some of the parameters passed to the decode methods.
105   static V4DecodeResult ValidateInput(
106       const ::google::protobuf::int32 rice_parameter,
107       const ::google::protobuf::int32 num_entries,
108       const std::string& encoded_data);
109 
110   // The |rice_parameter| is the exponent of 2 for calculating 'M',
111   // |num_entries| is the number of encoded entries in the |encoded_data| and
112   // |encoded_data| is the Rice-encoded string to decode.
113   V4RiceDecoder(const ::google::protobuf::int32 rice_parameter,
114                 const ::google::protobuf::int32 num_entries,
115                 const std::string& encoded_data);
116 
117   // Returns true until |num_entries| entries have been decoded.
118   bool HasAnotherValue() const;
119 
120   // Populates |value| with the next 32-bit unsigned integer decoded from
121   // |encoded_data|.
122   V4DecodeResult GetNextValue(uint32_t* value);
123 
124   // Reads in up to 32 bits from |encoded_data| into |word|, from which
125   // subsequent GetNextBits() calls read bits.
126   V4DecodeResult GetNextWord(uint32_t* word);
127 
128   // Reads |num_requested_bits| into |x| from |current_word_| and advances it
129   // if needed by calling GetNextWord().
130   V4DecodeResult GetNextBits(unsigned int num_requested_bits, uint32_t* x);
131 
132   // Reads |num_requested_bits| from |current_word_|.
133   uint32_t GetBitsFromCurrentWord(unsigned int num_requested_bits);
134 
135   // The Rice parameter, which is the exponent of two for calculating 'M'. 'M'
136   // is used as the base to calculate the quotient and remainder in the
137   // algorithm.
138   const unsigned int rice_parameter_;
139 
140   // The number of entries encoded in the data stream.
141   ::google::protobuf::int32 num_entries_;
142 
143   // The Rice-encoded string.
144   const std::string data_;
145 
146   // Represents how many total bytes have we read from |data_| into
147   // |current_word_|.
148   unsigned int data_byte_index_;
149 
150   // Represents the number of bits that we have read from |current_word_|. When
151   // this becomes 32, which is the size of |current_word_|, a new
152   // |current_word_| needs to be read from |data_|.
153   unsigned int current_word_bit_index_;
154 
155   // The 32-bit value read from |data_|. All bit reading operations operate on
156   // |current_word_|.
157   uint32_t current_word_;
158 };
159 
160 std::ostream& operator<<(std::ostream& os, const V4RiceDecoder& rice_decoder);
161 
162 }  // namespace safe_browsing
163 
164 #endif  // COMPONENTS_SAFE_BROWSING_CORE_DB_V4_RICE_H_
165