1 // Copyright 2016 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Rice-Golomb decoder for blacklist updates. 6 // Details at: https://en.wikipedia.org/wiki/Golomb_coding 7 8 #ifndef COMPONENTS_SAFE_BROWSING_CORE_DB_V4_RICE_H_ 9 #define COMPONENTS_SAFE_BROWSING_CORE_DB_V4_RICE_H_ 10 11 #include <ostream> 12 #include <string> 13 #include <vector> 14 #include "base/gtest_prod_util.h" 15 #include "third_party/protobuf/src/google/protobuf/repeated_field.h" 16 17 namespace safe_browsing { 18 19 // Enumerate different failure events while decoding the Rice-encoded string 20 // sent by the server for histogramming purposes. DO NOT CHANGE THE ORDERING OF 21 // THESE VALUES. 22 enum V4DecodeResult { 23 // No error. 24 DECODE_SUCCESS = 0, 25 26 // Exceeded the number of entries to expect. 27 DECODE_NO_MORE_ENTRIES_FAILURE = 1, 28 29 // Requested to decode >32 bits. 30 DECODE_REQUESTED_TOO_MANY_BITS_FAILURE = 2, 31 32 // All bits had already been read and interpreted in the encoded string. 33 DECODE_RAN_OUT_OF_BITS_FAILURE = 3, 34 35 // The num_entries argument to DecodePrefixes or DecodeIntegers was negative. 36 NUM_ENTRIES_NEGATIVE_FAILURE = 4, 37 38 // Rice-encoding parameter was non-positive when the number of encoded entries 39 // was > 0. 40 RICE_PARAMETER_NON_POSITIVE_FAILURE = 5, 41 42 // |encoded_data| was empty when the number of encoded entries was > 0. 43 ENCODED_DATA_UNEXPECTED_EMPTY_FAILURE = 6, 44 45 // decoded value had an integer overflow, which is unexpected. 46 DECODED_INTEGER_OVERFLOW_FAILURE = 7, 47 48 // Memory space for histograms is determined by the max. ALWAYS 49 // ADD NEW VALUES BEFORE THIS ONE. 50 DECODE_RESULT_MAX 51 }; 52 53 class V4RiceDecoder { 54 public: 55 // Decodes the Rice-encoded string in |encoded_data| as a list of integers 56 // and stores them in |out|. |rice_parameter| is the exponent of 2 for 57 // calculating 'M', |first_value| is the first value in the output sequence, 58 // |num_entries| is the number of subsequent encoded entries. Each decoded 59 // value is a positive offset from the previous value. 60 // So, for instance, if the unencoded sequence is: [3, 7, 25], then 61 // |first_value| = 3, |num_entries| = 2 and decoding the |encoded_data| will 62 // produce the offsets: [4, 18]. 63 static V4DecodeResult DecodeIntegers( 64 const ::google::protobuf::int64 first_value, 65 const ::google::protobuf::int32 rice_parameter, 66 const ::google::protobuf::int32 num_entries, 67 const std::string& encoded_data, 68 ::google::protobuf::RepeatedField<::google::protobuf::int32>* out); 69 70 // Decodes the Rice-encoded string in |encoded_data| as a string of 4-byte 71 // hash prefixes and stores them in |out|. The rest of the arguments are the 72 // same as for |DecodeIntegers|. 73 // Important: |out| is only meant to be used as a concatenated list of sorted 74 // 4-byte hash prefixes, not as a vector of uint32_t values. 75 // This method does the following: 76 // 1. Rice-decode the |encoded_data| as a list of uint32_t values. 77 // 2. Flip the endianness (on little-endian machines) of each of these 78 // values. This is done because when a hash prefix is represented as a 79 // uint32_t, the bytes get reordered. This generates the hash prefix that 80 // the server would have sent in the absence of Rice-encoding. 81 // 3. Sort the resulting list of uint32_t values. 82 // 4. Flip the endianness once again since the uint32_t are expected to be 83 // consumed as a concatenated list of 4-byte hash prefixes, when merging 84 // the 85 // update with the existing state. 86 static V4DecodeResult DecodePrefixes( 87 const ::google::protobuf::int64 first_value, 88 const ::google::protobuf::int32 rice_parameter, 89 const ::google::protobuf::int32 num_entries, 90 const std::string& encoded_data, 91 std::vector<uint32_t>* out); 92 93 virtual ~V4RiceDecoder(); 94 95 std::string DebugString() const; 96 97 private: 98 FRIEND_TEST_ALL_PREFIXES(V4RiceTest, TestDecoderGetNextWordWithNoData); 99 FRIEND_TEST_ALL_PREFIXES(V4RiceTest, TestDecoderGetNextBitsWithNoData); 100 FRIEND_TEST_ALL_PREFIXES(V4RiceTest, TestDecoderGetNextValueWithNoData); 101 FRIEND_TEST_ALL_PREFIXES(V4RiceTest, TestDecoderGetNextValueWithNoEntries); 102 friend class V4RiceTest; 103 104 // Validate some of the parameters passed to the decode methods. 105 static V4DecodeResult ValidateInput( 106 const ::google::protobuf::int32 rice_parameter, 107 const ::google::protobuf::int32 num_entries, 108 const std::string& encoded_data); 109 110 // The |rice_parameter| is the exponent of 2 for calculating 'M', 111 // |num_entries| is the number of encoded entries in the |encoded_data| and 112 // |encoded_data| is the Rice-encoded string to decode. 113 V4RiceDecoder(const ::google::protobuf::int32 rice_parameter, 114 const ::google::protobuf::int32 num_entries, 115 const std::string& encoded_data); 116 117 // Returns true until |num_entries| entries have been decoded. 118 bool HasAnotherValue() const; 119 120 // Populates |value| with the next 32-bit unsigned integer decoded from 121 // |encoded_data|. 122 V4DecodeResult GetNextValue(uint32_t* value); 123 124 // Reads in up to 32 bits from |encoded_data| into |word|, from which 125 // subsequent GetNextBits() calls read bits. 126 V4DecodeResult GetNextWord(uint32_t* word); 127 128 // Reads |num_requested_bits| into |x| from |current_word_| and advances it 129 // if needed by calling GetNextWord(). 130 V4DecodeResult GetNextBits(unsigned int num_requested_bits, uint32_t* x); 131 132 // Reads |num_requested_bits| from |current_word_|. 133 uint32_t GetBitsFromCurrentWord(unsigned int num_requested_bits); 134 135 // The Rice parameter, which is the exponent of two for calculating 'M'. 'M' 136 // is used as the base to calculate the quotient and remainder in the 137 // algorithm. 138 const unsigned int rice_parameter_; 139 140 // The number of entries encoded in the data stream. 141 ::google::protobuf::int32 num_entries_; 142 143 // The Rice-encoded string. 144 const std::string data_; 145 146 // Represents how many total bytes have we read from |data_| into 147 // |current_word_|. 148 unsigned int data_byte_index_; 149 150 // Represents the number of bits that we have read from |current_word_|. When 151 // this becomes 32, which is the size of |current_word_|, a new 152 // |current_word_| needs to be read from |data_|. 153 unsigned int current_word_bit_index_; 154 155 // The 32-bit value read from |data_|. All bit reading operations operate on 156 // |current_word_|. 157 uint32_t current_word_; 158 }; 159 160 std::ostream& operator<<(std::ostream& os, const V4RiceDecoder& rice_decoder); 161 162 } // namespace safe_browsing 163 164 #endif // COMPONENTS_SAFE_BROWSING_CORE_DB_V4_RICE_H_ 165