1 // Copyright 2020 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef THIRD_PARTY_BLINK_PUBLIC_COMMON_PRIVACY_BUDGET_IDENTIFIABLE_TOKEN_BUILDER_H_
6 #define THIRD_PARTY_BLINK_PUBLIC_COMMON_PRIVACY_BUDGET_IDENTIFIABLE_TOKEN_BUILDER_H_
7 
8 #include <array>
9 
10 #include "base/containers/span.h"
11 #include "base/strings/string_piece.h"
12 #include "base/sys_byteorder.h"
13 #include "third_party/blink/public/common/common_export.h"
14 #include "third_party/blink/public/common/privacy_budget/identifiability_internal_templates.h"
15 #include "third_party/blink/public/common/privacy_budget/identifiable_token.h"
16 
17 namespace blink {
18 
19 // Builds an IdentifiableToken incrementally.
20 //
21 // Use this when the input to a sample is a bunch of disjoint objects, or the
22 // sample needs to include objects that are incrementally encountered.
23 //
24 // Notes:
25 //   * The digest returned by this class is *NOT* the same as the one
26 //     IdentifiabilityDigestOfBytes for the same set of bytes. This is due to
27 //     block based chaining of digests used by this class.
28 //     IdentifiabilityDigestOfBytes and this class are *NOT* interchangeable.
29 //
30 //     TODO(asanka): IdentifiabilityDigestOfBytes() and this class should
31 //     interop better. Perhaps by making the latter use the former.
32 //
33 //   * The digest returned by this class is *NOT* the same as what you would
34 //     acquire by invoking IdentifiableToken() over the same object.
35 //     IdentifiableToken() and this class are *NOT* interchangeable.
36 //
37 //   * The digest returned by this class only depends on the cumulative sequence
38 //     of bytes that are fed to it. The partitioning thereof is irrelevant.
39 //
40 //   * This object never finalizes. Partial digests can be extracted at any
41 //     point.
42 class BLINK_COMMON_EXPORT IdentifiableTokenBuilder {
43  public:
44   // Convenient alias for a span of const uint8_t.
45   using ByteSpan = IdentifiableToken::ByteSpan;
46 
47   // Initializes an "empty" incremental digest for the purpose of constructing
48   // an identifiability sample.
49   IdentifiableTokenBuilder();
50 
51   // Initializes an incremental digest and populates it with the data contained
52   // in |message|.
53   explicit IdentifiableTokenBuilder(ByteSpan message);
54 
55   // Copies the intermediate state.
56   IdentifiableTokenBuilder(const IdentifiableTokenBuilder&);
57 
58   // Feeds data contained in |buffer| to the digest.
59   IdentifiableTokenBuilder& AddBytes(ByteSpan buffer);
60 
61   // Feeds data contained in |buffer| to the digest, but precedes the buffer
62   // contents with an integer indicating the length. Use this when:
63   //
64   //   * |buffer| is atomic. I.e. it will always be added as a single buffer.
65   //
66   //   * The boundary between |buffer| and adjacent objects cannot be uniquely
67   //     established based on content.
68   //
69   // E.g.: Ignoring NUL terminators, the pair of strings "abcd", "efgh" will be
70   //       assigned token as the strings "abcdefg", "h" if both are added
71   //       individually via AddBytes(). But they will have distinct digests if
72   //       added via AddAtomic().
73   //
74   // If the contents of the object cannot be specified in a contiguous span of
75   // memory, then consider adding a length directly via AddValue() prior to
76   // adding the contents of the buffer. Doing so will achieve the same ends as
77   // AddAtomic().
78   IdentifiableTokenBuilder& AddAtomic(ByteSpan buffer);
AddAtomic(base::StringPiece string)79   IdentifiableTokenBuilder& AddAtomic(base::StringPiece string) {
80     return AddAtomic(base::as_bytes(base::make_span(string)));
81   }
82 
83   // Feeds the underlying value of the |token| itself to the digest. Use this
84   // when |token| is computed in parallel in order to preserve the ordering of
85   // values that were seen in a concurrent sequence that cannot be
86   // deterministically interleaved into the primary stream.
87   IdentifiableTokenBuilder& AddToken(IdentifiableToken token);
88 
89   // Helper for feeding primitive types by value efficiently. Anything more
90   // complicated than that should be passed in as a base::span<const uint8_t>.
91   //
92   // Adds eight bytes to the digest. If the type of the value doesn't consume
93   // all of the bytes, pads the remainder with NUL bytes.
94   template <typename T,
95             typename std::enable_if_t<
96                 std::is_same<T, internal::remove_cvref_t<T>>::value &&
97                 internal::has_unique_object_representations<T>::value &&
98                 sizeof(T) <= sizeof(uint64_t)>* = nullptr>
AddValue(T in)99   IdentifiableTokenBuilder& AddValue(T in) {
100     AlignPartialBuffer();
101     int64_t clean_buffer =
102         base::ByteSwapToLE64(internal::DigestOfObjectRepresentation(in));
103     return AddBytes(base::make_span(
104         reinterpret_cast<const uint8_t*>(&clean_buffer), sizeof(clean_buffer)));
105   }
106 
107   // Conversion operator captures an intermediate digest.
108   //
109   // The sample captures all the data that's been fed into the digest so far,
110   // but doesn't finalize the digest. It is valid to continue adding data after
111   // constructing an intermediate sample.
112   //
113   // (google-explicit-constructor also flags user-defined conversion operators.)
114   // NOLINTNEXTLINE(google-explicit-constructor)
115   operator IdentifiableToken() const;
116 
117   // Captures an intermediate digest.
118   //
119   // The sample captures all the data that's been fed into the digest so far,
120   // but doesn't finalize the digest. It is valid to continue adding data after
121   // constructing an intermediate sample.
122   IdentifiableToken GetToken() const;
123 
124   // No comparisons.
125   bool operator==(const IdentifiableTokenBuilder&) const = delete;
126   bool operator<(const IdentifiableTokenBuilder&) const = delete;
127 
128  private:
129   // Block size. Must be a multiple of 64. Higher block sizes consume more
130   // memory. The extra cost is unlikely to be worth it.
131   //
132   // Under the covers we use CityHash64. It can pretty efficiently digest
133   // 64-byte blocks.
134   static constexpr size_t kBlockSizeInBytes = 64;
135 
136   // Target alignment for new buffers. This is set to 8 for all platforms and
137   // must always stay constant across platforms.
138   static constexpr size_t kBlockAlignment = 8;
139 
140   // An array of exactly |kBlockSizeInBytes| bytes.
141   using BlockBuffer = std::array<uint8_t, kBlockSizeInBytes>;
142 
143   // A view of a full block.
144   using ConstFullBlockSpan = base::span<const uint8_t, kBlockSizeInBytes>;
145 
146   // Returns true if the partial buffer is aligned on |kBlockAlignment|
147   // boundary.
148   bool IsAligned() const;
149 
150   // Appends enough NUL bytes to |partial_| until the next insertion point is
151   // aligned on a |kBlockAlignment| boundary.
152   //
153   // If the partial buffer is non-empty, its size is unlikely to be aligned at
154   // machine word boundary. This makes subsequent append operations slow for
155   // data types that are already aligned.
156   //
157   // This should only be called prior to adding an atomic buffer.
158   void AlignPartialBuffer();
159 
160   // Captures the |kBlockSizeInBytes| bytes of data in |block| into the digest.
161   // |block| must be exactly this many bytes.
162   void DigestBlock(ConstFullBlockSpan block);
163 
164   // Captures as many bytes as possible from |message| into the partial block in
165   // |partial_|. It captures a maximum of |kBlockSizeInBytes - 1| bytes.
166   //
167   // Returns a span covering the remainder of |message| that was not consumed.
168   ByteSpan SkimIntoPartial(ByteSpan message);
169 
170   // Returns a span for the contents of the partial block.
171   //
172   // Can be called at any point. Does not change the state of the partial
173   // buffer.
174   ByteSpan GetPartialBlock() const;
175 
176   // Returns a span that includes the contents of the partial block and backed
177   // by |partial_|.
178   //
179   // NOTE: Should only be called once |kBlockSizeInBytes| bytes have been
180   // accumulated. Resets |partial_size_| upon completion.
181   //
182   // NOTE: Any subsequent AddBytes(), AddValue(), AddAtomic() calls will
183   // invalidate the returned FullBlock.
184   ConstFullBlockSpan TakeCompletedBlock();
185 
186   // Size of partially filled buffer.
187   size_t PartialSize() const;
188 
189   // Accumulates smaller pieces of data until we have a full block.
190   alignas(int64_t) BlockBuffer partial_;
191 
192   // Next available position in `partial_`. std::array iterators are never
193   // invalidated.
194   BlockBuffer::iterator position_ = partial_.begin();
195 
196   // Merkle-Damgård chaining.
197   uint64_t chaining_value_;
198 };
199 
200 }  // namespace blink
201 
202 #endif  // THIRD_PARTY_BLINK_PUBLIC_COMMON_PRIVACY_BUDGET_IDENTIFIABLE_TOKEN_BUILDER_H_
203