1 /*
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cstdint>
20 #include <cstring>
21 #include <utility>
22 
23 #include <folly/Likely.h>
24 #include <folly/io/Cursor.h>
25 
26 #include <thrift/lib/cpp/protocol/TProtocolException.h>
27 #include <thrift/lib/cpp2/protocol/nimble/ControlBitHelpers.h>
28 #include <thrift/lib/cpp2/protocol/nimble/DecodeNimbleBlock.h>
29 
30 namespace apache {
31 namespace thrift {
32 namespace detail {
33 
34 struct BufferingNimbleDecoderState {
35  public:
36   BufferingNimbleDecoderState() = default;
37   BufferingNimbleDecoderState(const BufferingNimbleDecoderState&) = delete;
BufferingNimbleDecoderStateBufferingNimbleDecoderState38   BufferingNimbleDecoderState(BufferingNimbleDecoderState&& other) noexcept {
39     nextChunkToReturn_ = std::exchange(other.nextChunkToReturn_, -1);
40   }
41   BufferingNimbleDecoderState& operator=(const BufferingNimbleDecoderState&) =
42       delete;
43   BufferingNimbleDecoderState& operator=(
44       BufferingNimbleDecoderState&& other) noexcept {
45     nextChunkToReturn_ = std::exchange(other.nextChunkToReturn_, -1);
46     return *this;
47   }
48 
49  private:
50   template <ChunkRepr repr>
51   friend class BufferingNimbleDecoder;
52 
BufferingNimbleDecoderStateBufferingNimbleDecoderState53   explicit BufferingNimbleDecoderState(ssize_t nextChunkToReturn) noexcept
54       : nextChunkToReturn_(nextChunkToReturn) {}
55 
56   ssize_t nextChunkToReturn_ = -1;
57 };
58 
59 // The buffering decoder serves two purposes:
60 // 1. Provides chunk-at-a-time semantics. Naively, lower level decoders only
61 //    work in blocks of 4 chunks (because each control stream byte maps to 4
62 //    chunks). But often times, consumer classes will want just one or two
63 //    chunks, and don't want to have to deal with bookkeeping the extra ones.
64 // 2. It decodes more than 1 block at a time, as an optimization. (We can save
65 //    some boundary calculation, as well as do some cache locality tuning).
66 template <ChunkRepr repr>
67 class BufferingNimbleDecoder {
68  public:
69   BufferingNimbleDecoder() = default;
70   ~BufferingNimbleDecoder() = default;
71   BufferingNimbleDecoder(const BufferingNimbleDecoder&) = delete;
72   BufferingNimbleDecoder& operator=(const BufferingNimbleDecoder&) = delete;
73 
setControlInput(const folly::io::Cursor & cursor)74   void setControlInput(const folly::io::Cursor& cursor) {
75     controlCursor_ = cursor;
76   }
77 
setDataInput(const folly::io::Cursor & cursor)78   void setDataInput(const folly::io::Cursor& cursor) { dataCursor_ = cursor; }
79 
borrowState()80   BufferingNimbleDecoderState borrowState() {
81     if (folly::kIsDebug) {
82       DCHECK(!stateBorrowed_);
83       stateBorrowed_ = true;
84     }
85     return BufferingNimbleDecoderState{nextChunkToReturn_};
86   }
87 
returnState(BufferingNimbleDecoderState state)88   void returnState(BufferingNimbleDecoderState state) {
89     DCHECK(state.nextChunkToReturn_ != -1);
90 
91     nextChunkToReturn_ = state.nextChunkToReturn_;
92 
93     if (folly::kIsDebug) {
94       DCHECK(stateBorrowed_);
95       stateBorrowed_ = false;
96       state.nextChunkToReturn_ = -1;
97     }
98   }
99 
nextChunk()100   std::uint32_t nextChunk() {
101     DCHECK(!stateBorrowed_);
102     auto state = borrowState();
103     std::uint32_t ret = nextChunk(state);
104     returnState(std::move(state));
105     return ret;
106   }
107 
nextChunk(BufferingNimbleDecoderState & state)108   std::uint32_t nextChunk(BufferingNimbleDecoderState& state) {
109     DCHECK(stateBorrowed_);
110     DCHECK(state.nextChunkToReturn_ != -1);
111 
112     if (UNLIKELY(state.nextChunkToReturn_ == maxChunkFilled_)) {
113       fillBuffer();
114       state = BufferingNimbleDecoderState{0};
115     }
116     std::uint32_t result = chunks_[state.nextChunkToReturn_];
117     ++state.nextChunkToReturn_;
118     return result;
119   }
120 
121  private:
fillBuffer()122   void fillBuffer() {
123     maxChunkFilled_ = 0;
124     // We were asked to produce more chunks, but can't advance in the control
125     // stream; the input is invalid. Note that a similar check on the data
126     // stream wouldn't be correct; a control byte may correspond to 0 data
127     // bytes. Instead, we check that we didn't overflow manually, in the slow
128     // path below.
129     if (!controlCursor_.canAdvance(1)) {
130       protocol::TProtocolException::throwTruncatedData();
131     }
132     while (maxChunkFilled_ < kChunksToBuffer && controlCursor_.canAdvance(1)) {
133       ssize_t minControlBytes = controlCursor_.length();
134       ssize_t minBlocks = dataCursor_.length() / kMaxBytesPerBlock;
135       ssize_t headRoom = (kChunksToBuffer - maxChunkFilled_) / kChunksPerBlock;
136       ssize_t numUnconditionalIters =
137           std::min({minControlBytes, minBlocks, headRoom});
138       ssize_t dataBytesConsumed = 0;
139       if (UNLIKELY(numUnconditionalIters == 0)) {
140         // Slow path where we cross IOBufs.
141         std::uint8_t controlByte = controlCursor_.read<std::uint8_t>();
142         std::array<unsigned char, kMaxBytesPerBlock> buf;
143         size_t bytesPulled =
144             dataCursor_.pullAtMost(buf.data(), kMaxBytesPerBlock);
145         if (LIKELY(bytesPulled == kMaxBytesPerBlock)) {
146           // We're at the end of the current IOBuf, but not necessarily at the
147           // end of the whole chain.
148           size_t bytesDecoded = decodeNimbleBlock<repr>(
149               controlByte, folly::ByteRange(buf), &chunks_[maxChunkFilled_]);
150           dataCursor_.retreat(bytesPulled - bytesDecoded);
151           maxChunkFilled_ += kChunksPerBlock;
152         } else {
153           std::memset(&buf[bytesPulled], 0, kMaxBytesPerBlock - bytesPulled);
154           size_t bytesDecoded = decodeNimbleBlock<repr>(
155               controlByte, folly::ByteRange(buf), &chunks_[maxChunkFilled_]);
156           if (bytesDecoded > bytesPulled) {
157             protocol::TProtocolException::throwTruncatedData();
158           }
159           dataCursor_.retreat(bytesPulled - bytesDecoded);
160           maxChunkFilled_ += kChunksPerBlock;
161         }
162       } else {
163         // Otherwise, we iterate through until we hit the end or run out of
164         // buffer.
165 
166         // We do a little dance where we move values into temporaries, run the
167         // loop, and then move them back. This lets the compiler keep the values
168         // in registers, avoiding having to materialize the memory operations;
169         // this would not be a correct optimization otherwise (what if the
170         // buffers aliased *this?).
171         ssize_t maxChunkFilledTemp = maxChunkFilled_;
172         const std::uint8_t* controlBufTemp = controlCursor_.data();
173         const std::uint8_t* dataBufTemp = dataCursor_.data();
174         for (ssize_t i = 0; i < numUnconditionalIters; ++i) {
175           std::uint8_t controlByte = controlBufTemp[i];
176           dataBytesConsumed += decodeNimbleBlock<repr>(
177               controlByte,
178               // Note that more obvious thing, of subranging a peekBytes()
179               // result, unfortunately has a cost here; it introduces at least
180               // an extra branch per block.
181               folly::ByteRange(
182                   dataBufTemp + dataBytesConsumed,
183                   dataBufTemp + dataBytesConsumed + kMaxBytesPerBlock),
184               &chunks_[maxChunkFilledTemp]);
185           maxChunkFilledTemp += kChunksPerBlock;
186         }
187         maxChunkFilled_ = maxChunkFilledTemp;
188         controlCursor_.skip(numUnconditionalIters);
189         dataCursor_.skip(dataBytesConsumed);
190       }
191     }
192   }
193 
194   // Low level decode works 4 chunks at a time.
195   const static int kChunksToBuffer = 256;
196   static_assert(
197       kChunksToBuffer % kChunksPerBlock == 0,
198       "Must not have partially decoded blocks");
199 
200   std::uint32_t chunks_[kChunksToBuffer];
201   bool stateBorrowed_ = false;
202   ssize_t nextChunkToReturn_ = 0;
203   ssize_t maxChunkFilled_ = 0;
204 
205   folly::io::Cursor controlCursor_{nullptr};
206   folly::io::Cursor dataCursor_{nullptr};
207 };
208 
209 } // namespace detail
210 } // namespace thrift
211 } // namespace apache
212