1 /*
2  *  Copyright (c) 2020 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "modules/rtp_rtcp/source/video_rtp_depacketizer_h264.h"
12 
13 #include <cstddef>
14 #include <cstdint>
15 #include <utility>
16 #include <vector>
17 
18 #include "absl/base/macros.h"
19 #include "absl/types/optional.h"
20 #include "absl/types/variant.h"
21 #include "common_video/h264/h264_common.h"
22 #include "common_video/h264/pps_parser.h"
23 #include "common_video/h264/sps_parser.h"
24 #include "common_video/h264/sps_vui_rewriter.h"
25 #include "modules/rtp_rtcp/source/byte_io.h"
26 #include "modules/rtp_rtcp/source/video_rtp_depacketizer.h"
27 #include "rtc_base/checks.h"
28 #include "rtc_base/copy_on_write_buffer.h"
29 #include "rtc_base/logging.h"
30 
31 namespace webrtc {
32 namespace {
33 
// Sizes, in bytes, of the fixed-length fields handled by this depacketizer.
constexpr size_t kNalHeaderSize = 1;
constexpr size_t kFuAHeaderSize = 2;
constexpr size_t kLengthFieldSize = 2;
// A STAP-A payload begins with one NAL header byte followed by the length
// field of its first aggregated unit.
constexpr size_t kStapAHeaderSize = kNalHeaderSize + kLengthFieldSize;

// Bit masks for FU (A and B) indicators.
enum NalDefs : uint8_t {
  kFBit = 0x80,
  kNriMask = 0x60,
  kTypeMask = 0x1F,
};

// Bit masks for FU (A and B) headers.
enum FuDefs : uint8_t {
  kSBit = 0x80,
  kEBit = 0x40,
  kRBit = 0x20,
};
44 
45 // TODO(pbos): Avoid parsing this here as well as inside the jitter buffer.
ParseStapAStartOffsets(const uint8_t * nalu_ptr,size_t length_remaining,std::vector<size_t> * offsets)46 bool ParseStapAStartOffsets(const uint8_t* nalu_ptr,
47                             size_t length_remaining,
48                             std::vector<size_t>* offsets) {
49   size_t offset = 0;
50   while (length_remaining > 0) {
51     // Buffer doesn't contain room for additional nalu length.
52     if (length_remaining < sizeof(uint16_t))
53       return false;
54     uint16_t nalu_size = ByteReader<uint16_t>::ReadBigEndian(nalu_ptr);
55     nalu_ptr += sizeof(uint16_t);
56     length_remaining -= sizeof(uint16_t);
57     if (nalu_size > length_remaining)
58       return false;
59     nalu_ptr += nalu_size;
60     length_remaining -= nalu_size;
61 
62     offsets->push_back(offset + kStapAHeaderSize);
63     offset += kLengthFieldSize + nalu_size;
64   }
65   return true;
66 }
67 
ProcessStapAOrSingleNalu(rtc::CopyOnWriteBuffer rtp_payload)68 absl::optional<VideoRtpDepacketizer::ParsedRtpPayload> ProcessStapAOrSingleNalu(
69     rtc::CopyOnWriteBuffer rtp_payload) {
70   const uint8_t* const payload_data = rtp_payload.cdata();
71   absl::optional<VideoRtpDepacketizer::ParsedRtpPayload> parsed_payload(
72       absl::in_place);
73   bool modified_buffer = false;
74   parsed_payload->video_payload = rtp_payload;
75   parsed_payload->video_header.width = 0;
76   parsed_payload->video_header.height = 0;
77   parsed_payload->video_header.codec = kVideoCodecH264;
78   parsed_payload->video_header.simulcastIdx = 0;
79   parsed_payload->video_header.is_first_packet_in_frame = true;
80   auto& h264_header = parsed_payload->video_header.video_type_header
81                           .emplace<RTPVideoHeaderH264>();
82 
83   const uint8_t* nalu_start = payload_data + kNalHeaderSize;
84   const size_t nalu_length = rtp_payload.size() - kNalHeaderSize;
85   uint8_t nal_type = payload_data[0] & kTypeMask;
86   std::vector<size_t> nalu_start_offsets;
87   if (nal_type == H264::NaluType::kStapA) {
88     // Skip the StapA header (StapA NAL type + length).
89     if (rtp_payload.size() <= kStapAHeaderSize) {
90       RTC_LOG(LS_ERROR) << "StapA header truncated.";
91       return absl::nullopt;
92     }
93 
94     if (!ParseStapAStartOffsets(nalu_start, nalu_length, &nalu_start_offsets)) {
95       RTC_LOG(LS_ERROR) << "StapA packet with incorrect NALU packet lengths.";
96       return absl::nullopt;
97     }
98 
99     h264_header.packetization_type = kH264StapA;
100     nal_type = payload_data[kStapAHeaderSize] & kTypeMask;
101   } else {
102     h264_header.packetization_type = kH264SingleNalu;
103     nalu_start_offsets.push_back(0);
104   }
105   h264_header.nalu_type = nal_type;
106   parsed_payload->video_header.frame_type = VideoFrameType::kVideoFrameDelta;
107 
108   nalu_start_offsets.push_back(rtp_payload.size() +
109                                kLengthFieldSize);  // End offset.
110   for (size_t i = 0; i < nalu_start_offsets.size() - 1; ++i) {
111     size_t start_offset = nalu_start_offsets[i];
112     // End offset is actually start offset for next unit, excluding length field
113     // so remove that from this units length.
114     size_t end_offset = nalu_start_offsets[i + 1] - kLengthFieldSize;
115     if (end_offset - start_offset < H264::kNaluTypeSize) {
116       RTC_LOG(LS_ERROR) << "STAP-A packet too short";
117       return absl::nullopt;
118     }
119 
120     NaluInfo nalu;
121     nalu.type = payload_data[start_offset] & kTypeMask;
122     nalu.sps_id = -1;
123     nalu.pps_id = -1;
124     start_offset += H264::kNaluTypeSize;
125 
126     switch (nalu.type) {
127       case H264::NaluType::kSps: {
128         // Check if VUI is present in SPS and if it needs to be modified to
129         // avoid
130         // excessive decoder latency.
131 
132         // Copy any previous data first (likely just the first header).
133         rtc::Buffer output_buffer;
134         if (start_offset)
135           output_buffer.AppendData(payload_data, start_offset);
136 
137         absl::optional<SpsParser::SpsState> sps;
138 
139         SpsVuiRewriter::ParseResult result = SpsVuiRewriter::ParseAndRewriteSps(
140             &payload_data[start_offset], end_offset - start_offset, &sps,
141             nullptr, &output_buffer, SpsVuiRewriter::Direction::kIncoming);
142 
143         if (result == SpsVuiRewriter::ParseResult::kVuiRewritten) {
144           if (modified_buffer) {
145             RTC_LOG(LS_WARNING)
146                 << "More than one H264 SPS NAL units needing "
147                    "rewriting found within a single STAP-A packet. "
148                    "Keeping the first and rewriting the last.";
149           }
150 
151           // Rewrite length field to new SPS size.
152           if (h264_header.packetization_type == kH264StapA) {
153             size_t length_field_offset =
154                 start_offset - (H264::kNaluTypeSize + kLengthFieldSize);
155             // Stap-A Length includes payload data and type header.
156             size_t rewritten_size =
157                 output_buffer.size() - start_offset + H264::kNaluTypeSize;
158             ByteWriter<uint16_t>::WriteBigEndian(
159                 &output_buffer[length_field_offset], rewritten_size);
160           }
161 
162           parsed_payload->video_payload.SetData(output_buffer.data(),
163                                                 output_buffer.size());
164           // Append rest of packet.
165           parsed_payload->video_payload.AppendData(
166               &payload_data[end_offset],
167               nalu_length + kNalHeaderSize - end_offset);
168 
169           modified_buffer = true;
170         }
171 
172         if (sps) {
173           parsed_payload->video_header.width = sps->width;
174           parsed_payload->video_header.height = sps->height;
175           nalu.sps_id = sps->id;
176         } else {
177           RTC_LOG(LS_WARNING) << "Failed to parse SPS id from SPS slice.";
178         }
179         parsed_payload->video_header.frame_type =
180             VideoFrameType::kVideoFrameKey;
181         break;
182       }
183       case H264::NaluType::kPps: {
184         uint32_t pps_id;
185         uint32_t sps_id;
186         if (PpsParser::ParsePpsIds(&payload_data[start_offset],
187                                    end_offset - start_offset, &pps_id,
188                                    &sps_id)) {
189           nalu.pps_id = pps_id;
190           nalu.sps_id = sps_id;
191         } else {
192           RTC_LOG(LS_WARNING)
193               << "Failed to parse PPS id and SPS id from PPS slice.";
194         }
195         break;
196       }
197       case H264::NaluType::kIdr:
198         parsed_payload->video_header.frame_type =
199             VideoFrameType::kVideoFrameKey;
200         ABSL_FALLTHROUGH_INTENDED;
201       case H264::NaluType::kSlice: {
202         absl::optional<uint32_t> pps_id = PpsParser::ParsePpsIdFromSlice(
203             &payload_data[start_offset], end_offset - start_offset);
204         if (pps_id) {
205           nalu.pps_id = *pps_id;
206         } else {
207           RTC_LOG(LS_WARNING) << "Failed to parse PPS id from slice of type: "
208                               << static_cast<int>(nalu.type);
209         }
210         break;
211       }
212       // Slices below don't contain SPS or PPS ids.
213       case H264::NaluType::kAud:
214       case H264::NaluType::kEndOfSequence:
215       case H264::NaluType::kEndOfStream:
216       case H264::NaluType::kFiller:
217       case H264::NaluType::kSei:
218         break;
219       case H264::NaluType::kStapA:
220       case H264::NaluType::kFuA:
221         RTC_LOG(LS_WARNING) << "Unexpected STAP-A or FU-A received.";
222         return absl::nullopt;
223     }
224 
225     if (h264_header.nalus_length == kMaxNalusPerPacket) {
226       RTC_LOG(LS_WARNING)
227           << "Received packet containing more than " << kMaxNalusPerPacket
228           << " NAL units. Will not keep track sps and pps ids for all of them.";
229     } else {
230       h264_header.nalus[h264_header.nalus_length++] = nalu;
231     }
232   }
233 
234   return parsed_payload;
235 }
236 
ParseFuaNalu(rtc::CopyOnWriteBuffer rtp_payload)237 absl::optional<VideoRtpDepacketizer::ParsedRtpPayload> ParseFuaNalu(
238     rtc::CopyOnWriteBuffer rtp_payload) {
239   if (rtp_payload.size() < kFuAHeaderSize) {
240     RTC_LOG(LS_ERROR) << "FU-A NAL units truncated.";
241     return absl::nullopt;
242   }
243   absl::optional<VideoRtpDepacketizer::ParsedRtpPayload> parsed_payload(
244       absl::in_place);
245   uint8_t fnri = rtp_payload.cdata()[0] & (kFBit | kNriMask);
246   uint8_t original_nal_type = rtp_payload.cdata()[1] & kTypeMask;
247   bool first_fragment = (rtp_payload.cdata()[1] & kSBit) > 0;
248   NaluInfo nalu;
249   nalu.type = original_nal_type;
250   nalu.sps_id = -1;
251   nalu.pps_id = -1;
252   if (first_fragment) {
253     absl::optional<uint32_t> pps_id =
254         PpsParser::ParsePpsIdFromSlice(rtp_payload.cdata() + 2 * kNalHeaderSize,
255                                        rtp_payload.size() - 2 * kNalHeaderSize);
256     if (pps_id) {
257       nalu.pps_id = *pps_id;
258     } else {
259       RTC_LOG(LS_WARNING)
260           << "Failed to parse PPS from first fragment of FU-A NAL "
261              "unit with original type: "
262           << static_cast<int>(nalu.type);
263     }
264     uint8_t original_nal_header = fnri | original_nal_type;
265     rtp_payload =
266         rtp_payload.Slice(kNalHeaderSize, rtp_payload.size() - kNalHeaderSize);
267     rtp_payload[0] = original_nal_header;
268     parsed_payload->video_payload = std::move(rtp_payload);
269   } else {
270     parsed_payload->video_payload =
271         rtp_payload.Slice(kFuAHeaderSize, rtp_payload.size() - kFuAHeaderSize);
272   }
273 
274   if (original_nal_type == H264::NaluType::kIdr) {
275     parsed_payload->video_header.frame_type = VideoFrameType::kVideoFrameKey;
276   } else {
277     parsed_payload->video_header.frame_type = VideoFrameType::kVideoFrameDelta;
278   }
279   parsed_payload->video_header.width = 0;
280   parsed_payload->video_header.height = 0;
281   parsed_payload->video_header.codec = kVideoCodecH264;
282   parsed_payload->video_header.simulcastIdx = 0;
283   parsed_payload->video_header.is_first_packet_in_frame = first_fragment;
284   auto& h264_header = parsed_payload->video_header.video_type_header
285                           .emplace<RTPVideoHeaderH264>();
286   h264_header.packetization_type = kH264FuA;
287   h264_header.nalu_type = original_nal_type;
288   if (first_fragment) {
289     h264_header.nalus[h264_header.nalus_length] = nalu;
290     h264_header.nalus_length = 1;
291   }
292   return parsed_payload;
293 }
294 
295 }  // namespace
296 
297 absl::optional<VideoRtpDepacketizer::ParsedRtpPayload>
Parse(rtc::CopyOnWriteBuffer rtp_payload)298 VideoRtpDepacketizerH264::Parse(rtc::CopyOnWriteBuffer rtp_payload) {
299   if (rtp_payload.size() == 0) {
300     RTC_LOG(LS_ERROR) << "Empty payload.";
301     return absl::nullopt;
302   }
303 
304   uint8_t nal_type = rtp_payload.cdata()[0] & kTypeMask;
305 
306   if (nal_type == H264::NaluType::kFuA) {
307     // Fragmented NAL units (FU-A).
308     return ParseFuaNalu(std::move(rtp_payload));
309   } else {
310     // We handle STAP-A and single NALU's the same way here. The jitter buffer
311     // will depacketize the STAP-A into NAL units later.
312     return ProcessStapAOrSingleNalu(std::move(rtp_payload));
313   }
314 }
315 
316 }  // namespace webrtc
317