/*
 *  Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11 #include "modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.h"
12
13 #include <string.h>
14
15 #include "api/video/video_codec_constants.h"
16 #include "modules/rtp_rtcp/source/rtp_packet_to_send.h"
17 #include "modules/video_coding/codecs/interface/common_constants.h"
18 #include "rtc_base/bit_buffer.h"
19 #include "rtc_base/checks.h"
20 #include "rtc_base/logging.h"
21
// Evaluates `x` exactly once and makes the enclosing function return `false`
// when the result converts to false.
//
// The body is wrapped in `do { ... } while (0)` so that the macro, together
// with the semicolon written at the call site, forms exactly one statement.
// That keeps it safe as the unbraced body of an `if`/`else` (a bare
// `if (...) { ... }` expansion would either fail to compile there or capture
// the following `else`).
#define RETURN_FALSE_ON_ERROR(x) \
  do {                           \
    if (!(x)) {                  \
      return false;              \
    }                            \
  } while (0)
26
27 namespace webrtc {
28 namespace {
29
// Sentinel returned by VideoRtpDepacketizerVp9::ParseRtpPayload() on failure.
// Zero is unambiguous because a successful parse always consumes at least the
// mandatory first descriptor byte, so a valid payload offset is never 0.
constexpr int kFailedToParse = 0;
31
32 // Picture ID:
33 //
34 // +-+-+-+-+-+-+-+-+
35 // I: |M| PICTURE ID | M:0 => picture id is 7 bits.
36 // +-+-+-+-+-+-+-+-+ M:1 => picture id is 15 bits.
37 // M: | EXTENDED PID |
38 // +-+-+-+-+-+-+-+-+
39 //
ParsePictureId(rtc::BitBuffer * parser,RTPVideoHeaderVP9 * vp9)40 bool ParsePictureId(rtc::BitBuffer* parser, RTPVideoHeaderVP9* vp9) {
41 uint32_t picture_id;
42 uint32_t m_bit;
43 RETURN_FALSE_ON_ERROR(parser->ReadBits(&m_bit, 1));
44 if (m_bit) {
45 RETURN_FALSE_ON_ERROR(parser->ReadBits(&picture_id, 15));
46 vp9->max_picture_id = kMaxTwoBytePictureId;
47 } else {
48 RETURN_FALSE_ON_ERROR(parser->ReadBits(&picture_id, 7));
49 vp9->max_picture_id = kMaxOneBytePictureId;
50 }
51 vp9->picture_id = picture_id;
52 return true;
53 }
54
55 // Layer indices (flexible mode):
56 //
57 // +-+-+-+-+-+-+-+-+
58 // L: | T |U| S |D|
59 // +-+-+-+-+-+-+-+-+
60 //
ParseLayerInfoCommon(rtc::BitBuffer * parser,RTPVideoHeaderVP9 * vp9)61 bool ParseLayerInfoCommon(rtc::BitBuffer* parser, RTPVideoHeaderVP9* vp9) {
62 uint32_t t, u_bit, s, d_bit;
63 RETURN_FALSE_ON_ERROR(parser->ReadBits(&t, 3));
64 RETURN_FALSE_ON_ERROR(parser->ReadBits(&u_bit, 1));
65 RETURN_FALSE_ON_ERROR(parser->ReadBits(&s, 3));
66 RETURN_FALSE_ON_ERROR(parser->ReadBits(&d_bit, 1));
67 vp9->temporal_idx = t;
68 vp9->temporal_up_switch = u_bit ? true : false;
69 if (s >= kMaxSpatialLayers)
70 return false;
71 vp9->spatial_idx = s;
72 vp9->inter_layer_predicted = d_bit ? true : false;
73 return true;
74 }
75
76 // Layer indices (non-flexible mode):
77 //
78 // +-+-+-+-+-+-+-+-+
79 // L: | T |U| S |D|
80 // +-+-+-+-+-+-+-+-+
81 // | TL0PICIDX |
82 // +-+-+-+-+-+-+-+-+
83 //
ParseLayerInfoNonFlexibleMode(rtc::BitBuffer * parser,RTPVideoHeaderVP9 * vp9)84 bool ParseLayerInfoNonFlexibleMode(rtc::BitBuffer* parser,
85 RTPVideoHeaderVP9* vp9) {
86 uint8_t tl0picidx;
87 RETURN_FALSE_ON_ERROR(parser->ReadUInt8(&tl0picidx));
88 vp9->tl0_pic_idx = tl0picidx;
89 return true;
90 }
91
ParseLayerInfo(rtc::BitBuffer * parser,RTPVideoHeaderVP9 * vp9)92 bool ParseLayerInfo(rtc::BitBuffer* parser, RTPVideoHeaderVP9* vp9) {
93 if (!ParseLayerInfoCommon(parser, vp9))
94 return false;
95
96 if (vp9->flexible_mode)
97 return true;
98
99 return ParseLayerInfoNonFlexibleMode(parser, vp9);
100 }
101
102 // Reference indices:
103 //
104 // +-+-+-+-+-+-+-+-+ P=1,F=1: At least one reference index
105 // P,F: | P_DIFF |N| up to 3 times has to be specified.
106 // +-+-+-+-+-+-+-+-+ N=1: An additional P_DIFF follows
107 // current P_DIFF.
108 //
ParseRefIndices(rtc::BitBuffer * parser,RTPVideoHeaderVP9 * vp9)109 bool ParseRefIndices(rtc::BitBuffer* parser, RTPVideoHeaderVP9* vp9) {
110 if (vp9->picture_id == kNoPictureId)
111 return false;
112
113 vp9->num_ref_pics = 0;
114 uint32_t n_bit;
115 do {
116 if (vp9->num_ref_pics == kMaxVp9RefPics)
117 return false;
118
119 uint32_t p_diff;
120 RETURN_FALSE_ON_ERROR(parser->ReadBits(&p_diff, 7));
121 RETURN_FALSE_ON_ERROR(parser->ReadBits(&n_bit, 1));
122
123 vp9->pid_diff[vp9->num_ref_pics] = p_diff;
124 uint32_t scaled_pid = vp9->picture_id;
125 if (p_diff > scaled_pid) {
126 // TODO(asapersson): Max should correspond to the picture id of last wrap.
127 scaled_pid += vp9->max_picture_id + 1;
128 }
129 vp9->ref_picture_id[vp9->num_ref_pics++] = scaled_pid - p_diff;
130 } while (n_bit);
131
132 return true;
133 }
134
135 // Scalability structure (SS).
136 //
137 // +-+-+-+-+-+-+-+-+
138 // V: | N_S |Y|G|-|-|-|
139 // +-+-+-+-+-+-+-+-+ -|
140 // Y: | WIDTH | (OPTIONAL) .
141 // + + .
142 // | | (OPTIONAL) .
143 // +-+-+-+-+-+-+-+-+ . N_S + 1 times
144 // | HEIGHT | (OPTIONAL) .
145 // + + .
146 // | | (OPTIONAL) .
147 // +-+-+-+-+-+-+-+-+ -|
148 // G: | N_G | (OPTIONAL)
149 // +-+-+-+-+-+-+-+-+ -|
150 // N_G: | T |U| R |-|-| (OPTIONAL) .
151 // +-+-+-+-+-+-+-+-+ -| . N_G times
152 // | P_DIFF | (OPTIONAL) . R times .
153 // +-+-+-+-+-+-+-+-+ -| -|
154 //
ParseSsData(rtc::BitBuffer * parser,RTPVideoHeaderVP9 * vp9)155 bool ParseSsData(rtc::BitBuffer* parser, RTPVideoHeaderVP9* vp9) {
156 uint32_t n_s, y_bit, g_bit;
157 RETURN_FALSE_ON_ERROR(parser->ReadBits(&n_s, 3));
158 RETURN_FALSE_ON_ERROR(parser->ReadBits(&y_bit, 1));
159 RETURN_FALSE_ON_ERROR(parser->ReadBits(&g_bit, 1));
160 RETURN_FALSE_ON_ERROR(parser->ConsumeBits(3));
161 vp9->num_spatial_layers = n_s + 1;
162 vp9->spatial_layer_resolution_present = y_bit ? true : false;
163 vp9->gof.num_frames_in_gof = 0;
164
165 if (y_bit) {
166 for (size_t i = 0; i < vp9->num_spatial_layers; ++i) {
167 RETURN_FALSE_ON_ERROR(parser->ReadUInt16(&vp9->width[i]));
168 RETURN_FALSE_ON_ERROR(parser->ReadUInt16(&vp9->height[i]));
169 }
170 }
171 if (g_bit) {
172 uint8_t n_g;
173 RETURN_FALSE_ON_ERROR(parser->ReadUInt8(&n_g));
174 vp9->gof.num_frames_in_gof = n_g;
175 }
176 for (size_t i = 0; i < vp9->gof.num_frames_in_gof; ++i) {
177 uint32_t t, u_bit, r;
178 RETURN_FALSE_ON_ERROR(parser->ReadBits(&t, 3));
179 RETURN_FALSE_ON_ERROR(parser->ReadBits(&u_bit, 1));
180 RETURN_FALSE_ON_ERROR(parser->ReadBits(&r, 2));
181 RETURN_FALSE_ON_ERROR(parser->ConsumeBits(2));
182 vp9->gof.temporal_idx[i] = t;
183 vp9->gof.temporal_up_switch[i] = u_bit ? true : false;
184 vp9->gof.num_ref_pics[i] = r;
185
186 for (uint8_t p = 0; p < vp9->gof.num_ref_pics[i]; ++p) {
187 uint8_t p_diff;
188 RETURN_FALSE_ON_ERROR(parser->ReadUInt8(&p_diff));
189 vp9->gof.pid_diff[i][p] = p_diff;
190 }
191 }
192 return true;
193 }
194 } // namespace
195
196 absl::optional<VideoRtpDepacketizer::ParsedRtpPayload>
Parse(rtc::CopyOnWriteBuffer rtp_payload)197 VideoRtpDepacketizerVp9::Parse(rtc::CopyOnWriteBuffer rtp_payload) {
198 rtc::ArrayView<const uint8_t> payload(rtp_payload.cdata(),
199 rtp_payload.size());
200 absl::optional<ParsedRtpPayload> result(absl::in_place);
201 int offset = ParseRtpPayload(payload, &result->video_header);
202 if (offset == kFailedToParse)
203 return absl::nullopt;
204 RTC_DCHECK_LT(offset, rtp_payload.size());
205 result->video_payload =
206 rtp_payload.Slice(offset, rtp_payload.size() - offset);
207 return result;
208 }
209
ParseRtpPayload(rtc::ArrayView<const uint8_t> rtp_payload,RTPVideoHeader * video_header)210 int VideoRtpDepacketizerVp9::ParseRtpPayload(
211 rtc::ArrayView<const uint8_t> rtp_payload,
212 RTPVideoHeader* video_header) {
213 RTC_DCHECK(video_header);
214 // Parse mandatory first byte of payload descriptor.
215 rtc::BitBuffer parser(rtp_payload.data(), rtp_payload.size());
216 uint8_t first_byte;
217 if (!parser.ReadUInt8(&first_byte)) {
218 RTC_LOG(LS_ERROR) << "Payload length is zero.";
219 return kFailedToParse;
220 }
221 bool i_bit = first_byte & 0b1000'0000; // PictureId present .
222 bool p_bit = first_byte & 0b0100'0000; // Inter-picture predicted.
223 bool l_bit = first_byte & 0b0010'0000; // Layer indices present.
224 bool f_bit = first_byte & 0b0001'0000; // Flexible mode.
225 bool b_bit = first_byte & 0b0000'1000; // Begins frame flag.
226 bool e_bit = first_byte & 0b0000'0100; // Ends frame flag.
227 bool v_bit = first_byte & 0b0000'0010; // Scalability structure present.
228 bool z_bit = first_byte & 0b0000'0001; // Not used for inter-layer prediction
229
230 // Parsed payload.
231 video_header->width = 0;
232 video_header->height = 0;
233 video_header->simulcastIdx = 0;
234 video_header->codec = kVideoCodecVP9;
235
236 video_header->frame_type =
237 p_bit ? VideoFrameType::kVideoFrameDelta : VideoFrameType::kVideoFrameKey;
238
239 auto& vp9_header =
240 video_header->video_type_header.emplace<RTPVideoHeaderVP9>();
241 vp9_header.InitRTPVideoHeaderVP9();
242 vp9_header.inter_pic_predicted = p_bit;
243 vp9_header.flexible_mode = f_bit;
244 vp9_header.beginning_of_frame = b_bit;
245 vp9_header.end_of_frame = e_bit;
246 vp9_header.ss_data_available = v_bit;
247 vp9_header.non_ref_for_inter_layer_pred = z_bit;
248
249 // Parse fields that are present.
250 if (i_bit && !ParsePictureId(&parser, &vp9_header)) {
251 RTC_LOG(LS_ERROR) << "Failed parsing VP9 picture id.";
252 return kFailedToParse;
253 }
254 if (l_bit && !ParseLayerInfo(&parser, &vp9_header)) {
255 RTC_LOG(LS_ERROR) << "Failed parsing VP9 layer info.";
256 return kFailedToParse;
257 }
258 if (p_bit && f_bit && !ParseRefIndices(&parser, &vp9_header)) {
259 RTC_LOG(LS_ERROR) << "Failed parsing VP9 ref indices.";
260 return kFailedToParse;
261 }
262 if (v_bit) {
263 if (!ParseSsData(&parser, &vp9_header)) {
264 RTC_LOG(LS_ERROR) << "Failed parsing VP9 SS data.";
265 return kFailedToParse;
266 }
267 if (vp9_header.spatial_layer_resolution_present) {
268 // TODO(asapersson): Add support for spatial layers.
269 video_header->width = vp9_header.width[0];
270 video_header->height = vp9_header.height[0];
271 }
272 }
273 video_header->is_first_packet_in_frame =
274 b_bit && (!l_bit || !vp9_header.inter_layer_predicted);
275
276 size_t byte_offset;
277 size_t bit_offset;
278 parser.GetCurrentOffset(&byte_offset, &bit_offset);
279 RTC_DCHECK_EQ(bit_offset, 0);
280 if (byte_offset == rtp_payload.size()) {
281 // Empty vp9 payload data.
282 return kFailedToParse;
283 }
284
285 return byte_offset;
286 }
287 } // namespace webrtc
288