/*
 *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VP9_SIMPLE_ENCODE_H_
#define VPX_VP9_SIMPLE_ENCODE_H_

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>

namespace vp9 {

// Status returned by the SimpleEncode APIs that report success or failure
// (SetEncodeConfig() and DumpEncodeConfigs()).
enum StatusCode {
  StatusOk = 0,
  StatusError,
};

// TODO(angiebird): Add description for each frame type.
enum FrameType {
  kFrameTypeKey = 0,
  kFrameTypeInter = 1,
  kFrameTypeAltRef = 2,
  kFrameTypeOverlay = 3,
  kFrameTypeGolden = 4,
};

// TODO(angiebird): Add description for each reference frame type.
// The enumerator values have to be contiguous and start from zero, except for
// kRefFrameTypeNone. kRefFrameTypeMax is used as the array size of the
// per-reference-type tables in RefFrameInfo.
enum RefFrameType {
  kRefFrameTypeLast = 0,
  kRefFrameTypePast = 1,
  kRefFrameTypeFuture = 2,
  kRefFrameTypeMax = 3,
  kRefFrameTypeNone = -1,
};

// Bit flags stored in each entry of the group of pictures map, see
// SimpleEncode::SetExternalGroupOfPicturesMap().
enum GopMapFlag {
  kGopMapFlagStart =
      1 << 0,  // Indicate this location is the start of a group of pictures.
  kGopMapFlagUseAltRef =
      1 << 1,  // Indicate this group of pictures will use an alt ref. Only set
               // this flag when kGopMapFlagStart is set.
};

// The frame is split to 4x4 blocks.
// This structure contains the information of each 4x4 block.
struct PartitionInfo {
  int row;           // row pixel offset of current 4x4 block
  int column;        // column pixel offset of current 4x4 block
  int row_start;     // row pixel offset of the start of the prediction block
  int column_start;  // column pixel offset of the start of the prediction block
  int width;         // prediction block width
  int height;        // prediction block height
};

constexpr int kMotionVectorSubPixelPrecision = 8;
constexpr int kMotionVectorFullPixelPrecision = 1;

// In the first pass. The frame is split to 16x16 blocks.
// This structure contains the information of each 16x16 block.
// In the second pass. The frame is split to 4x4 blocks.
// This structure contains the information of each 4x4 block.
struct MotionVectorInfo {
  // Number of valid motion vectors, always 0 if this block is in the key frame.
  // For inter frames, it could be 1 or 2.
  int mv_count;
  // The reference frame for motion vectors. If the second motion vector does
  // not exist (mv_count = 1), the reference frame is kRefFrameTypeNone.
  // Otherwise, the reference frame is either kRefFrameTypeLast, or
  // kRefFrameTypePast, or kRefFrameTypeFuture.
  RefFrameType ref_frame[2];
  // The row offset of motion vectors in the unit of pixel.
  // If the second motion vector does not exist, the value is 0.
  double mv_row[2];
  // The column offset of motion vectors in the unit of pixel.
  // If the second motion vector does not exist, the value is 0.
  double mv_column[2];
};

struct RefFrameInfo {
  // One entry per reference frame type (indexed by RefFrameType).
  int coding_indexes[kRefFrameTypeMax];

  // Indicate whether the reference frames are available or not.
  // When the reference frame type is not valid, it means either the to-be-coded
  // frame is a key frame or the reference frame already appears in other
  // reference frame type. vp9 always keeps three types of reference frame
  // available. However, the duplicated reference frames will not be
  // chosen by the encoder. The priorities of choosing reference frames are
  // kRefFrameTypeLast > kRefFrameTypePast > kRefFrameTypeFuture.
  // For example, if kRefFrameTypeLast and kRefFrameTypePast both point to the
  // same frame, kRefFrameTypePast will be set to invalid.
  // 1: the ref frame type is available 0: the ref frame type is not available
  int valid_list[kRefFrameTypeMax];
};

bool operator==(const RefFrameInfo &a, const RefFrameInfo &b);

struct EncodeFrameInfo {
  int show_idx;

  // Each show or no show frame is assigned with a coding index based on its
  // coding order (starting from zero) in the coding process of the entire
  // video. The coding index for each frame is unique.
  int coding_index;
  RefFrameInfo ref_frame_info;
  FrameType frame_type;
};

// This structure is a copy of vp9 |nmv_component_counts|.
struct NewMotionvectorComponentCounts {
  std::vector<unsigned int> sign;
  std::vector<unsigned int> classes;
  std::vector<unsigned int> class0;
  std::vector<std::vector<unsigned int>> bits;
  std::vector<std::vector<unsigned int>> class0_fp;
  std::vector<unsigned int> fp;
  std::vector<unsigned int> class0_hp;
  std::vector<unsigned int> hp;
};

// This structure is a copy of vp9 |nmv_context_counts|.
struct NewMotionVectorContextCounts {
  std::vector<unsigned int> joints;
  std::vector<NewMotionvectorComponentCounts> comps;
};

using UintArray2D = std::vector<std::vector<unsigned int>>;
using UintArray3D = std::vector<std::vector<std::vector<unsigned int>>>;
using UintArray5D = std::vector<
    std::vector<std::vector<std::vector<std::vector<unsigned int>>>>>;
using UintArray6D = std::vector<std::vector<
    std::vector<std::vector<std::vector<std::vector<unsigned int>>>>>>;

// This structure is a copy of vp9 |tx_counts|.
struct TransformSizeCounts {
  // Transform size found in blocks of partition size 32x32.
  // First dimension: transform size contexts (2).
  // Second dimension: transform size type (3: 32x32, 16x16, 8x8)
  UintArray2D p32x32;
  // Transform size found in blocks of partition size 16x16.
  // First dimension: transform size contexts (2).
  // Second dimension: transform size type (2: 16x16, 8x8)
  UintArray2D p16x16;
  // Transform size found in blocks of partition size 8x8.
  // First dimension: transform size contexts (2).
  // Second dimension: transform size type (1: 8x8)
  UintArray2D p8x8;
  // Overall transform size count.
  std::vector<unsigned int> tx_totals;
};

// This structure is a copy of vp9 |FRAME_COUNTS|.
struct FrameCounts {
  // Intra prediction mode for luma plane. First dimension: block size (4).
  // Second dimension: intra prediction mode (10).
  UintArray2D y_mode;
  // Intra prediction mode for chroma plane. First and second dimension:
  // intra prediction mode (10).
  UintArray2D uv_mode;
  // Partition type. First dimension: partition contexts (16).
  // Second dimension: partition type (4).
  UintArray2D partition;
  // Transform coefficient.
  UintArray6D coef;
  // End of block (the position of the last non-zero transform coefficient)
  UintArray5D eob_branch;
  // Interpolation filter type. First dimension: switchable filter contexts (4).
  // Second dimension: filter types (3).
  UintArray2D switchable_interp;
  // Inter prediction mode (the motion vector type).
  // First dimension: inter mode contexts (7).
  // Second dimension: mode type (4).
  UintArray2D inter_mode;
  // Block is intra or inter predicted. First dimension: contexts (4).
  // Second dimension: type (0 for intra, 1 for inter).
  UintArray2D intra_inter;
  // Block is compound predicted (predicted from average of two blocks).
  // First dimension: contexts (5).
  // Second dimension: type (0 for single, 1 for compound prediction).
  UintArray2D comp_inter;
  // Type of the reference frame. Only one reference frame.
  // First dimension: context (5). Second dimension: context (2).
  // Third dimension: count (2).
  UintArray3D single_ref;
  // Type of the two reference frames.
  // First dimension: context (5). Second dimension: count (2).
  UintArray2D comp_ref;
  // Block skips transform and quantization, uses prediction as reconstruction.
  // First dimension: contexts (3). Second dimension: type (0 not skip, 1 skip).
  UintArray2D skip;
  // Transform size.
  TransformSizeCounts tx;
  // New motion vector.
  NewMotionVectorContextCounts mv;
};

struct ImageBuffer {
  // The image data is stored in raster order,
  // i.e. image[plane][r][c] =
  // plane_buffer[plane][r * plane_width[plane] + c].
  std::unique_ptr<unsigned char[]> plane_buffer[3];
  int plane_width[3];
  int plane_height[3];
};

void output_image_buffer(const ImageBuffer &image_buffer, std::FILE *out_file);

struct EncodeFrameResult {
  int show_idx;
  FrameType frame_type;
  int coding_idx;
  RefFrameInfo ref_frame_info;
  std::size_t coding_data_bit_size;
  std::size_t coding_data_byte_size;
  // The EncodeFrame will allocate a buffer, write the coding data into the
  // buffer and give the ownership of the buffer to coding_data.
  std::unique_ptr<unsigned char[]> coding_data;
  double psnr;
  std::uint64_t sse;
  int quantize_index;
  FrameCounts frame_counts;
  int num_rows_4x4;  // number of row units, in size of 4.
  int num_cols_4x4;  // number of column units, in size of 4.
  // A vector of the partition information of the frame.
  // The number of elements is |num_rows_4x4| * |num_cols_4x4|.
  // The frame is divided into 4x4 blocks of |num_rows_4x4| rows and
  // |num_cols_4x4| columns.
  // Each 4x4 block contains the current pixel position (|row|, |column|),
  // the start pixel position of the partition (|row_start|, |column_start|),
  // and the |width|, |height| of the partition.
  // The current pixel position can be the same as the start pixel position
  // if the 4x4 block is the top-left block in the partition. Otherwise, they
  // are different.
  // Within the same partition, all 4x4 blocks have the same |row_start|,
  // |column_start|, |width| and |height|.
  // For example, if the frame is partitioned to a 32x32 block,
  // starting at (0, 0). Then, there're 64 4x4 blocks within this partition.
  // They all have the same |row_start|, |column_start|, |width|, |height|,
  // which can be used to figure out the start of the current partition and
  // the start of the next partition block.
  // Horizontal next: |column_start| + |width|,
  // Vertical next: |row_start| + |height|.
  std::vector<PartitionInfo> partition_info;
  // A vector of the motion vector information of the frame.
  // The number of elements is |num_rows_4x4| * |num_cols_4x4|.
  // The frame is divided into 4x4 blocks of |num_rows_4x4| rows and
  // |num_cols_4x4| columns.
  // Each 4x4 block contains 0 motion vector if this is an intra predicted
  // frame (for example, the key frame). If the frame is inter predicted,
  // each 4x4 block contains either 1 or 2 motion vectors.
  // Similar to partition info, all 4x4 blocks inside the same partition block
  // share the same motion vector information.
  std::vector<MotionVectorInfo> motion_vector_info;
  ImageBuffer coded_frame;

  // recode_count, q_index_history and rate_history are only available when
  // EncodeFrameWithTargetFrameBits() is used.
  int recode_count;
  std::vector<int> q_index_history;
  std::vector<int> rate_history;
};

struct GroupOfPicture {
  // This list will be updated internally in StartEncode() and
  // EncodeFrame()/EncodeFrameWithQuantizeIndex().
  // In EncodeFrame()/EncodeFrameWithQuantizeIndex(), the update will only be
  // triggered when the coded frame is the last one in the previous group of
  // pictures.
  std::vector<EncodeFrameInfo> encode_frame_list;

  // Indicates the index of the next coding frame in encode_frame_list.
  // In other words, EncodeFrameInfo of the next coding frame can be
  // obtained with encode_frame_list[next_encode_frame_index].
  // Internally, next_encode_frame_index will be set to zero after the last
  // frame of the group of pictures is coded. Otherwise, next_encode_frame_index
  // will be increased after each EncodeFrame()/EncodeFrameWithQuantizeIndex()
  // call.
  int next_encode_frame_index;

  // Number of show frames in this group of pictures.
  int show_frame_count;

  // The show index/timestamp of the earliest show frame in the group of
  // pictures.
  int start_show_index;

  // The coding index of the first coding frame in the group of pictures.
  int start_coding_index;

  // Indicates whether this group of pictures starts with a key frame.
  int first_is_key_frame;

  // Indicates whether this group of pictures uses an alt ref.
  int use_alt_ref;

  // Indicates whether previous group of pictures used an alt ref.
  int last_gop_use_alt_ref;
};

class SimpleEncode {
 public:
  // When outfile_path is set, the encoder will output the bitstream in ivf
  // format.
  SimpleEncode(int frame_width, int frame_height, int frame_rate_num,
               int frame_rate_den, int target_bitrate, int num_frames,
               const char *infile_path, const char *outfile_path = nullptr);
  ~SimpleEncode();
  // Not copyable. The deleted copy constructor takes a const reference so
  // that it has the canonical signature and matches the copy assignment.
  SimpleEncode(const SimpleEncode &) = delete;
  SimpleEncode &operator=(const SimpleEncode &) = delete;

  // Adjusts the encoder's coding speed.
  // If this function is not called, the encoder will use default encode_speed
  // 0. Call this function before ComputeFirstPassStats() if needed.
  // The encode_speed is equivalent to --cpu-used of the vpxenc command.
  // The encode_speed's range should be [0, 9].
  // Setting the encode_speed to a higher level will yield faster coding
  // at the cost of lower compression efficiency.
  void SetEncodeSpeed(int encode_speed);

  // Set encoder config
  // The following configs in VP9EncoderConfig are allowed to change in this
  // function. See https://ffmpeg.org/ffmpeg-codecs.html#libvpx for each
  // config's meaning.
  // Configs in VP9EncoderConfig:          Equivalent configs in ffmpeg:
  // 1  key_freq                           -g
  // 2  two_pass_vbrmin_section            -minrate * 100LL / bit_rate
  // 3  two_pass_vbrmax_section            -maxrate * 100LL / bit_rate
  // 4  under_shoot_pct                    -undershoot-pct
  // 5  over_shoot_pct                     -overshoot-pct
  // 6  max_threads                        -threads
  // 7  frame_parallel_decoding_mode       -frame-parallel
  // 8  tile_column                        -tile-columns
  // 9  arnr_max_frames                    -arnr-maxframes
  // 10 arnr_strength                      -arnr-strength
  // 11 lag_in_frames                      -rc_lookahead
  // 12 encode_breakout                    -static-thresh
  // 13 enable_tpl_model                   -enable-tpl
  // 14 enable_auto_arf                    -auto-alt-ref
  StatusCode SetEncodeConfig(const char *name, const char *value);

  // A debug function that dumps configs from VP9EncoderConfig
  // pass = 1: first pass, pass = 2: second pass
  // fp: file pointer for dumping config
  StatusCode DumpEncodeConfigs(int pass, std::FILE *fp);

  // Makes encoder compute the first pass stats and store it at
  // impl_ptr_->first_pass_stats. key_frame_map_ is also computed based on the
  // first pass stats.
  void ComputeFirstPassStats();

  // Outputs the first pass stats represented by a 2-D vector.
  // One can use the frame index at first dimension to retrieve the stats for
  // each video frame. The stats of each video frame is a vector of 25 double
  // values. For details, please check FIRSTPASS_STATS in vp9_firstpass.h
  std::vector<std::vector<double>> ObserveFirstPassStats();

  // Outputs the first pass motion vectors represented by a 2-D vector.
  // One can use the frame index at first dimension to retrieve the mvs for
  // each video frame. The frame is divided into 16x16 blocks. The number of
  // elements is round_up(|num_rows_4x4| / 4) * round_up(|num_cols_4x4| / 4).
  std::vector<std::vector<MotionVectorInfo>> ObserveFirstPassMotionVectors();

  // Outputs a copy of key_frame_map_, a binary vector with size equal to the
  // number of show frames in the video. For each entry in the vector, 1
  // indicates the position is a key frame and 0 indicates it's not a key frame.
  // This function should be called after ComputeFirstPassStats()
  std::vector<int> ObserveKeyFrameMap() const;

  // Sets group of pictures map for coding the entire video.
  // Each entry in the gop_map corresponds to a show frame in the video.
  // Therefore, the size of gop_map should equal to the number of show frames in
  // the entire video.
  // If a given entry's kGopMapFlagStart is set, it means this is the start of a
  // gop. Once kGopMapFlagStart is set, one can set kGopMapFlagUseAltRef to
  // indicate whether this gop use altref.
  // If a given entry is zero, it means it's in the middle of a gop.
  // This function should be called only once after ComputeFirstPassStats(),
  // before StartEncode().
  // This API will check and modify the gop_map to satisfy the following
  // constraints.
  // 1) Each key frame position should be at the start of a gop.
  // 2) The last gop should not use an alt ref.
  void SetExternalGroupOfPicturesMap(int *gop_map, int gop_map_size);

  // Observe the group of pictures map set through
  // SetExternalGroupOfPicturesMap(). This function should be called after
  // SetExternalGroupOfPicturesMap().
  std::vector<int> ObserveExternalGroupOfPicturesMap();

  // Initializes the encoder for actual encoding.
  // This function should be called after ComputeFirstPassStats().
  void StartEncode();

  // Frees the encoder.
  // This function should be called after StartEncode() or EncodeFrame().
  void EndEncode();

  // The key frame group size includes one key frame plus the number of
  // following inter frames. Note that the key frame group size only counts the
  // show frames. The number of no show frames like alternate references are not
  // counted.
  int GetKeyFrameGroupSize() const;

  // Provides the group of pictures that the next coding frame is in.
  // Only call this function between StartEncode() and EndEncode()
  GroupOfPicture ObserveGroupOfPicture() const;

  // Gets encode_frame_info for the next coding frame.
  // Only call this function between StartEncode() and EndEncode()
  EncodeFrameInfo GetNextEncodeFrameInfo() const;

  // Encodes a frame
  // This function should be called after StartEncode() and before EndEncode().
  void EncodeFrame(EncodeFrameResult *encode_frame_result);

  // Encodes a frame with a specific quantize index.
  // This function should be called after StartEncode() and before EndEncode().
  void EncodeFrameWithQuantizeIndex(EncodeFrameResult *encode_frame_result,
                                    int quantize_index);

  // Encode a frame with target frame bits usage.
  // The encoder will find a quantize index to make the actual frame bits usage
  // match the target. EncodeFrameWithTargetFrameBits() will recode the frame
  // up to 7 times to find a q_index to make the actual_frame_bits satisfy the
  // following inequality.
  // |actual_frame_bits - target_frame_bits| * 100 / target_frame_bits
  // <= percent_diff.
  void EncodeFrameWithTargetFrameBits(EncodeFrameResult *encode_frame_result,
                                      int target_frame_bits,
                                      double percent_diff);

  // Gets the number of coding frames for the video. The coding frames include
  // show frame and no show frame.
  // This function should be called after ComputeFirstPassStats().
  int GetCodingFrameNum() const;

  // Gets the total number of pixels of YUV planes per frame.
  std::uint64_t GetFramePixelCount() const;

 private:
  // Compute the key frame locations of the video based on first pass stats.
  // The results are returned as a binary vector with 1s indicating keyframes
  // and 0s indicating non keyframes.
  // It has to be called after impl_ptr_->first_pass_stats is computed.
  std::vector<int> ComputeKeyFrameMap() const;

  // Updates key_frame_group_size_, reset key_frame_group_index_ and init
  // ref_frame_info_.
  void UpdateKeyFrameGroup(int key_frame_show_index);

  // Update key_frame_group_index_.
  void PostUpdateKeyFrameGroupIndex(FrameType frame_type);

  void PostUpdateState(const EncodeFrameResult &encode_frame_result);

  // Defined outside of this header; only used through impl_ptr_.
  class EncodeImpl;

  int frame_width_;   // frame width in pixels.
  int frame_height_;  // frame height in pixels.
  int frame_rate_num_;
  int frame_rate_den_;
  int target_bitrate_;
  int num_frames_;
  int encode_speed_;

  std::FILE *in_file_;
  std::FILE *out_file_;
  std::unique_ptr<EncodeImpl> impl_ptr_;

  std::vector<int> key_frame_map_;
  std::vector<int> gop_map_;
  GroupOfPicture group_of_picture_;

  // The key frame group size includes one key frame plus the number of
  // following inter frames. Note that the key frame group size only counts the
  // show frames. The number of no show frames like alternate references are not
  // counted.
  int key_frame_group_size_;

  // The index for the to-be-coded show frame in the key frame group.
  int key_frame_group_index_;

  // Each show or no show frame is assigned with a coding index based on its
  // coding order (starting from zero) in the coding process of the entire
  // video. The coding index of the to-be-coded frame.
  int frame_coding_index_;

  // Number of show frames we have coded so far.
  int show_frame_count_;

  // TODO(angiebird): Do we need to reset ref_frame_info_ when the next key
  // frame appears?
  // Reference frames info of the to-be-coded frame.
  RefFrameInfo ref_frame_info_;

  // A 2-D vector of motion vector information of the frame collected
  // from the first pass. The first dimension is the frame index.
  // Each frame is divided into 16x16 blocks. The number of elements is
  // round_up(|num_rows_4x4| / 4) * round_up(|num_cols_4x4| / 4).
  // Each 16x16 block contains 0 motion vector if this is an intra predicted
  // frame (for example, the key frame). If the frame is inter predicted,
  // each 16x16 block contains either 1 or 2 motion vectors.
  // The first motion vector is always from the LAST_FRAME.
  // The second motion vector is always from the GOLDEN_FRAME.
  std::vector<std::vector<MotionVectorInfo>> fp_motion_vector_info_;
};

}  // namespace vp9

#endif  // VPX_VP9_SIMPLE_ENCODE_H_