1 /*
2  *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef VPX_VP9_SIMPLE_ENCODE_H_
12 #define VPX_VP9_SIMPLE_ENCODE_H_
13 
14 #include <cstddef>
15 #include <cstdint>
16 #include <cstdio>
17 #include <memory>
18 #include <vector>
19 
20 namespace vp9 {
21 
// Status values returned by SimpleEncode::SetEncodeConfig() and
// SimpleEncode::DumpEncodeConfigs().
enum StatusCode {
  StatusOk = 0,
  StatusError = 1,
};
26 
// Type of a coded frame, as reported in EncodeFrameInfo::frame_type and
// EncodeFrameResult::frame_type.
// TODO(angiebird): Add description for each frame type.
enum FrameType {
  kFrameTypeKey = 0,
  kFrameTypeInter = 1,
  kFrameTypeAltRef = 2,
  kFrameTypeOverlay = 3,
  kFrameTypeGolden = 4,
};
35 
// Identifies a reference frame slot. Used by MotionVectorInfo::ref_frame and
// to index the arrays in RefFrameInfo.
// TODO(angiebird): Add description for each reference frame type.
// Apart from kRefFrameTypeNone, the enumerator values must be contiguous and
// start from zero, because kRefFrameTypeMax is used as the array size in
// RefFrameInfo.
enum RefFrameType {
  kRefFrameTypeNone = -1,
  kRefFrameTypeLast = 0,
  kRefFrameTypePast = 1,
  kRefFrameTypeFuture = 2,
  kRefFrameTypeMax = 3,
};
46 
// Bit flags stored in each entry of the gop map that is passed to
// SimpleEncode::SetExternalGroupOfPicturesMap().
enum GopMapFlag {
  // Bit 0: indicate this location is the start of a group of pictures.
  kGopMapFlagStart = 0x1,
  // Bit 1: indicate this group of pictures will use an alt ref. Only set this
  // flag when kGopMapFlagStart is set.
  kGopMapFlagUseAltRef = 0x2,
};
54 
// Partition information of a single 4x4 block.
// The frame is split into 4x4 blocks and one PartitionInfo describes each of
// them (see EncodeFrameResult::partition_info). All offsets and sizes are
// expressed in pixels.
struct PartitionInfo {
  int row;           // row pixel offset of current 4x4 block
  int column;        // column pixel offset of current 4x4 block
  int row_start;     // row pixel offset of the start of the prediction block
  int column_start;  // column pixel offset of the start of the prediction block
  int width;         // prediction block width
  int height;        // prediction block height
};
65 
// Motion vector precision constants.
// NOTE(review): presumably the divisors used to convert raw encoder motion
// vectors into the pixel units stored in MotionVectorInfo (sub-pixel for the
// second pass, full-pixel for the first pass) -- confirm against the
// implementation.
constexpr int kMotionVectorSubPixelPrecision{ 8 };
constexpr int kMotionVectorFullPixelPrecision{ 1 };
68 
// Motion vector information of a single block.
// In the first pass, the frame is split into 16x16 blocks and this structure
// contains the information of each 16x16 block.
// In the second pass, the frame is split into 4x4 blocks and this structure
// contains the information of each 4x4 block.
struct MotionVectorInfo {
  // Number of valid motion vectors, always 0 if this block is in the key frame.
  // For inter frames, it could be 1 or 2.
  int mv_count;
  // The reference frame for motion vectors. If the second motion vector does
  // not exist (mv_count = 1), the reference frame is kRefFrameTypeNone.
  // Otherwise, the reference frame is either kRefFrameTypeLast, or
  // kRefFrameTypePast, or kRefFrameTypeFuture.
  RefFrameType ref_frame[2];
  // The row offset of motion vectors in the unit of pixel.
  // If the second motion vector does not exist, the value is 0.
  double mv_row[2];
  // The column offset of motion vectors in the unit of pixel.
  // If the second motion vector does not exist, the value is 0.
  double mv_column[2];
};
89 
// Describes the reference frames of a to-be-coded frame. Both arrays have one
// entry per reference frame type (kRefFrameTypeLast, kRefFrameTypePast,
// kRefFrameTypeFuture).
struct RefFrameInfo {
  // NOTE(review): presumably the coding index of the frame held by each
  // reference frame type -- confirm against the implementation.
  int coding_indexes[kRefFrameTypeMax];

  // Indicate whether the reference frames are available or not.
  // When the reference frame type is not valid, it means either the to-be-coded
  // frame is a key frame or the reference frame already appears in other
  // reference frame type. vp9 always keeps three types of reference frame
  // available.  However, the duplicated reference frames will not be
  // chosen by the encoder. The priorities of choosing reference frames are
  // kRefFrameTypeLast > kRefFrameTypePast > kRefFrameTypeFuture.
  // For example, if kRefFrameTypeLast and kRefFrameTypePast both point to the
  // same frame, kRefFrameTypePast will be set to invalid.
  // 1: the ref frame type is available 0: the ref frame type is not available
  int valid_list[kRefFrameTypeMax];
};
105 
// Equality comparison for RefFrameInfo. Only declared here; the definition
// lives outside this header.
// NOTE(review): presumably compares coding_indexes and valid_list element by
// element -- confirm against the definition.
bool operator==(const RefFrameInfo &a, const RefFrameInfo &b);
107 
// Information about a frame in the coding order of a group of pictures; see
// GroupOfPicture::encode_frame_list and SimpleEncode::GetNextEncodeFrameInfo().
struct EncodeFrameInfo {
  // NOTE(review): presumably the index of this frame in show (display)
  // order -- confirm against the implementation.
  int show_idx;

  // Each show or no show frame is assigned with a coding index based on its
  // coding order (starting from zero) in the coding process of the entire
  // video. The coding index for each frame is unique.
  int coding_index;
  // Reference frames of this frame.
  RefFrameInfo ref_frame_info;
  // Type of this frame.
  FrameType frame_type;
};
118 
// This structure is a copy of vp9 |nmv_component_counts|.
// Every member is a (possibly nested) vector of unsigned counters. The vector
// sizes are not fixed by this header; they are set by the code that fills in
// the structure.
struct NewMotionvectorComponentCounts {
  std::vector<unsigned int> sign;
  std::vector<unsigned int> classes;
  std::vector<unsigned int> class0;
  std::vector<std::vector<unsigned int>> bits;
  std::vector<std::vector<unsigned int>> class0_fp;
  std::vector<unsigned int> fp;
  std::vector<unsigned int> class0_hp;
  std::vector<unsigned int> hp;
};
130 
// This structure is a copy of vp9 |nmv_context_counts|.
struct NewMotionVectorContextCounts {
  // Counters for the motion vector joint types.
  std::vector<unsigned int> joints;
  // Per-component counters.
  // NOTE(review): presumably one entry per motion vector component (row and
  // column) -- confirm against the code that fills in this structure.
  std::vector<NewMotionvectorComponentCounts> comps;
};
136 
// Nested std::vector aliases used for the multi-dimensional counter tables
// below. Each higher-rank alias wraps the lower-rank ones in additional
// std::vector levels.
using UintArray2D = std::vector<std::vector<unsigned int>>;
using UintArray3D = std::vector<UintArray2D>;
using UintArray5D = std::vector<std::vector<UintArray3D>>;
using UintArray6D = std::vector<UintArray5D>;
143 
// This structure is a copy of vp9 |tx_counts|.
// NOTE(review): the dimension sizes quoted below do not list the 4x4
// transform size; verify them against the vp9 |tx_counts| definition.
struct TransformSizeCounts {
  // Transform size found in blocks of partition size 32x32.
  // First dimension: transform size contexts (2).
  // Second dimension: transform size type (3: 32x32, 16x16, 8x8)
  UintArray2D p32x32;
  // Transform size found in blocks of partition size 16x16.
  // First dimension: transform size contexts (2).
  // Second dimension: transform size type (2: 16x16, 8x8)
  UintArray2D p16x16;
  // Transform size found in blocks of partition size 8x8.
  // First dimension: transform size contexts (2).
  // Second dimension: transform size type (1: 8x8)
  UintArray2D p8x8;
  // Overall transform size count.
  std::vector<unsigned int> tx_totals;
};
161 
// This structure is a copy of vp9 |FRAME_COUNTS|.
// It is reported per coded frame through EncodeFrameResult::frame_counts.
// The numbers in parentheses are the expected sizes of each dimension.
struct FrameCounts {
  // Intra prediction mode for luma plane. First dimension: block size (4).
  // Second dimension: intra prediction mode (10).
  UintArray2D y_mode;
  // Intra prediction mode for chroma plane. First and second dimension:
  // intra prediction mode (10).
  UintArray2D uv_mode;
  // Partition type. First dimension: partition contexts (16).
  // Second dimension: partition type (4).
  UintArray2D partition;
  // Transform coefficient.
  UintArray6D coef;
  // End of block (the position of the last non-zero transform coefficient)
  UintArray5D eob_branch;
  // Interpolation filter type. First dimension: switchable filter contexts (4).
  // Second dimension: filter types (3).
  UintArray2D switchable_interp;
  // Inter prediction mode (the motion vector type).
  // First dimension: inter mode contexts (7).
  // Second dimension: mode type (4).
  UintArray2D inter_mode;
  // Block is intra or inter predicted. First dimension: contexts (4).
  // Second dimension: type (0 for intra, 1 for inter).
  UintArray2D intra_inter;
  // Block is compound predicted (predicted from average of two blocks).
  // First dimension: contexts (5).
  // Second dimension: type (0 for single, 1 for compound prediction).
  UintArray2D comp_inter;
  // Type of the reference frame. Only one reference frame.
  // First dimension: context (5). Second dimension: context (2).
  // Third dimension: count (2).
  UintArray3D single_ref;
  // Type of the two reference frames.
  // First dimension: context (5). Second dimension: count (2).
  UintArray2D comp_ref;
  // Block skips transform and quantization, uses prediction as reconstruction.
  // First dimension: contexts (3). Second dimension: type (0 not skip, 1 skip).
  UintArray2D skip;
  // Transform size.
  TransformSizeCounts tx;
  // New motion vector.
  NewMotionVectorContextCounts mv;
};
206 
// A three-plane image. Each plane owns its own pixel buffer and has its own
// width and height.
struct ImageBuffer {
  // The image data is stored in raster order,
  // i.e. image[plane][r][c] =
  // plane_buffer[plane][r * plane_width[plane] + c].
  std::unique_ptr<unsigned char[]> plane_buffer[3];
  // Width of each plane.
  int plane_width[3];
  // Height of each plane.
  int plane_height[3];
};
215 
// Outputs the contents of |image_buffer| to |out_file|. Only declared here;
// the definition lives outside this header.
// NOTE(review): the exact output layout (plane order, padding) is not visible
// from this header -- confirm against the definition.
void output_image_buffer(const ImageBuffer &image_buffer, std::FILE *out_file);
217 
// Result of coding one frame. Filled in by SimpleEncode::EncodeFrame(),
// SimpleEncode::EncodeFrameWithQuantizeIndex() and
// SimpleEncode::EncodeFrameWithTargetFrameBits().
struct EncodeFrameResult {
  // NOTE(review): show_idx, frame_type, coding_idx and ref_frame_info
  // presumably mirror the EncodeFrameInfo of the coded frame -- confirm.
  int show_idx;
  FrameType frame_type;
  int coding_idx;
  RefFrameInfo ref_frame_info;
  // Size of the coding data, expressed in bits and in bytes respectively.
  size_t coding_data_bit_size;
  size_t coding_data_byte_size;
  // The EncodeFrame will allocate a buffer, write the coding data into the
  // buffer and give the ownership of the buffer to coding_data.
  std::unique_ptr<unsigned char[]> coding_data;
  // Quality metrics of the coded frame (peak signal-to-noise ratio and sum of
  // squared errors).
  double psnr;
  uint64_t sse;
  // Quantize index associated with this coded frame.
  int quantize_index;
  FrameCounts frame_counts;
  int num_rows_4x4;  // number of row units, in size of 4.
  int num_cols_4x4;  // number of column units, in size of 4.
  // A vector of the partition information of the frame.
  // The number of elements is |num_rows_4x4| * |num_cols_4x4|.
  // The frame is divided into 4x4 blocks of |num_rows_4x4| rows and
  // |num_cols_4x4| columns.
  // Each 4x4 block contains the current pixel position (|row|, |column|),
  // the start pixel position of the partition (|row_start|, |column_start|),
  // and the |width|, |height| of the partition.
  // The current pixel position can be the same as the start pixel position
  // if the 4x4 block is the top-left block in the partition. Otherwise, they
  // are different.
  // Within the same partition, all 4x4 blocks have the same |row_start|,
  // |column_start|, |width| and |height|.
  // For example, if the frame is partitioned into a 32x32 block starting at
  // (0, 0), then there are 64 4x4 blocks within this partition.
  // They all have the same |row_start|, |column_start|, |width|, |height|,
  // which can be used to figure out the start of the current partition and
  // the start of the next partition block.
  // Horizontal next: |column_start| + |width|,
  // Vertical next: |row_start| + |height|.
  std::vector<PartitionInfo> partition_info;
  // A vector of the motion vector information of the frame.
  // The number of elements is |num_rows_4x4| * |num_cols_4x4|.
  // The frame is divided into 4x4 blocks of |num_rows_4x4| rows and
  // |num_cols_4x4| columns.
  // Each 4x4 block contains 0 motion vector if this is an intra predicted
  // frame (for example, the key frame). If the frame is inter predicted,
  // each 4x4 block contains either 1 or 2 motion vectors.
  // Similar to partition info, all 4x4 blocks inside the same partition block
  // share the same motion vector information.
  std::vector<MotionVectorInfo> motion_vector_info;
  // Image buffer holding the coded frame.
  ImageBuffer coded_frame;

  // recode_count, q_index_history and rate_history are only available when
  // EncodeFrameWithTargetFrameBits() is used.
  int recode_count;
  std::vector<int> q_index_history;
  std::vector<int> rate_history;
};
272 
// Describes one group of pictures (gop) and the coding progress inside it.
// Returned by SimpleEncode::ObserveGroupOfPicture().
struct GroupOfPicture {
  // This list will be updated internally in StartEncode() and
  // EncodeFrame()/EncodeFrameWithQuantizeIndex().
  // In EncodeFrame()/EncodeFrameWithQuantizeIndex(), the update will only be
  // triggered when the coded frame is the last one in the previous group of
  // pictures.
  std::vector<EncodeFrameInfo> encode_frame_list;

  // Indicates the index of the next coding frame in encode_frame_list.
  // In other words, EncodeFrameInfo of the next coding frame can be
  // obtained with encode_frame_list[next_encode_frame_index].
  // Internally, next_encode_frame_index will be set to zero after the last
  // frame of the group of pictures is coded. Otherwise, next_encode_frame_index
  // will be increased after each EncodeFrame()/EncodeFrameWithQuantizeIndex()
  // call.
  int next_encode_frame_index;

  // Number of show frames in this group of pictures.
  int show_frame_count;

  // The show index/timestamp of the earliest show frame in the group of
  // pictures.
  int start_show_index;

  // The coding index of the first coding frame in the group of pictures.
  int start_coding_index;

  // Indicates whether this group of pictures starts with a key frame.
  // Stored as an int flag.
  int first_is_key_frame;

  // Indicates whether this group of pictures uses an alt ref.
  // Stored as an int flag.
  int use_alt_ref;

  // Indicates whether previous group of pictures used an alt ref.
  // Stored as an int flag.
  int last_gop_use_alt_ref;
};
309 
310 class SimpleEncode {
311  public:
312   // When outfile_path is set, the encoder will output the bitstream in ivf
313   // format.
314   SimpleEncode(int frame_width, int frame_height, int frame_rate_num,
315                int frame_rate_den, int target_bitrate, int num_frames,
316                const char *infile_path, const char *outfile_path = nullptr);
317   ~SimpleEncode();
318   SimpleEncode(SimpleEncode &) = delete;
319   SimpleEncode &operator=(const SimpleEncode &) = delete;
320 
321   // Adjusts the encoder's coding speed.
322   // If this function is not called, the encoder will use default encode_speed
323   // 0. Call this function before ComputeFirstPassStats() if needed.
324   // The encode_speed is equivalent to --cpu-used of the vpxenc command.
325   // The encode_speed's range should be [0, 9].
326   // Setting the encode_speed to a higher level will yield faster coding
327   // at the cost of lower compression efficiency.
328   void SetEncodeSpeed(int encode_speed);
329 
330   // Set encoder config
331   // The following configs in VP9EncoderConfig are allowed to change in this
332   // function. See https://ffmpeg.org/ffmpeg-codecs.html#libvpx for each
333   // config's meaning.
334   // Configs in VP9EncoderConfig:      Equivalent configs in ffmpeg:
335   // 1  key_freq                       -g
336   // 2  two_pass_vbrmin_section        -minrate * 100LL / bit_rate
337   // 3  two_pass_vbrmax_section        -maxrate * 100LL / bit_rate
338   // 4  under_shoot_pct                -undershoot-pct
339   // 5  over_shoot_pct                 -overshoot-pct
340   // 6  max_threads                    -threads
341   // 7  frame_parallel_decoding_mode   -frame-parallel
342   // 8  tile_column                    -tile-columns
343   // 9  arnr_max_frames                -arnr-maxframes
344   // 10 arnr_strength                  -arnr-strength
345   // 11 lag_in_frames                  -rc_lookahead
346   // 12 encode_breakout                -static-thresh
347   // 13 enable_tpl_model               -enable-tpl
348   // 14 enable_auto_arf                -auto-alt-ref
349   StatusCode SetEncodeConfig(const char *name, const char *value);
350 
351   // A debug function that dumps configs from VP9EncoderConfig
352   // pass = 1: first pass, pass = 2: second pass
353   // fp: file pointer for dumping config
354   StatusCode DumpEncodeConfigs(int pass, FILE *fp);
355 
356   // Makes encoder compute the first pass stats and store it at
357   // impl_ptr_->first_pass_stats. key_frame_map_ is also computed based on the
358   // first pass stats.
359   void ComputeFirstPassStats();
360 
361   // Outputs the first pass stats represented by a 2-D vector.
362   // One can use the frame index at first dimension to retrieve the stats for
363   // each video frame. The stats of each video frame is a vector of 25 double
364   // values. For details, please check FIRSTPASS_STATS in vp9_firstpass.h
365   std::vector<std::vector<double>> ObserveFirstPassStats();
366 
367   // Outputs the first pass motion vectors represented by a 2-D vector.
368   // One can use the frame index at first dimension to retrieve the mvs for
369   // each video frame. The frame is divided into 16x16 blocks. The number of
370   // elements is round_up(|num_rows_4x4| / 4) * round_up(|num_cols_4x4| / 4).
371   std::vector<std::vector<MotionVectorInfo>> ObserveFirstPassMotionVectors();
372 
373   // Ouputs a copy of key_frame_map_, a binary vector with size equal to the
374   // number of show frames in the video. For each entry in the vector, 1
375   // indicates the position is a key frame and 0 indicates it's not a key frame.
376   // This function should be called after ComputeFirstPassStats()
377   std::vector<int> ObserveKeyFrameMap() const;
378 
379   // Sets group of pictures map for coding the entire video.
380   // Each entry in the gop_map corresponds to a show frame in the video.
381   // Therefore, the size of gop_map should equal to the number of show frames in
382   // the entire video.
383   // If a given entry's kGopMapFlagStart is set, it means this is the start of a
384   // gop. Once kGopMapFlagStart is set, one can set kGopMapFlagUseAltRef to
385   // indicate whether this gop use altref.
386   // If a given entry is zero, it means it's in the middle of a gop.
387   // This function should be called only once after ComputeFirstPassStats(),
388   // before StartEncode().
389   // This API will check and modify the gop_map to satisfy the following
390   // constraints.
391   // 1) Each key frame position should be at the start of a gop.
392   // 2) The last gop should not use an alt ref.
393   void SetExternalGroupOfPicturesMap(int *gop_map, int gop_map_size);
394 
395   // Observe the group of pictures map set through
396   // SetExternalGroupOfPicturesMap(). This function should be called after
397   // SetExternalGroupOfPicturesMap().
398   std::vector<int> ObserveExternalGroupOfPicturesMap();
399 
400   // Initializes the encoder for actual encoding.
401   // This function should be called after ComputeFirstPassStats().
402   void StartEncode();
403 
404   // Frees the encoder.
405   // This function should be called after StartEncode() or EncodeFrame().
406   void EndEncode();
407 
408   // The key frame group size includes one key frame plus the number of
409   // following inter frames. Note that the key frame group size only counts the
410   // show frames. The number of no show frames like alternate refereces are not
411   // counted.
412   int GetKeyFrameGroupSize() const;
413 
414   // Provides the group of pictures that the next coding frame is in.
415   // Only call this function between StartEncode() and EndEncode()
416   GroupOfPicture ObserveGroupOfPicture() const;
417 
418   // Gets encode_frame_info for the next coding frame.
419   // Only call this function between StartEncode() and EndEncode()
420   EncodeFrameInfo GetNextEncodeFrameInfo() const;
421 
422   // Encodes a frame
423   // This function should be called after StartEncode() and before EndEncode().
424   void EncodeFrame(EncodeFrameResult *encode_frame_result);
425 
426   // Encodes a frame with a specific quantize index.
427   // This function should be called after StartEncode() and before EndEncode().
428   void EncodeFrameWithQuantizeIndex(EncodeFrameResult *encode_frame_result,
429                                     int quantize_index);
430 
431   // Encode a frame with target frame bits usage.
432   // The encoder will find a quantize index to make the actual frame bits usage
433   // match the target. EncodeFrameWithTargetFrameBits() will recode the frame
434   // up to 7 times to find a q_index to make the actual_frame_bits satisfy the
435   // following inequality. |actual_frame_bits - target_frame_bits| * 100 /
436   // target_frame_bits
437   // <= percent_diff.
438   void EncodeFrameWithTargetFrameBits(EncodeFrameResult *encode_frame_result,
439                                       int target_frame_bits,
440                                       double percent_diff);
441 
442   // Gets the number of coding frames for the video. The coding frames include
443   // show frame and no show frame.
444   // This function should be called after ComputeFirstPassStats().
445   int GetCodingFrameNum() const;
446 
447   // Gets the total number of pixels of YUV planes per frame.
448   uint64_t GetFramePixelCount() const;
449 
450  private:
451   // Compute the key frame locations of the video based on first pass stats.
452   // The results are returned as a binary vector with 1s indicating keyframes
453   // and 0s indicating non keyframes.
454   // It has to be called after impl_ptr_->first_pass_stats is computed.
455   std::vector<int> ComputeKeyFrameMap() const;
456 
457   // Updates key_frame_group_size_, reset key_frame_group_index_ and init
458   // ref_frame_info_.
459   void UpdateKeyFrameGroup(int key_frame_show_index);
460 
461   // Update key_frame_group_index_.
462   void PostUpdateKeyFrameGroupIndex(FrameType frame_type);
463 
464   void PostUpdateState(const EncodeFrameResult &encode_frame_result);
465 
466   class EncodeImpl;
467 
468   int frame_width_;   // frame width in pixels.
469   int frame_height_;  // frame height in pixels.
470   int frame_rate_num_;
471   int frame_rate_den_;
472   int target_bitrate_;
473   int num_frames_;
474   int encode_speed_;
475 
476   std::FILE *in_file_;
477   std::FILE *out_file_;
478   std::unique_ptr<EncodeImpl> impl_ptr_;
479 
480   std::vector<int> key_frame_map_;
481   std::vector<int> gop_map_;
482   GroupOfPicture group_of_picture_;
483 
484   // The key frame group size includes one key frame plus the number of
485   // following inter frames. Note that the key frame group size only counts the
486   // show frames. The number of no show frames like alternate references are not
487   // counted.
488   int key_frame_group_size_;
489 
490   // The index for the to-be-coded show frame in the key frame group.
491   int key_frame_group_index_;
492 
493   // Each show or no show frame is assigned with a coding index based on its
494   // coding order (starting from zero) in the coding process of the entire
495   // video. The coding index of the to-be-coded frame.
496   int frame_coding_index_;
497 
498   // Number of show frames we have coded so far.
499   int show_frame_count_;
500 
501   // TODO(angiebird): Do we need to reset ref_frames_info_ when the next key
502   // frame appears?
503   // Reference frames info of the to-be-coded frame.
504   RefFrameInfo ref_frame_info_;
505 
506   // A 2-D vector of motion vector information of the frame collected
507   // from the first pass. The first dimension is the frame index.
508   // Each frame is divided into 16x16 blocks. The number of elements is
509   // round_up(|num_rows_4x4| / 4) * round_up(|num_cols_4x4| / 4).
510   // Each 16x16 block contains 0 motion vector if this is an intra predicted
511   // frame (for example, the key frame). If the frame is inter predicted,
512   // each 16x16 block contains either 1 or 2 motion vectors.
513   // The first motion vector is always from the LAST_FRAME.
514   // The second motion vector is always from the GOLDEN_FRAME.
515   std::vector<std::vector<MotionVectorInfo>> fp_motion_vector_info_;
516 };
517 
518 }  // namespace vp9
519 
520 #endif  // VPX_VP9_SIMPLE_ENCODE_H_
521