1 // Copyright 2017 Google Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 //  Fast & simple JPEG encoder. Internal header.
16 //
17 // Author: Skal (pascal.massimino@gmail.com)
18 
19 #ifndef SJPEG_JPEGI_H_
20 #define SJPEG_JPEGI_H_
21 
22 #include "sjpeg.h"
23 #include "bit_writer.h"
24 
// Fallback definition of NULL, used only if no previously included header
// already provided one.
#ifndef NULL
#define NULL 0
#endif

// Compile-time SIMD selection.
// SJPEG_USE_SSE2 is defined when the compiler advertises SSE2 (__SSE2__).
#if defined(__SSE2__)
#define SJPEG_USE_SSE2
#endif

// SJPEG_USE_NEON is defined when the compiler advertises NEON (__ARM_NEON__)
// or when targeting AArch64 (__aarch64__).
#if defined(__ARM_NEON__) || defined(__aarch64__)
#define SJPEG_USE_NEON
#endif

// The intrinsics headers are only pulled in for files that define
// SJPEG_NEED_ASM_HEADERS before including this header.
#if defined(SJPEG_NEED_ASM_HEADERS)
#if defined(SJPEG_USE_SSE2)
#include <emmintrin.h>
#endif

#if defined(SJPEG_USE_NEON)
#include <arm_neon.h>
#endif
#endif    // SJPEG_NEED_ASM_HEADERS
46 
47 #include <assert.h>
48 
49 ////////////////////////////////////////////////////////////////////////////////
50 
51 namespace sjpeg {
52 
// Capability queries for the SSE2 / NEON code paths (defined elsewhere).
bool SupportsSSE2();
bool SupportsNEON();
55 
// Constants below are marker codes defined in the JPEG spec
// ISO/IEC 10918-1 : 1993(E) Table B.1
// See also: http://www.w3.org/Graphics/JPEG/itu-t81.pdf
// Each value is the complete 16-bit marker, 0xff prefix included.

#define M_SOF0  0xffc0   // start of frame (baseline DCT)
#define M_SOF1  0xffc1   // start of frame (extended sequential DCT)
#define M_DHT   0xffc4   // define Huffman table(s)
#define M_SOI   0xffd8   // start of image
#define M_EOI   0xffd9   // end of image
#define M_SOS   0xffda   // start of scan
#define M_DQT   0xffdb   // define quantization table(s)
67 
// Signature of the forward 8x8 Fourier transforms, which operate in-place
// on coeffs[] and take a block count.
using FdctFunc = void (*)(int16_t* coeffs, int num_blocks);

// Returns the transform function to use.
FdctFunc GetFdct();
71 
// Default quantization matrices for luma and chroma (JPEG spec section K.1),
// 64 entries each.
extern const uint8_t kDefaultMatrices[2][64];
// 64-entry zigzag table.
extern const uint8_t kZigzag[64];

// Scoring tables; their definitions live in score_7.cc.
extern const int kRGBSize;
extern const uint8_t kSharpnessScore[];

// Internal riskiness scoring functions.
double DCTRiskinessScore(const int16_t yuv[3 * 8], int16_t scores[8 * 8]);
double BlockRiskinessScore(const uint8_t* rgb, int stride,
                           int16_t scores[8 * 8]);
int YUVToRiskIdx(int16_t y, int16_t u, int16_t v);
86 
87 ///////////////////////////////////////////////////////////////////////////////
88 // RGB->YUV conversion
89 
// Block converter: turns a 16x16 RGB block into YUV420, or an 8x8 RGB block
// into YUV444. The result is written to blocks[].
using RGBToYUVBlockFunc = void (*)(const uint8_t* src, int src_stride,
                                   int16_t* blocks);
// Returns the block converter matching the requested sampling.
RGBToYUVBlockFunc GetBlockFunc(bool use_444);

// Row converter: processes 'width' RGB samples from src[] into 16-bit values
// stored in dst[].
// NOTE(review): this was documented as "convert a row of RGB samples to
// YUV444", but the type name and uint16_t output suggest indices are
// produced instead -- confirm against the implementation.
using RGBToIndexRowFunc = void (*)(const uint8_t* src, int width,
                                   uint16_t* dst);
RGBToIndexRowFunc GetRowFunc();

// Enhanced, slower RGB->YUV conversion.
//  y_plane[] has dimension W x H, whereas u_plane[] and v_plane[] have
//  dimension (W + 1)/2 x (H + 1)/2.
void ApplySharpYUVConversion(const uint8_t* rgb,
                             int W, int H, int stride,
                             uint8_t* y_plane,
                             uint8_t* u_plane, uint8_t* v_plane);
107 
108 ///////////////////////////////////////////////////////////////////////////////
109 // some useful helper functions around quant matrices
110 
// Converts a quality factor into a scale factor.
float GetQFactor(float q);

// Copies the 64-entry matrix in[] to out[].
void CopyQuantMatrix(const uint8_t in[64], uint8_t out[64]);

// Fills out[] from in[] and the scale factor q_factor.
void SetQuantMatrix(const uint8_t in[64], float q_factor, uint8_t out[64]);

// Fills the minimum-quantizer matrix out[] from m[], given a tolerance.
void SetMinQuantMatrix(const uint8_t* m, uint8_t out[64], int tolerance);
117 
118 ////////////////////////////////////////////////////////////////////////////////
119 // main structs
120 
// Huffman tables
// Describes one table: the per-length symbol counts plus a pointer to the
// symbol list. The symbol storage itself is not owned by the struct as far as
// this header shows (syms_ is a pointer to const data).
struct HuffmanTable {
  uint8_t bits_[16];     // number of symbols per bit count
  const uint8_t* syms_;  // symbol map, in increasing bit length
  // NOTE(review): being a uint8_t, this cache cannot represent more than
  // 255 symbols -- confirm that no table ever holds 256 of them.
  uint8_t nb_syms_;      // cached value of sum(bits_[])
};
127 
// quantizer matrices
// Groups a 64-entry quantization matrix with the per-coefficient tables
// derived from it (reciprocals, thresholds, biases) and a pointer to the
// code table used when estimating bit-cost.
struct Quantizer {
  uint8_t quant_[64];      // direct quantizer matrix
  uint8_t min_quant_[64];  // min quantizer value allowed
  uint16_t iquant_[64];    // precalc'd reciprocal for divisor
  uint16_t qthresh_[64];   // minimal absolute value that produces a non-zero
                           // coeff
  uint16_t bias_[64];      // bias, for coring
  const uint32_t* codes_;  // codes for bit-cost calculation
};
137 
// Compact Run/Level storage, kept separate from the DCTCoeffs infos.
// The Run/Level information is not yet entropy-coded at this stage, it is
// just stored.
struct RunLevel {
  int16_t run_;        // run value of the pair
  uint16_t level_;     // 4bits for length, 12bits for mantissa
};
144 
// Short summary of a block of quantized coefficients. The run/level pairs
// themselves are stored separately (see RunLevel).
struct DCTCoeffs {
  int16_t last_;       // last position (inclusive) of non-zero coeff
  int16_t nb_coeffs_;  // total number of non-zero AC coeffs
  uint16_t dc_code_;   // DC code (4bits for length, 12bits for suffix)
  int8_t idx_;         // component idx
  int8_t bias_;        // perceptual bias
};
153 
// Histogram of transform coefficients, for adaptive quant matrices
// * HSHIFT controls the trade-off between storage size for counts[]
//   and precision: the fdct doesn't descale and returns coefficients as
//   signed 16bit values. We are only interested in the absolute values
//   of coefficients that are less than MAX_HISTO_DCT_COEFF, which are our
//   best contributors.
//   Still, storing a histogram up to MAX_HISTO_DCT_COEFF can be costly, so
//   we further aggregate the statistics in bins of size 1 << HSHIFT to save
//   space.
// * HLAMBDA roughly measures how much you are willing to trade in distortion
//   for a 1-bit gain in filesize.
// * QDELTA_MIN / QDELTA_MAX control how much we allow wandering around the
//   initial point. This helps reduce the CPU cost, while keeping the
//   optimization around the initial desired quality-factor (HLAMBDA also
//   serves this purpose).
enum { HSHIFT = 2,                       // size of bins is (1 << HSHIFT)
       HHALF = 1 << (HSHIFT - 1),        // half the bin size (= 2)
       MAX_HISTO_DCT_COEFF = (1 << 7),   // max coefficient, descaled by HSHIFT
       HLAMBDA = 0x80,
       // Limits on range of alternate quantizers explored around
       // the initial value.  (see details in AnalyseHisto())
       QDELTA_MIN = -12, QDELTA_MAX = 12,
       // number of deltas in [QDELTA_MIN, QDELTA_MAX], bounds included (= 25)
       QSIZE = QDELTA_MAX + 1 - QDELTA_MIN,
};
178 
// Histogram storage: one row of bin counters for each of the 64 coefficient
// positions.
struct Histo {
  // Reserve one extra entry for counting all coeffs greater than
  // MAX_HISTO_DCT_COEFF. Result isn't used, but it makes the loop easier.
  int counts_[64][MAX_HISTO_DCT_COEFF + 1];
};
184 
185 ////////////////////////////////////////////////////////////////////////////////
186 
187 struct Encoder {
188  public:
189   Encoder(int W, int H, int step, const uint8_t* rgb, ByteSink* sink);
190   virtual ~Encoder();
OkEncoder191   bool Ok() const { return ok_; }
192 
193   // setters
194   void SetQuality(float q);
195   void SetCompressionMethod(int method);
196 
197   // all-in-one init from EncoderParam.
198   bool InitFromParam(const EncoderParam& param);
199 
200   // Main call. Return false in case of parameter error (setting empty output).
201   bool Encode();
202 
203   // these are colorspace-dependant.
204   virtual void InitComponents() = 0;
205   // return MCU samples at macroblock position (mb_x, mb_y)
206   // clipped is true if the MCU is clipped and needs replication
207   virtual void GetSamples(int mb_x, int mb_y, bool clipped,
208                           int16_t* out_blocks) = 0;
209 
210  private:
211   // setters
212   void SetQuantMatrices(const uint8_t m[2][64]);
213   void SetMinQuantMatrices(const uint8_t m[2][64], int tolerance);
214   void SetDefaultMinQuantMatrices();
215 
216   void SetQuantizationBias(int bias, bool use_adaptive);
217   void SetQuantizationDeltas(int qdelta_luma, int qdelta_chroma);
218 
219   typedef enum { ICC, EXIF, XMP, MARKERS } MetadataType;
220   void SetMetadata(const std::string& data, MetadataType type);
221 
222  private:
223   bool CheckBuffers();  // returns false in case of memory alloc error
224 
225   void WriteAPP0();
226   bool WriteAPPMarkers(const std::string& data);
227   bool WriteEXIF(const std::string& data);
228   bool WriteICCP(const std::string& data);
229   bool WriteXMP(const std::string& data);
230   void WriteDQT();
231   void WriteSOF();
232   void WriteDHT();
233   void WriteSOS();
234   void WriteEOI();
235 
236   void ResetDCs();
237 
238   // collect transformed coeffs (unquantized) only
239   void CollectCoeffs();
240 
241   // 2-pass Huffman optimizing scan
242   void ResetEntropyStats();
243   void AddEntropyStats(const DCTCoeffs* const coeffs,
244                        const RunLevel* const run_levels);
245   void CompileEntropyStats();
246   void StoreOptimalHuffmanTables(size_t nb_mbs, const DCTCoeffs* coeffs);
247 
248   void SinglePassScan();           // finalizing scan
249   void SinglePassScanOptimized();  // optimize the Huffman table + finalize scan
250 
251   // quantize and compute run/levels from already stored coeffs
252   void StoreRunLevels(DCTCoeffs* coeffs);
253   // just write already stored run_levels & coeffs:
254   void FinalPassScan(size_t nb_mbs, const DCTCoeffs* coeffs);
255 
256   // dichotomy loop
257   void LoopScan();
258 
259   // Histogram pass
260   void CollectHistograms();
261 
262   void BuildHuffmanCodes(const HuffmanTable* const tab,
263                          uint32_t* const codes);
264 
265   typedef int (*QuantizeBlockFunc)(const int16_t in[64], int idx,
266                                    const Quantizer* const Q,
267                                    DCTCoeffs* const out, RunLevel* const rl);
268   static QuantizeBlockFunc quantize_block_;
269   static QuantizeBlockFunc GetQuantizeBlockFunc();
270 
271   static int TrellisQuantizeBlock(const int16_t in[64], int idx,
272                                   const Quantizer* const Q,
273                                   DCTCoeffs* const out,
274                                   RunLevel* const rl);
275 
276   typedef uint32_t (*QuantizeErrorFunc)(const int16_t in[64],
277                                         const Quantizer* const Q);
278   static QuantizeErrorFunc quantize_error_;
279   static QuantizeErrorFunc GetQuantizeErrorFunc();
280 
281   void CodeBlock(const DCTCoeffs* const coeffs, const RunLevel* const rl);
282   // returns DC code (4bits for length, 12bits for suffix), updates DC_predictor
283   static uint16_t GenerateDCDiffCode(int DC, int* const DC_predictor);
284 
285   static void FinalizeQuantMatrix(Quantizer* const q, int bias);
286   void SetCostCodes(int idx);
287   void InitCodes(bool only_ac);
288 
289   size_t HeaderSize() const;
290   void BlocksSize(int nb_mbs, const DCTCoeffs* coeffs,
291                   const RunLevel* rl, sjpeg::BitCounter* const bc) const;
292   float ComputeSize(const DCTCoeffs* coeffs);
293   float ComputePSNR() const;
294 
295  protected:
296   bool SetError();   // sets ok_ to true
297 
298   // format-specific parameters, set by virtual InitComponents()
299   enum { MAX_COMP = 3 };
300   int nb_comps_;
301   int quant_idx_[MAX_COMP];       // indices for quantization matrices
302   int nb_blocks_[MAX_COMP];       // number of 8x8 blocks per components
303   uint8_t block_dims_[MAX_COMP];  // component dimensions (8-pixels units)
304   int block_w_, block_h_;         // maximum mcu width / height
305   int mcu_blocks_;                // total blocks in mcu (= sum of nb_blocks_[])
306 
307   // data accessible to sub-classes implementing alternate input format
308   int W_, H_, step_;    // width, height, stride
309   int mb_w_, mb_h_;     // width / height in units of mcu
310   const uint8_t* const rgb_;   // samples
311 
312   // Replicate an RGB source sub_w x sub_h block, expanding it to w x h size.
313   const uint8_t* GetReplicatedSamples(const uint8_t* rgb,    // block source
314                                       int rgb_step,          // stride in source
315                                       int sub_w, int sub_h,  // sub-block size
316                                       int w, int h);         // size of mcu
317   // Replicate an YUV sub-block similarly.
318   const uint8_t* GetReplicatedYUVSamples(const uint8_t* in, int step,
319                                          int sub_w, int sub_h, int w, int h);
320   // set blocks that are totally outside of the picture to an average value
321   void AverageExtraLuma(int sub_w, int sub_h, int16_t* out);
322   uint8_t replicated_buffer_[3 * 16 * 16];   // tmp buffer for replication
323 
324   sjpeg::RGBToYUVBlockFunc get_yuv_block_;
325   static sjpeg::RGBToYUVBlockFunc get_yuv444_block_;
SetYUVFormatEncoder326   void SetYUVFormat(bool use_444) {
327     get_yuv_block_ = sjpeg::GetBlockFunc(use_444);
328   }
329   bool adaptive_bias_;   // if true, use per-block perceptual bias modulation
330 
331   // Memory management
AllocEncoder332   template<class T> T* Alloc(size_t num) {
333     assert(memory_hook_ != nullptr);
334     T* const ptr = reinterpret_cast<T*>(memory_hook_->Alloc(sizeof(T) * num));
335     if (ptr == nullptr) SetError();
336     return ptr;
337   }
FreeEncoder338   template<class T> void Free(T* const ptr) {
339     memory_hook_->Free(reinterpret_cast<void*>(ptr));
340   }
341 
342  private:
343   bool ok_;                // set to false if a new[] fails
344   sjpeg::BitWriter bw_;    // output buffer
345 
346   std::string iccp_, xmp_, exif_, app_markers_;   // metadata
347 
348   // compression tools. See sjpeg.h for description of methods.
349   bool optimize_size_;        // Huffman-optimize the codes  (method 0, 3)
350   bool use_adaptive_quant_;   // modulate the quant matrix   (method 3-8)
351   bool use_extra_memory_;     // save the unquantized coeffs (method 3, 4)
352   bool reuse_run_levels_;     // save quantized run/levels   (method 1, 4, 5)
353   bool use_trellis_;          // use trellis-quantization    (method 7, 8)
354 
355   int q_bias_;           // [0..255]: rounding bias for quant. of AC coeffs.
356   Quantizer quants_[2];  // quant matrices
357   int DCs_[3];           // DC predictors
358 
359   // DCT coefficients storage, aligned
360   static const size_t ALIGN_CST = 15;
361   uint8_t* in_blocks_base_;   // base memory for blocks
362   int16_t* in_blocks_;        // aligned pointer to in_blocks_base_
363   bool have_coeffs_;          // true if the Fourier coefficients are stored
364   bool AllocateBlocks(size_t num_blocks);  // returns false in case of error
365   void DesallocateBlocks();
366 
367   // these are for regular compression methods 0 or 2.
368   RunLevel base_run_levels_[64];
369 
370   // this is the extra memory for compression method 1
371   RunLevel* all_run_levels_;
372   size_t nb_run_levels_, max_run_levels_;
373 
374   // Huffman_tables_ indices:
375   //  0: luma dc, 1: chroma dc, 2: luma ac, 3: chroma ac
376   const HuffmanTable *Huffman_tables_[4];
377   uint32_t ac_codes_[2][256];
378   uint32_t dc_codes_[2][12];
379 
380   // histograms for dynamic codes. Could be temporaries.
381   uint32_t freq_ac_[2][256 + 1];  // frequency distribution for AC coeffs
382   uint32_t freq_dc_[2][12 + 1];   // frequency distribution for DC coeffs
383   uint8_t opt_syms_ac_[2][256];   // optimal table for AC symbols
384   uint8_t opt_syms_dc_[2][12];    // optimal table for DC symbols
385   HuffmanTable opt_tables_ac_[2];
386   HuffmanTable opt_tables_dc_[2];
387 
388   // Limits on how much we will decrease the bitrate in the luminance
389   // and chrominance channels (respectively).
390   int qdelta_max_luma_;
391   int qdelta_max_chroma_;
392 
393   // Histogram handling
394 
395   // This function aggregates each 63 unquantized AC coefficients into an
396   // histogram for further analysis.
397   typedef void (*StoreHistoFunc)(const int16_t in[64], Histo* const histos,
398                                  int nb_blocks);
399   static StoreHistoFunc store_histo_;
400   static StoreHistoFunc GetStoreHistoFunc();  // select between the above.
401 
402   // Provided the AC histograms have been stored with StoreHisto(), this
403   // function will analyze impact of varying the quantization scales around
404   // initial values, trading distortion for bit-rate in a controlled way.
405   void AnalyseHisto();
406   void ResetHisto();  // initialize histos_[]
407   Histo histos_[2];
408 
409   // multi-pass parameters
410   int passes_;
411   SearchHook default_hook_;
412   SearchHook* search_hook_;
413 
414   // lower memory management
415   MemoryManager* memory_hook_;
416 
417   static const float kHistoWeight[QSIZE];
418 
419   static void (*fDCT_)(int16_t* in, int num_blocks);
420   static void InitializeStaticPointers();
421 };
422 
423 ////////////////////////////////////////////////////////////////////////////////
424 
425 }   // namespace sjpeg
426 
427 #endif    // SJPEG_JPEGI_H_
428