// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//  Fast and simple JPEG encoder
//
// Author: Skal (pascal.massimino@gmail.com)

#include <assert.h>
#include <stdlib.h>
#include <string.h>   // for memcpy() / memset()
#include <math.h>
#include <float.h>    // for FLT_MAX
#include <stdint.h>

#define SJPEG_NEED_ASM_HEADERS
#include "sjpegi.h"

using namespace sjpeg;
// Some general default values:
static const float kDefaultQuality = 75.f;
static const int kDefaultMethod = 4;
// Rounding bias for AC coefficients, as 8-bit fixed point.
// The default value of 0x78 leans toward filesize reduction.
static const int32_t kDefaultBias = 0x78;
// for adaptive quantization:
static const int kDefaultDeltaMaxLuma = 12;
static const int kDefaultDeltaMaxChroma = 1;

// finer tuning of perceptual optimizations:

// Minimum average number of entries per bin required for performing
// histogram-based optimization. Below this limit, the channel's histogram
// is declared under-populated and the corresponding optimization is skipped.
static double kDensityThreshold = 0.5;
// Rejection limit on the correlation factor when extrapolating the distortion
// from histograms. If the least-square fit has a squared correlation factor
// less than this threshold, the corresponding quantization scale will be
// kept unchanged.
static double kCorrelationThreshold = 0.5;
// Bit-map of channels to omit during quantization matrix optimization.
// If the bit 'i + 8 * j' is set in this bit field, the matrix entry at
// position (i,j) will be kept unchanged during optimization.
// The default value is 0x103 = 1 + 2 + 256: the 3 entries in the top-left
// corner (with the lowest frequencies) are not optimized, since altering
// them can lead to visual degradation of smooth gradients.
static const uint64_t kOmittedChannels = 0x0000000000000103ULL;

////////////////////////////////////////////////////////////////////////////////

namespace sjpeg {

const uint8_t kZigzag[64] = {
  0,   1,  8, 16,  9,  2,  3, 10,
  17, 24, 32, 25, 18, 11,  4,  5,
  12, 19, 26, 33, 40, 48, 41, 34,
  27, 20, 13,  6,  7, 14, 21, 28,
  35, 42, 49, 56, 57, 50, 43, 36,
  29, 22, 15, 23, 30, 37, 44, 51,
  58, 59, 52, 45, 38, 31, 39, 46,
  53, 60, 61, 54, 47, 55, 62, 63,
};

const uint8_t kDefaultMatrices[2][64] = {
  // these are the default luma/chroma matrices (JPEG spec section K.1)
  { 16,  11,  10,  16,  24,  40,  51,  61,
    12,  12,  14,  19,  26,  58,  60,  55,
    14,  13,  16,  24,  40,  57,  69,  56,
    14,  17,  22,  29,  51,  87,  80,  62,
    18,  22,  37,  56,  68, 109, 103,  77,
    24,  35,  55,  64,  81, 104, 113,  92,
    49,  64,  78,  87, 103, 121, 120, 101,
    72,  92,  95,  98, 112, 100, 103,  99 },
  { 17,  18,  24,  47,  99,  99,  99,  99,
    18,  21,  26,  66,  99,  99,  99,  99,
    24,  26,  56,  99,  99,  99,  99,  99,
    47,  66,  99,  99,  99,  99,  99,  99,
    99,  99,  99,  99,  99,  99,  99,  99,
    99,  99,  99,  99,  99,  99,  99,  99,
    99,  99,  99,  99,  99,  99,  99,  99,
    99,  99,  99,  99,  99,  99,  99,  99 }
};

float GetQFactor(float q) {
  // we use the same mapping as jpeg-6b, for consistency
  q = (q <= 0) ? 5000 : (q < 50) ? 5000 / q : (q < 100) ? 2 * (100 - q) : 0;
  // We floor-round to integer here just to preserve compatibility with jpeg6b.
  return floorf(q);
}
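
// Worked examples of the mapping above (derived directly from the formula):
// quality 10 gives a scaling factor of 5000 / 10 = 500 (i.e. 500%),
// quality 50 gives 100 (the matrices are used as-is), quality 75 gives
// 2 * (100 - 75) = 50, and quality 100 gives 0 (every entry is then
// clamped up to the legal minimum quantizer value of 1).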

void CopyQuantMatrix(const uint8_t in[64], uint8_t out[64]) {
  memcpy(out, in, 64 * sizeof(out[0]));
}

void SetQuantMatrix(const uint8_t in[64], float q_factor, uint8_t out[64]) {
  if (in == nullptr || out == nullptr) return;
  q_factor /= 100.f;
  for (size_t i = 0; i < 64; ++i) {
    const int v = static_cast<int>(in[i] * q_factor + .5f);
    // clamp to prevent illegal quantizer values
    out[i] = (v < 1) ? 1 : (v > 255) ? 255u : v;
  }
}
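
// Example with the default luma matrix above: at quality 75 the q_factor is
// 50, so the top-left entry 16 becomes int(16 * 0.5 + 0.5) = 8; at quality
// 10 (q_factor = 500) it becomes int(16 * 5.0 + 0.5) = 80, and any result
// above 255 would be clamped down to 255.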

void SetMinQuantMatrix(const uint8_t m[64], uint8_t out[64], int tolerance) {
  assert(out != nullptr && m != nullptr);
  for (size_t i = 0; i < 64; ++i) {
    const int v = static_cast<int>(m[i] * (256 - tolerance) >> 8);
    out[i] = (v < 1) ? 1u : (v > 255) ? 255u : v;
  }
}

void SetDefaultMinQuantMatrix(uint8_t out[64]) {
  assert(out != nullptr);
  for (size_t i = 0; i < 64; ++i) out[i] = 1u;
}

////////////////////////////////////////////////////////////////////////////////
// Default memory manager (singleton)

static struct DefaultMemory : public MemoryManager {
 public:
  virtual ~DefaultMemory() {}
  virtual void* Alloc(size_t size) { return malloc(size); }
  virtual void Free(void* const ptr) { free(ptr); }
} kDefaultMemory;

////////////////////////////////////////////////////////////////////////////////
// Encoder main class

Encoder::Encoder(int W, int H, int step, const uint8_t* const rgb,
                 ByteSink* const sink)
  : W_(W), H_(H), step_(step),
    rgb_(rgb),
    ok_(true),
    bw_(sink),
    in_blocks_base_(nullptr),
    in_blocks_(nullptr),
    have_coeffs_(false),
    all_run_levels_(nullptr),
    nb_run_levels_(0),
    max_run_levels_(0),
    qdelta_max_luma_(kDefaultDeltaMaxLuma),
    qdelta_max_chroma_(kDefaultDeltaMaxChroma),
    passes_(1),
    search_hook_(nullptr),
    memory_hook_(&kDefaultMemory) {
  SetCompressionMethod(kDefaultMethod);
  SetQuality(kDefaultQuality);
  SetYUVFormat(false);
  SetQuantizationBias(kDefaultBias, false);
  SetDefaultMinQuantMatrices();
  InitializeStaticPointers();
  memset(dc_codes_, 0, sizeof(dc_codes_));  // safety
  memset(ac_codes_, 0, sizeof(ac_codes_));
}

Encoder::~Encoder() {
  Free(all_run_levels_);
  DesallocateBlocks();   // clean up leftovers in case we had an error
}

////////////////////////////////////////////////////////////////////////////////

void Encoder::SetQuality(float q) {
  q = GetQFactor(q);
  SetQuantMatrix(kDefaultMatrices[0], q, quants_[0].quant_);
  SetQuantMatrix(kDefaultMatrices[1], q, quants_[1].quant_);
}

void Encoder::SetQuantMatrices(const uint8_t m[2][64]) {
  SetQuantMatrix(m[0], 100, quants_[0].quant_);
  SetQuantMatrix(m[1], 100, quants_[1].quant_);
}

void Encoder::SetMinQuantMatrices(const uint8_t m[2][64], int tolerance) {
  SetMinQuantMatrix(m[0], quants_[0].min_quant_, tolerance);
  SetMinQuantMatrix(m[1], quants_[1].min_quant_, tolerance);
}

void Encoder::SetDefaultMinQuantMatrices() {
  SetDefaultMinQuantMatrix(quants_[0].min_quant_);
  SetDefaultMinQuantMatrix(quants_[1].min_quant_);
}

void Encoder::SetCompressionMethod(int method) {
  assert(method >= 0 && method <= 8);
  use_adaptive_quant_ = (method >= 3);
  optimize_size_ = (method != 0) && (method != 3);
  use_extra_memory_ = (method == 3) || (method == 4) || (method == 7);
  reuse_run_levels_ = (method == 1) || (method == 4) || (method == 5)
                   || (method >= 7);
  use_trellis_ = (method >= 7);
}

void Encoder::SetMetadata(const std::string& data, MetadataType type) {
  switch (type) {
    case ICC: iccp_ = data; break;
    case EXIF: exif_ = data; break;
    case XMP: xmp_ = data; break;
    default:
    case MARKERS: app_markers_ = data; break;
  }
}

void Encoder::SetQuantizationBias(int bias, bool use_adaptive) {
  assert(bias >= 0 && bias <= 255);
  q_bias_ = bias;
  adaptive_bias_ = use_adaptive;
}

void Encoder::SetQuantizationDeltas(int qdelta_luma, int qdelta_chroma) {
  assert(qdelta_luma >= 0 && qdelta_luma <= 255);
  assert(qdelta_chroma >= 0 && qdelta_chroma <= 255);
  qdelta_max_luma_ = qdelta_luma;
  qdelta_max_chroma_ = qdelta_chroma;
}

////////////////////////////////////////////////////////////////////////////////
// CPU support

extern bool ForceSlowCImplementation;
bool ForceSlowCImplementation = false;   // undocumented! for tests.

bool SupportsSSE2() {
  if (ForceSlowCImplementation) return false;
#if defined(SJPEG_USE_SSE2)
  return true;
#endif
  return false;
}

bool SupportsNEON() {
  if (ForceSlowCImplementation) return false;
#if defined(SJPEG_USE_NEON)
  return true;
#endif
  return false;
}

////////////////////////////////////////////////////////////////////////////////
// static pointers to architecture-dependent implementations

Encoder::QuantizeErrorFunc Encoder::quantize_error_ = nullptr;
Encoder::QuantizeBlockFunc Encoder::quantize_block_ = nullptr;
void (*Encoder::fDCT_)(int16_t* in, int num_blocks) = nullptr;
Encoder::StoreHistoFunc Encoder::store_histo_ = nullptr;
RGBToYUVBlockFunc Encoder::get_yuv444_block_ = nullptr;

void Encoder::InitializeStaticPointers() {
  if (fDCT_ == nullptr) {
    store_histo_ = GetStoreHistoFunc();
    quantize_block_ = GetQuantizeBlockFunc();
    quantize_error_ = GetQuantizeErrorFunc();
    fDCT_ = GetFdct();
    get_yuv444_block_ = GetBlockFunc(true);
  }
}

////////////////////////////////////////////////////////////////////////////////
// memory and internal buffers management. We grow on demand.

bool Encoder::SetError() {
  ok_ = false;
  return false;
}

bool Encoder::CheckBuffers() {
  // maximum macroblock size, worst-case, is 24 bits * 64 * 6 coeffs = 1152 bytes
  ok_ = ok_ && bw_.Reserve(2048);
  if (!ok_) return false;

  if (reuse_run_levels_) {
    if (nb_run_levels_ + 6 * 64 > max_run_levels_) {
      // need to grow storage for run/levels
      const size_t new_size = max_run_levels_ ? max_run_levels_ * 2 : 8192;
      RunLevel* const new_rl = Alloc<RunLevel>(new_size);
      if (new_rl == nullptr) return false;
      if (nb_run_levels_ > 0) {
        memcpy(new_rl, all_run_levels_,
               nb_run_levels_ * sizeof(new_rl[0]));
      }
      Free(all_run_levels_);
      all_run_levels_ = new_rl;
      max_run_levels_ = new_size;
      assert(nb_run_levels_ + 6 * 64 <= max_run_levels_);
    }
  }
  return true;
}

bool Encoder::AllocateBlocks(size_t num_blocks) {
  assert(in_blocks_ == nullptr);
  have_coeffs_ = false;
  const size_t size = num_blocks * 64 * sizeof(*in_blocks_);
  in_blocks_base_ = Alloc<uint8_t>(size + ALIGN_CST);
  if (in_blocks_base_ == nullptr) return false;
  in_blocks_ = reinterpret_cast<int16_t*>(
      (ALIGN_CST + reinterpret_cast<uintptr_t>(in_blocks_base_)) & ~ALIGN_CST);
  return true;
}
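
// Note on the alignment trick above, assuming ALIGN_CST has the usual
// 2^n - 1 form: adding ALIGN_CST and masking with ~ALIGN_CST rounds the base
// pointer up to the next 2^n-byte boundary. E.g. with ALIGN_CST = 15, a base
// address of 0x1003 yields (0x1003 + 15) & ~15 = 0x1010, a 16-byte-aligned
// pointer inside the over-allocated buffer.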

void Encoder::DesallocateBlocks() {
  Free(in_blocks_base_);
  in_blocks_base_ = nullptr;
  in_blocks_ = nullptr;          // sanity
}

////////////////////////////////////////////////////////////////////////////////

#define FP_BITS 16    // fractional precision for fixed-point divisors
#define AC_BITS 4     // extra precision bits from fdct's scaling
#define BIAS_DC 0x80  // neutral bias for DC (mandatory!)

// divide-by-multiply helper macros
#define MAKE_INV_QUANT(Q) (((1u << FP_BITS) + (Q) / 2) / (Q))
#define DIV_BY_MULT(A, M) (((A) * (M)) >> FP_BITS)
#define QUANTIZE(A, M, B) (DIV_BY_MULT((A) + (B), (M)) >> AC_BITS)
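
// Worked example of the macros above, with illustrative numbers: for a
// quantizer Q = 16, MAKE_INV_QUANT(16) = (65536 + 8) / 16 = 4096 (integer
// division). A source coefficient of 50, carried with AC_BITS of extra
// precision as A = 50 << 4 = 800, with bias B = 128 (i.e. 0.5 in that
// fixed-point scale), gives QUANTIZE(800, 4096, 128) =
// (((800 + 128) * 4096) >> 16) >> 4 = 58 >> 4 = 3, matching the expected
// floor(50 / 16 + 0.5) = 3.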

void Encoder::FinalizeQuantMatrix(Quantizer* const q, int q_bias) {
  // first, clamp the quant matrix:
  for (size_t i = 0; i < 64; ++i) {
    if (q->quant_[i] < q->min_quant_[i]) q->quant_[i] = q->min_quant_[i];
  }
  // Special case! for v=1 we can't represent the multiplier with 16b precision.
  // So, instead, we max out the multiplier to 0xffffu and twist the bias to
  // the value 0x80. The overall precision isn't affected: the result is
  // bit-exact the same over our working range.
  // Note that quant=1 can start appearing at quality as low as 93.
  const uint16_t bias_1 = 0x80;
  const uint16_t iquant_1 = 0xffffu;
  for (size_t i = 0; i < 64; ++i) {
    const uint16_t v = q->quant_[i];
    const uint16_t iquant = (v == 1) ? iquant_1 : MAKE_INV_QUANT(v);
    const uint16_t bias = (v == 1) ? bias_1 : (i == 0) ? BIAS_DC : q_bias;
    const uint16_t ibias = (((bias * v) << AC_BITS) + 128) >> 8;
    const uint16_t qthresh =
        ((1 << (FP_BITS + AC_BITS)) + iquant - 1) / iquant - ibias;
    q->bias_[i] = ibias;
    q->iquant_[i] = iquant;
    q->qthresh_[i] = qthresh;
    assert(QUANTIZE(qthresh, iquant, ibias) > 0);
    assert(QUANTIZE(qthresh - 1, iquant, ibias) == 0);
  }
}
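
// Numeric sanity check for the qthresh_ values above (illustrative numbers):
// for quant v = 16 with a 0x80 bias, iquant = 4096 and ibias = 128, so
// qthresh = ((1 << 20) + 4095) / 4096 - 128 = 256 - 128 = 128. Indeed,
// QUANTIZE(128, 4096, 128) = 1 while QUANTIZE(127, 4096, 128) = 0, which is
// exactly the property the two asserts in the loop verify.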

void Encoder::SetCostCodes(int idx) {
  quants_[idx].codes_ = ac_codes_[idx];
}

////////////////////////////////////////////////////////////////////////////////
// standard Huffman tables, as per JPEG standard section K.3.

static const uint8_t kDCSyms[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
static const uint8_t kACSyms[2][162] = {
  { 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
    0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
    0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
    0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
    0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
    0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
    0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
    0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
    0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
    0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
    0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
    0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
    0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
    0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
    0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
    0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
    0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
    0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
    0xf9, 0xfa },
  { 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
    0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
    0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
    0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
    0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
    0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
    0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
    0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
    0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
    0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
    0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
    0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
    0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
    0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
    0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
    0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
    0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
    0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
    0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
    0xf9, 0xfa }
};

static const HuffmanTable kHuffmanTables[4] = {
  { { 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 }, kDCSyms, 12 },
  { { 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 }, kDCSyms, 12 },
  { { 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 125 }, kACSyms[0], 162 },
  { { 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 119 }, kACSyms[1], 162 }
};

////////////////////////////////////////////////////////////////////////////////
// This function generates a map from symbols to code + length, stored in a
// packed way (the lower 16 bits hold the length, the upper 16 bits the VLC).
// The input is a JPEG-like description of the symbols:
// - bits[i] stores the number of codes having length i + 1.
// - symbols[] contains the symbols' map, in increasing bit-length order.
// No check is performed on the validity of symbols[]'s content.
// The values of tab[] not referring to an actual symbol will remain unchanged.
// Returns the number of symbols used (that is: sum{bits[i]})

static int BuildHuffmanTable(const uint8_t bits[16], const uint8_t* symbols,
                             uint32_t* const tab) {
  uint32_t code = 0;
  int nb = 0;
  for (int nb_bits = 1; nb_bits <= 16; ++nb_bits, code <<= 1) {
    int n = bits[nb_bits - 1];  // number of codes with this bit length
    nb += n;
    while (n-- > 0) {
      const int symbol = *symbols++;
      tab[symbol] = (code << 16) | nb_bits;
      ++code;
    }
  }
  return nb;
}
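
// A minimal sketch of how such a packed entry can be unpacked (presumably
// what the bit-writer's PutPackedCode(), used later in this file, does
// with it):
//   const uint32_t packed = tab[symbol];
//   const int nbits = packed & 0xffff;   // code length, in bits
//   const uint32_t vlc = packed >> 16;   // the variable-length code itself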

////////////////////////////////////////////////////////////////////////////////

void Encoder::InitCodes(bool only_ac) {
  const int nb_tables = (nb_comps_ == 1 ? 1 : 2);
  for (int c = 0; c < nb_tables; ++c) {   // luma, chroma
    for (int type = (only_ac ? 1 : 0); type <= 1; ++type) {
      const HuffmanTable* const h = Huffman_tables_[type * 2 + c];
      const int nb_syms = BuildHuffmanTable(h->bits_, h->syms_,
                                            type == 1 ? ac_codes_[c]
                                                      : dc_codes_[c]);
      assert(nb_syms == h->nb_syms_);
      (void)nb_syms;
    }
  }
}

////////////////////////////////////////////////////////////////////////////////
// Quantize coefficients and pseudo-code coefficients

static int CalcLog2(int v) {
#if defined(__GNUC__) && \
    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
  return 32 - __builtin_clz(v);
#else
  const int kLog2[16] = {
    0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4 };
  assert(v > 0 && v < (1 << 12));
  return (v & ~0xff) ? 8 + kLog2[v >> 8] :
         (v & ~0x0f) ? 4 + kLog2[v >> 4] :
                       0 + kLog2[v];
#endif
}
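
// CalcLog2(v) returns the number of bits needed to represent v > 0, e.g.
// CalcLog2(1) == 1, CalcLog2(3) == 2 and CalcLog2(4) == 3. This is the
// coefficient's 'category' (magnitude class) in JPEG terms.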

uint16_t Encoder::GenerateDCDiffCode(int DC, int* const DC_predictor) {
  const int diff = DC - *DC_predictor;
  *DC_predictor = DC;
  if (diff == 0) {
    return 0;
  }
  int suff, n;
  if (diff < 0) {
    n = CalcLog2(-diff);
    suff = (diff - 1) & ((1 << n) - 1);
  } else {
    n = CalcLog2(diff);
    suff = diff;
  }
  assert((suff & 0xf000) == 0);
  assert(n < 12);
  return n | (suff << 4);
}
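
// Worked example for the DC code above: for diff = -3, n = CalcLog2(3) = 2
// and suff = (-3 - 1) & 0x3 = 0, giving 2 | (0 << 4) = 0x02: category 2 with
// suffix bits "00", the one's-complement form JPEG uses for negative
// differences. For diff = +3, the result would be 2 | (3 << 4) = 0x32.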

////////////////////////////////////////////////////////////////////////////////
// various implementations of block quantization

#if defined(SJPEG_USE_SSE2)
// Load eight 16b-words from *src.
#define LOAD_16(src) _mm_loadu_si128(reinterpret_cast<const __m128i*>(src))
// Store eight 16b-words into *dst
#define STORE_16(V, dst) _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), (V))

static int QuantizeBlockSSE2(const int16_t in[64], int idx,
                             const Quantizer* const Q,
                             DCTCoeffs* const out, RunLevel* const rl) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  int prev = 1;
  int nb = 0;
  int16_t tmp[64], masked[64];
  for (int i = 0; i < 64; i += 8) {
    const __m128i m_bias = LOAD_16(bias + i);
    const __m128i m_mult = LOAD_16(iquant + i);
    const __m128i A = LOAD_16(in + i);                        // A = in[i]
    const __m128i B = _mm_srai_epi16(A, 15);                  // sign extract
    const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B);  // abs(A)
    const __m128i D = _mm_adds_epi16(C, m_bias);              // v' = v + bias
    const __m128i E = _mm_mulhi_epu16(D, m_mult);             // (v' * iq) >> 16
    const __m128i F = _mm_srli_epi16(E, AC_BITS);             // = QUANTIZE(...)
    const __m128i G = _mm_xor_si128(F, B);                    // v ^ mask
    STORE_16(F, tmp + i);
    STORE_16(G, masked + i);
  }
  for (int i = 1; i < 64; ++i) {
    const int j = kZigzag[i];
    const int v = tmp[j];
    if (v > 0) {
      const int n = CalcLog2(v);
      const uint16_t code = masked[j] & ((1 << n) - 1);
      rl[nb].level_ = (code << 4) | n;
      rl[nb].run_ = i - prev;
      prev = i + 1;
      ++nb;
    }
  }
  const int dc = (in[0] < 0) ? -tmp[0] : tmp[0];
  out->idx_ = idx;
  out->last_ = prev - 1;
  out->nb_coeffs_ = nb;
  return dc;
}
#undef LOAD_16
#undef STORE_16

#elif defined(SJPEG_USE_NEON)
static int QuantizeBlockNEON(const int16_t in[64], int idx,
                             const Quantizer* const Q,
                             DCTCoeffs* const out, RunLevel* const rl) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  int prev = 1;
  int nb = 0;
  uint16_t tmp[64], masked[64];
  for (int i = 0; i < 64; i += 8) {
    const uint16x8_t m_bias = vld1q_u16(bias + i);
    const uint16x8_t m_mult = vld1q_u16(iquant + i);
    const int16x8_t A = vld1q_s16(in + i);                           // in[i]
    const uint16x8_t B = vreinterpretq_u16_s16(vabsq_s16(A));        // abs(in)
    const int16x8_t sign = vshrq_n_s16(A, 15);                       // sign
    const uint16x8_t C = vaddq_u16(B, m_bias);                       // + bias
    const uint32x4_t D0 = vmull_u16(vget_low_u16(C), vget_low_u16(m_mult));
    const uint32x4_t D1 = vmull_u16(vget_high_u16(C), vget_high_u16(m_mult));
    // collect hi-words of the 32b mult result using 'unzip'
    const uint16x8x2_t E = vuzpq_u16(vreinterpretq_u16_u32(D0),
                                     vreinterpretq_u16_u32(D1));
    const uint16x8_t F = vshrq_n_u16(E.val[1], AC_BITS);
    const uint16x8_t G = veorq_u16(F, vreinterpretq_u16_s16(sign));  // v ^ mask
    vst1q_u16(tmp + i, F);
    vst1q_u16(masked + i, G);
  }
  for (int i = 1; i < 64; ++i) {
    const int j = kZigzag[i];
    const int v = tmp[j];
    if (v > 0) {
      const int n = CalcLog2(v);
      const uint16_t code = masked[j] & ((1 << n) - 1);
      rl[nb].level_ = (code << 4) | n;
      rl[nb].run_ = i - prev;
      prev = i + 1;
      ++nb;
    }
  }
  const int dc = (in[0] < 0) ? -tmp[0] : tmp[0];
  out->idx_ = idx;
  out->last_ = prev - 1;
  out->nb_coeffs_ = nb;
  return dc;
}
#endif    // SJPEG_USE_NEON

static int QuantizeBlock(const int16_t in[64], int idx,
                         const Quantizer* const Q,
                         DCTCoeffs* const out, RunLevel* const rl) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  int prev = 1;
  int nb = 0;
  // This function is speed-critical, so we use a bit mask to extract
  // absolute values instead of sign tests.
  const uint16_t* const qthresh = Q->qthresh_;
  for (int i = 1; i < 64; ++i) {
    const int j = kZigzag[i];
    int v = in[j];
    const int32_t mask = v >> 31;
    v = (v ^ mask) - mask;
    if (v >= qthresh[j]) {
      v = QUANTIZE(v, iquant[j], bias[j]);
      assert(v > 0);
      const int n = CalcLog2(v);
      const uint16_t code = (v ^ mask) & ((1 << n) - 1);
      rl[nb].level_ = (code << 4) | n;
      rl[nb].run_ = i - prev;
      prev = i + 1;
      ++nb;
    }
  }
  const int dc = (in[0] < 0) ? -QUANTIZE(-in[0], iquant[0], bias[0])
                             : QUANTIZE(in[0], iquant[0], bias[0]);
  out->idx_ = idx;
  out->last_ = prev - 1;
  out->nb_coeffs_ = nb;
  return dc;
}
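
// Note on the run/level packing shared by all the QuantizeBlock variants
// above: each non-zero AC coefficient is stored as
// level_ = (suffix_bits << 4) | n, where n is the coefficient's bit
// category, and run_ counts the zero coefficients since the previous
// non-zero one (in zigzag order). E.g. a quantized value of +3 preceded by
// two zeros gives run_ = 2 and level_ = (3 << 4) | 2 = 0x32.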

////////////////////////////////////////////////////////////////////////////////
// Trellis-based quantization

typedef uint32_t score_t;
static const score_t kMaxScore = 0xffffffffu;

struct TrellisNode {
  uint32_t code;
  int      nbits;
  score_t score;
  uint32_t disto;
  uint32_t bits;
  uint32_t run;
  const TrellisNode* best_prev;
  int pos;
  int rank;

  TrellisNode() : score(kMaxScore), best_prev(nullptr) {}
  void InitSink() {
    score = 0u;
    disto = 0;
    pos = 0;
    rank = 0;
    nbits = 0;
    bits = 0;
  }
};

static bool SearchBestPrev(const TrellisNode* const nodes0, TrellisNode* node,
                           const uint32_t disto0[], const uint32_t codes[],
                           uint32_t lambda) {
  bool found = false;
  assert(codes[0xf0] != 0);
  const uint32_t base_disto = node->disto + disto0[node->pos - 1];
  for (const TrellisNode* cur = node - 1; cur >= nodes0; --cur) {
    const int run = node->pos - 1 - cur->pos;
    if (run < 0) continue;
    uint32_t bits = node->nbits;
    bits += (run >> 4) * (codes[0xf0] & 0xff);
    const uint32_t sym = ((run & 15) << 4) | node->nbits;
    assert(codes[sym] != 0);
    bits += codes[sym] & 0xff;
    const uint32_t disto = base_disto - disto0[cur->pos];
    const score_t score = disto + lambda * bits + cur->score;
    if (score < node->score) {
      node->score = score;
      node->disto = disto;
      node->bits = bits;
      node->best_prev = cur;
      node->rank = cur->rank + 1;
      node->run = run;
      found = true;
    }
  }
  return found;
}

// number of alternate levels to investigate
#define NUM_TRELLIS_NODES 2

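// The trellis below performs rate-distortion-optimized quantization: every
// candidate (position, level) pair becomes a node, and SearchBestPrev()
// links each node to the predecessor minimizing score = disto + lambda * bits.
// As a rough numeric sketch (illustrative numbers only): with lambda = 8,
// a candidate level costing 5 extra bits is only worth keeping if it removes
// more than 5 * 8 = 40 units of squared error.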
int Encoder::TrellisQuantizeBlock(const int16_t in[64], int idx,
                                  const Quantizer* const Q,
                                  DCTCoeffs* const out,
                                  RunLevel* const rl) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  TrellisNode nodes[1 + NUM_TRELLIS_NODES * 63];  // 1 sink + n channels
  nodes[0].InitSink();
  const uint32_t* const codes = Q->codes_;
  TrellisNode* cur_node = &nodes[1];
  uint32_t disto0[64];   // disto0[i] = sum of distortions up to i (inclusive)
  disto0[0] = 0;
  for (int i = 1; i < 64; ++i) {
    const int j = kZigzag[i];
    const uint32_t q = Q->quant_[j] << AC_BITS;
    const uint32_t lambda = q * q / 32u;
    int V = in[j];
    const int32_t mask = V >> 31;
    V = (V ^ mask) - mask;
    disto0[i] = V * V + disto0[i - 1];
    int v = QUANTIZE(V, iquant[j], bias[j]);
    if (v == 0) continue;
    int nbits = CalcLog2(v);
    for (int k = 0; k < NUM_TRELLIS_NODES; ++k) {
      const int err = V - v * q;
      cur_node->code = (v ^ mask) & ((1 << nbits) - 1);
      cur_node->pos = i;
      cur_node->disto = err * err;
      cur_node->nbits = nbits;
      cur_node->score = kMaxScore;
      if (SearchBestPrev(&nodes[0], cur_node, disto0, codes, lambda)) {
        ++cur_node;
      }
      --nbits;
      if (nbits <= 0) break;
      v = (1 << nbits) - 1;
    }
  }
  // search the best entry point backward
  const TrellisNode* nz = &nodes[0];
  if (cur_node != nz) {
    score_t best_score = kMaxScore;
    while (cur_node-- != &nodes[0]) {
      const uint32_t disto = disto0[63] - disto0[cur_node->pos];
      // No need to incorporate the EOB's bit cost (codes[0x00]), since it's
      // the same for all coefficients except the last one (#63).
      cur_node->disto += disto;
      cur_node->score += disto;
      if (cur_node->score < best_score) {
        nz = cur_node;
        best_score = cur_node->score;
      }
    }
  }
  int nb = nz->rank;
  out->idx_ = idx;
  out->last_ = nz->pos;
  out->nb_coeffs_ = nb;

  while (nb-- > 0) {
    const int32_t code = nz->code;
    const int n = nz->nbits;
    rl[nb].level_ = (code << 4) | n;
    rl[nb].run_ = nz->run;
    nz = nz->best_prev;
  }
  const int dc = (in[0] < 0) ? -QUANTIZE(-in[0], iquant[0], bias[0])
                             : QUANTIZE(in[0], iquant[0], bias[0]);
  return dc;
}

Encoder::QuantizeBlockFunc Encoder::GetQuantizeBlockFunc() {
#if defined(SJPEG_USE_SSE2)
  if (SupportsSSE2()) return QuantizeBlockSSE2;
#elif defined(SJPEG_USE_NEON)
  if (SupportsNEON()) return QuantizeBlockNEON;
#endif
  return QuantizeBlock;  // default
}

////////////////////////////////////////////////////////////////////////////////

#if defined(SJPEG_USE_SSE2)
// Load eight 16b-words from *src.
#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
#define LOAD_64(src) _mm_loadl_epi64((const __m128i*)(src))
// Store eight 16b-words into *dst
#define STORE_16(V, dst) _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), (V))

static uint32_t QuantizeErrorSSE2(const int16_t in[64],
                                  const Quantizer* const Q) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  const uint8_t* const quant = Q->quant_;
  const __m128i zero = _mm_setzero_si128();
  uint32_t tmp[32];
  for (int i = 0; i < 64; i += 8) {
    const __m128i m_bias = LOAD_16(bias + i);
    const __m128i m_iquant = LOAD_16(iquant + i);
    const __m128i m_quant = _mm_unpacklo_epi8(LOAD_64(quant + i), zero);
    const __m128i A = LOAD_16(in + i);                        // v0 = in[i]
    const __m128i B = _mm_srai_epi16(A, 15);                  // sign extract
    const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B);  // abs(v0)
    const __m128i D = _mm_adds_epi16(C, m_bias);              // v' = v0 + bias
    const __m128i E = _mm_mulhi_epu16(D, m_iquant);           // (v' * iq) >> 16
    const __m128i F = _mm_srai_epi16(E, AC_BITS);
    const __m128i G = _mm_srai_epi16(C, AC_BITS);
    const __m128i H = _mm_mullo_epi16(F, m_quant);            // *= quant[j]
    const __m128i I = _mm_sub_epi16(G, H);
    const __m128i J = _mm_madd_epi16(I, I);                   // (v0 - v) ^ 2
    STORE_16(J, tmp + i / 2);
  }
  uint32_t err = 0;
  for (int i = 0; i < 32; ++i) err += tmp[i];
  return err;
}
#undef LOAD_16
#undef LOAD_64
#undef STORE_16

#elif defined(SJPEG_USE_NEON)

static uint32_t QuantizeErrorNEON(const int16_t in[64],
                                  const Quantizer* const Q) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  const uint8_t* const quant = Q->quant_;
  uint32x4_t sum1 = vdupq_n_u32(0);
  uint32x4_t sum2 = vdupq_n_u32(0);
  for (int i = 0; i < 64; i += 8) {
    const uint16x8_t m_bias = vld1q_u16(bias + i);
    const uint16x8_t m_mult = vld1q_u16(iquant + i);
    const uint16x8_t m_quant = vmovl_u8(vld1_u8(quant + i));
    const uint16x8_t A = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(in + i)));
    const uint16x8_t B = vaddq_u16(A, m_bias);
    const uint32x4_t C0 = vmull_u16(vget_low_u16(B), vget_low_u16(m_mult));
    const uint32x4_t C1 = vmull_u16(vget_high_u16(B), vget_high_u16(m_mult));
    // collect hi-words of the 32b mult result using 'unzip'
    const uint16x8x2_t D = vuzpq_u16(vreinterpretq_u16_u32(C0),
                                     vreinterpretq_u16_u32(C1));
    const uint16x8_t E = vshrq_n_u16(D.val[1], AC_BITS);
    const uint16x8_t F = vmulq_u16(E, m_quant);        // dequantized coeff
    const uint16x8_t G = vabdq_u16(F, vshrq_n_u16(A, AC_BITS));
    sum1 = vmlal_u16(sum1, vget_low_u16(G), vget_low_u16(G));
    sum2 = vmlal_u16(sum2, vget_high_u16(G), vget_high_u16(G));
  }
  const uint32x4_t sum3 = vaddq_u32(sum1, sum2);
  const uint64x2_t sum4 = vpaddlq_u32(sum3);
  const uint64_t sum5 = vgetq_lane_u64(sum4, 0) + vgetq_lane_u64(sum4, 1);
  const uint32_t err = (uint32_t)sum5;
  return err;
}

#endif    // SJPEG_USE_NEON

static uint32_t QuantizeError(const int16_t in[64], const Quantizer* const Q) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  const uint8_t* const quant = Q->quant_;
  uint32_t err = 0;
  for (int j = 0; j < 64; ++j) {
    int32_t v0 = (in[j] < 0) ? -in[j] : in[j];
    const uint32_t v = quant[j] * QUANTIZE(v0, iquant[j], bias[j]);
    v0 >>= AC_BITS;
    err += (v0 - v) * (v0 - v);
  }
  return err;
}

Encoder::QuantizeErrorFunc Encoder::GetQuantizeErrorFunc() {
#if defined(SJPEG_USE_SSE2)
  if (SupportsSSE2()) return QuantizeErrorSSE2;
#elif defined(SJPEG_USE_NEON)
  if (SupportsNEON()) return QuantizeErrorNEON;
#endif
  return QuantizeError;  // default
}

////////////////////////////////////////////////////////////////////////////////
// Code bitstream

void Encoder::ResetDCs() {
  for (int c = 0; c < nb_comps_; ++c) {
    DCs_[c] = 0;
  }
}

void Encoder::CodeBlock(const DCTCoeffs* const coeffs,
                        const RunLevel* const rl) {
  const int idx = coeffs->idx_;
  const int q_idx = quant_idx_[idx];

  // DC coefficient symbol
  const int dc_len = coeffs->dc_code_ & 0x0f;
  const uint32_t code = dc_codes_[q_idx][dc_len];
  bw_.PutPackedCode(code);
  if (dc_len > 0) {
    bw_.PutBits(coeffs->dc_code_ >> 4, dc_len);
  }

  // AC coeffs
  const uint32_t* const codes = ac_codes_[q_idx];
  for (int i = 0; i < coeffs->nb_coeffs_; ++i) {
    int run = rl[i].run_;
    while (run & ~15) {        // escapes
      bw_.PutPackedCode(codes[0xf0]);
      run -= 16;
    }
    const uint32_t suffix = rl[i].level_;
    const int n = suffix & 0x0f;
    const int sym = (run << 4) | n;
    bw_.PutPackedCode(codes[sym]);
    bw_.PutBits(suffix >> 4, n);
  }
  if (coeffs->last_ < 63) {     // EOB
    bw_.PutPackedCode(codes[0x00]);
  }
}
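
// Example of the escape handling above (illustrative numbers): a run of 20
// zeros before a category-1 coefficient is coded as one ZRL escape (symbol
// 0xf0, meaning "16 zeros") followed by the symbol (4 << 4) | 1 = 0x41 for
// the remaining run of 4.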

////////////////////////////////////////////////////////////////////////////////
// Histogram

void Encoder::ResetHisto() {
  memset(histos_, 0, sizeof(histos_));
}

#if defined(SJPEG_USE_SSE2)
void StoreHistoSSE2(const int16_t in[64], Histo* const histos, int nb_blocks) {
  const __m128i kMaxHisto = _mm_set1_epi16(MAX_HISTO_DCT_COEFF);
  for (int n = 0; n < nb_blocks; ++n, in += 64) {
    uint16_t tmp[64];
    for (int i = 0; i < 64; i += 8) {
      const __m128i A =
          _mm_loadu_si128(reinterpret_cast<const __m128i*>(in + i));
      const __m128i B = _mm_srai_epi16(A, 15);                  // sign extract
      const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B);  // abs(A)
      const __m128i D = _mm_srli_epi16(C, HSHIFT);              // >>= HSHIFT
      const __m128i E = _mm_min_epi16(D, kMaxHisto);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp + i), E);
    }
    for (int j = 0; j < 64; ++j) {
      const int k = tmp[j];
      ++histos->counts_[j][k];
    }
  }
}
#elif defined(SJPEG_USE_NEON)
void StoreHistoNEON(const int16_t in[64], Histo* const histos, int nb_blocks) {
  const uint16x8_t kMaxHisto = vdupq_n_u16(MAX_HISTO_DCT_COEFF);
  for (int n = 0; n < nb_blocks; ++n, in += 64) {
    uint16_t tmp[64];
    for (int i = 0; i < 64; i += 8) {
      const int16x8_t A = vld1q_s16(in + i);
      const int16x8_t B = vabsq_s16(A);               // abs(in)
      const uint16x8_t C = vreinterpretq_u16_s16(B);  // signed->unsigned
      const uint16x8_t D = vshrq_n_u16(C, HSHIFT);    // >>= HSHIFT
      const uint16x8_t E = vminq_u16(D, kMaxHisto);   // min(., kMaxHisto)
      vst1q_u16(tmp + i, E);
    }
    for (int j = 0; j < 64; ++j) {
      const int k = tmp[j];
      ++histos->counts_[j][k];
    }
  }
}
#endif

// This C version does not produce the same counts_[] output as the SIMD
// versions above: the extra entry counts_[MAX_HISTO_DCT_COEFF] is never
// incremented here. But that entry is not used for the final computation,
// so the global result is unchanged.
void StoreHisto(const int16_t in[64], Histo* const histos, int nb_blocks) {
  for (int n = 0; n < nb_blocks; ++n, in += 64) {
    for (int i = 0; i < 64; ++i) {
      const int k = (in[i] < 0 ? -in[i] : in[i]) >> HSHIFT;
      if (k < MAX_HISTO_DCT_COEFF) {
        ++histos->counts_[i][k];
      }
    }
  }
}

Encoder::StoreHistoFunc Encoder::GetStoreHistoFunc() {
#if defined(SJPEG_USE_SSE2)
  if (SupportsSSE2()) return StoreHistoSSE2;
#elif defined(SJPEG_USE_NEON)
  if (SupportsNEON()) return StoreHistoNEON;
#endif
  return StoreHisto;  // default
}

const float Encoder::kHistoWeight[QSIZE] = {
  // Gaussian with sigma ~= 3
  0, 0, 0, 0, 0,
  1,   5,  16,  43,  94, 164, 228, 255, 228, 164,  94,  43,  16,   5,   1,
  0, 0, 0, 0, 0
};

void Encoder::AnalyseHisto() {
  // A bit of theory and background: for each sub-band i in [0..63], we pick a
  // quantization scale New_Qi close to the initial one Qi. We evaluate a cost
  // function associated with F({New_Qi}) = distortion + lambda . rate,
  // where rate and distortion depend on the quantizer set in a complex non-
  // analytic way. Still, for well-behaved regular histograms, we expect the
  // rate to scale as -log(Q), and the distortion as Q^2.
  // We want the cost function to be stationary around the initial {Qi} set,
  // in order to achieve the best transfer between distortion and rate when we
  // displace the Qi values a little. Mainly, we want to use bits as
  // efficiently as possible, where every bit we use has maximal impact in
  // lowering distortion (and vice versa: if we spend an extra bit of coding,
  // we want the best bang for the buck. The optimization works up-hill too).
  //
  // Hence, lambda is picked to minimize F around {Qi}, as:
  //    lambda = -d(distortion) / d(rate)
  // where the derivatives are evaluated using a double least-square fit on
  // both the clouds of {delta, distortion} and {delta, size} points.
  //
  // Note1: The least-square fitted slope of a {x,y} cloud is expressed as:
  //    slope = (<xy> - <x><y>) / (<xx> - <x><x>) = Cov(x,y) / Cov(x,x)
  // where <.> is our gaussian-averaging operator.
  // But since we are eventually computing a quotient of such slopes, we can
  // factor out the common (<xx> - <x><x>) denominator (which is strictly
  // positive).
  // Note2: we use a Gaussian-weighted average around the center value Qi
  // instead of averaging over the whole [QDELTA_MIN, QDELTA_MAX] range.
  // This rules out fringe samples in noisy cases (like: when the source is
  // already JPEG-compressed!).
  // Note3: We fall back to some sane value HLAMBDA in case of ill-condition.
  //
  // We use the correlation coefficient
  //       r = Cov(x,y) / sqrt(Cov(x,x) * Cov(y,y))
  // to detect bad cases with poorly extrapolated distortion. In such an
  // occurrence, we skip the channel. This is particularly important for
  // already-compressed JPEG sources that give treacherous comb-like
  // histograms.
  //
  // Once this particular lambda has been picked, we loop over each channel
  // and optimize them separately, locally picking the best New_Qi for each.
  // The choice of lambda ensures a good balance between size and distortion,
  // and prevents being too aggressive on file-size reduction, for instance.
  //
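  // A compact sketch of the lambda estimation below, using the factored
  // covariance forms from Note1 (names match the loop's accumulators): with
  // Gaussian weights w over the qdelta samples x,
  //     Cov(x, distortion) ~ sw * sxy1 - sx * sy1   (summed into 'num')
  //     Cov(x, size)       ~ sw * sxy2 - sx * sy2   (summed into 'den')
  // and lambda =~ -num / den, i.e. -d(distortion)/d(size) evaluated at dq = 0.
  //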
  const double r_limit = kCorrelationThreshold;
  for (int c = (nb_comps_ > 1 ? 1 : 0); c >= 0; --c) {
    const int idx = quant_idx_[c];
    const Histo* const histo = &histos_[idx];
    // For chrominance, being too aggressive on the filesize can be visually
    // damaging. So, with the default settings, we restrict the algorithm to
    // mainly try to *increase* the bitrate (and quality) by using a smaller
    // qdelta_max_chroma_.
    // delta_max is only used during the second phase, not during the first
    // phase of deriving an optimal lambda.
    assert(QDELTA_MAX >= qdelta_max_luma_);
    assert(QDELTA_MAX >= qdelta_max_chroma_);
    const int delta_max =
      ((idx == 0) ? qdelta_max_luma_ : qdelta_max_chroma_) - QDELTA_MIN;
    assert(delta_max < QSIZE);
    float sizes[64][QSIZE];
    float distortions[64][QSIZE];
    double num = 0.;  // accumulates d(distortion) around delta_q = 0
    double den = 0.;  // accumulates d(size) around delta_q = 0
    uint64_t omit_channels = kOmittedChannels;
    for (int pos = 0; pos < 64; ++pos) {
      if (omit_channels & (1ULL << pos)) {
        continue;
      }
      const int dq0 = quants_[idx].quant_[pos];
      const int min_dq0 = quants_[idx].min_quant_[pos];
      // We should be using the exact bias:
      //    const int bias = quants_[idx].bias_[pos] << (FP_BITS - AC_BITS);
      // but this value is too precise considering the other approximations
      // we're using (namely: HSHIFT). So we'd better use a mid value of 0.5
      // for the bias. This has the advantage of making it possible to use
      // pre-calculated look-up tables for every quantity in the loop.
      // This is still a TODO(skal) below, though. Not sure the gain is big.
      const int bias = 1 << FP_BITS >> 1;
      const int* const h = histo->counts_[pos];
      int total = 0;
      int last = 0;
      for (int i = 0; i < MAX_HISTO_DCT_COEFF; ++i) {
        total += h[i];
        if (h[i]) last = i + 1;
      }
      if (total < kDensityThreshold * last) {
        omit_channels |= 1ULL << pos;
        continue;
      }
      // accumulators for averaged values.
      double sw = 0., sx = 0.;
      double sxx = 0., syy1 = 0.;
      double sy1 = 0., sxy1 = 0.;   // accumulators for distortion cloud
      double sy2 = 0., sxy2 = 0.;   // accumulators for size cloud
      for (int delta = 0; delta < QSIZE; ++delta) {
        double bsum = 0., dsum = 0.;
        const int dq = dq0 + (delta + QDELTA_MIN);
        if (dq >= min_dq0 && dq <= 255) {
          // TODO(skal): pre-compute idq and use it in FinalizeQuantMatrix too
          const int idq = ((1 << FP_BITS) + dq - 1) / dq;
          for (int i = 0; i < last; ++i) {
            if (h[i]) {
              // v = current bin's centroid in the histogram
              // qv = quantized value for the bin's representative 'v'
              // dqv = dequantized qv, to be compared against v (=> 'error')
              // bits = approximate bit-cost of the quantized representative
              // h[i] = this bin's weight
              const int v = (i << HSHIFT) + HHALF;
              const int qv = (v * idq + bias) >> FP_BITS;
              // TODO(skal): for a given 'last' value, we know the upper limit
              // on dq that will make *all* quantized 'qv' values be zero.
              // => We can restrict the loop on 'dq' using 'last'.
              if (qv) {
                const int bits = CalcLog2(qv);
                const int dqv = qv * dq;
                const int error = (v - dqv) * (v - dqv);
                bsum += h[i] * bits;
                dsum += h[i] * error;
              } else {
                dsum += h[i] * v * v;
              }
            }
          }   // end of 'i' loop
          distortions[pos][delta] = static_cast<float>(dsum);
          sizes[pos][delta] = static_cast<float>(bsum);
          const double w = kHistoWeight[delta];   // Gaussian weight
          if (w > 0.) {
            const double x = static_cast<double>(delta + QDELTA_MIN);
            sw   += w;
            sx   += w * x;
            sxx  += w * x * x;
            sy1  += w * dsum;
            syy1 += w * dsum * dsum;
            sy2  += w * bsum;
            sxy1 += w * dsum * x;
            sxy2 += w * bsum * x;
          }
        } else {  // the new quantizer is out-of-range.
          distortions[pos][delta] = FLT_MAX;
          sizes[pos][delta] = 0;
        }
      }
      // filter out channels according to the correlation factor.
      const double cov_xy1 = sw * sxy1 - sx * sy1;
      if (cov_xy1 * cov_xy1 < r_limit *
                              (sw * sxx - sx * sx) * (sw * syy1 - sy1 * sy1)) {
        omit_channels |= 1ULL << pos;
        continue;
      }
      // accumulate the numerator and denominator for the derivative
      // calculation
      num += cov_xy1;
      den += sw * sxy2 - sx * sy2;
    }

    // we evaluate lambda =~ -d(distortion)/d(size) at dq=0
    double lambda = HLAMBDA;
    // When increasing Q, size should significantly decrease and distortion
    // increase. If they don't, we are ill-conditioned and should fall back
    // to a safe value HLAMBDA.
    if (num > 1000. && den < -10.) {
      // This is our approximation of -d(Distortion) / d(Rate).
      // We limit it to 1. below, to avoid degenerate cases.
      lambda = -num / den;
      if (lambda < 1.) {
        lambda = 1.;
      }
    }
    // now, optimize each channel using the optimal lambda selection
    for (int pos = 0; pos < 64; ++pos) {
      if (omit_channels & (1ULL << pos)) {
        continue;
      }
      float best_score = FLT_MAX;
      int best_dq = 0;
      for (int delta = 0; delta <= delta_max; ++delta) {
        if (distortions[pos][delta] < FLT_MAX) {
          const float score = distortions[pos][delta]
                            + lambda * sizes[pos][delta];
          if (score < best_score) {
            best_score = score;
            best_dq = delta + QDELTA_MIN;
          }
        }
      }
      quants_[idx].quant_[pos] += best_dq;
      assert(quants_[idx].quant_[pos] >= 1);
    }
    FinalizeQuantMatrix(&quants_[idx], q_bias_);
    SetCostCodes(idx);
  }
}

void Encoder::CollectHistograms() {
  ResetHisto();
  int16_t* in = in_blocks_;
  const int mb_x_max = W_ / block_w_;
  const int mb_y_max = H_ / block_h_;
  for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
    const bool yclip = (mb_y == mb_y_max);
    for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
      if (!use_extra_memory_) {
        in = in_blocks_;
      }
      GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
      fDCT_(in, mcu_blocks_);
      for (int c = 0; c < nb_comps_; ++c) {
        const int num_blocks = nb_blocks_[c];
        store_histo_(in, &histos_[quant_idx_[c]], num_blocks);
        in += 64 * num_blocks;
      }
    }
  }
  have_coeffs_ = use_extra_memory_;
}

////////////////////////////////////////////////////////////////////////////////
// Perform YUV conversion and fDCT, and store the unquantized coeffs

void Encoder::CollectCoeffs() {
  assert(use_extra_memory_);
  int16_t* in = in_blocks_;
  const int mb_x_max = W_ / block_w_;
  const int mb_y_max = H_ / block_h_;
  for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
    const bool yclip = (mb_y == mb_y_max);
    for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
      GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
      fDCT_(in, mcu_blocks_);
      in += 64 * mcu_blocks_;
    }
  }
  have_coeffs_ = true;
}

////////////////////////////////////////////////////////////////////////////////
// 1-pass Scan

void Encoder::SinglePassScan() {
  ResetDCs();

  RunLevel base_run_levels[64];
  int16_t* in = in_blocks_;
  const int mb_x_max = W_ / block_w_;
  const int mb_y_max = H_ / block_h_;
  const QuantizeBlockFunc quantize_block = use_trellis_ ? TrellisQuantizeBlock
                                                        : quantize_block_;
  for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
    const bool yclip = (mb_y == mb_y_max);
    for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
      if (!CheckBuffers()) return;
      if (!have_coeffs_) {
        in = in_blocks_;
        GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
        fDCT_(in, mcu_blocks_);
      }
      for (int c = 0; c < nb_comps_; ++c) {
        DCTCoeffs base_coeffs;
        for (int i = 0; i < nb_blocks_[c]; ++i) {
          const int dc = quantize_block(in, c, &quants_[quant_idx_[c]],
                                        &base_coeffs, base_run_levels);
          base_coeffs.dc_code_ = GenerateDCDiffCode(dc, &DCs_[c]);
          CodeBlock(&base_coeffs, base_run_levels);
          in += 64;
        }
      }
    }
  }
}

void Encoder::FinalPassScan(size_t nb_mbs, const DCTCoeffs* coeffs) {
  DesallocateBlocks();     // we can free up some coeffs memory at this point
  if (!CheckBuffers()) return;  // call needed to finalize all_run_levels_
  assert(reuse_run_levels_);
  const RunLevel* run_levels = all_run_levels_;
  for (size_t n = 0; n < nb_mbs; ++n) {
    if (!CheckBuffers()) return;
    CodeBlock(&coeffs[n], run_levels);
    run_levels += coeffs[n].nb_coeffs_;
  }
}

////////////////////////////////////////////////////////////////////////////////
// Huffman tables optimization

void Encoder::ResetEntropyStats() {
  memset(freq_ac_, 0, sizeof(freq_ac_));
  memset(freq_dc_, 0, sizeof(freq_dc_));
}

void Encoder::AddEntropyStats(const DCTCoeffs* const coeffs,
                              const RunLevel* const run_levels) {
  // freq_ac_[] and freq_dc_[] cannot overflow 32 bits, since the maximum
  // allowed resolution is 65535 x 65535. Hence the sum of all frequencies
  // fits in 32 bits too.
  const int idx = coeffs->idx_;
  const int q_idx = quant_idx_[idx];
  for (int i = 0; i < coeffs->nb_coeffs_; ++i) {
    const int run = run_levels[i].run_;
    const int tmp = (run >> 4);
    if (tmp) freq_ac_[q_idx][0xf0] += tmp;  // count escapes (all at once)
    const int suffix = run_levels[i].level_;
    const int sym = ((run & 0x0f) << 4) | (suffix & 0x0f);
    ++freq_ac_[q_idx][sym];
  }
  if (coeffs->last_ < 63) {     // EOB
    ++freq_ac_[q_idx][0x00];
  }
  ++freq_dc_[q_idx][coeffs->dc_code_ & 0x0f];
}

static int cmp(const void *pa, const void *pb) {
  const uint64_t a = *reinterpret_cast<const uint64_t*>(pa);
  const uint64_t b = *reinterpret_cast<const uint64_t*>(pb);
  assert(a != b);  // tie-breaks can't happen
  return (a < b) ? 1 : -1;
}

static void BuildOptimalTable(HuffmanTable* const t,
                              const uint32_t* const freq, int size) {
  enum { MAX_BITS = 32, MAX_CODE_SIZE = 16 };
  assert(size <= 256);
  assert(t != nullptr);

  // The celebrated merging algorithm from Huffman, with some restrictions:
  // * the all-'1' code is forbidden, to avoid trailing marker emulation
  // * codes must be at most 16 bits long. Longer codes are re-allocated to
  //   shorter ones, even if it means being suboptimal for extremely rare
  //   symbols that would otherwise eat a lot of bits.
  // This function does not modify the content of freq[].
  int codesizes[256 + 1];
  // chain[i] holds the index of the next element in the subtree below
  // element 'i', or -1 if there's no sub-tree.
  // We use and maintain this list in order to efficiently increase the
  // codesizes by one when merging two sub-trees into one.
  // To ease the merging (by avoiding one loop) we store the address of the
  // last element in the chain for each symbol. This makes the merge O(1).
  // It's probably better to keep the arrays separate instead of making
  // a struct, since we touch chain_end[] only once per merge, whereas
  // chain[] and codesizes[] are modified O(k) times per merge.
  int chain[256 + 1];
  int* chain_end[256 + 1];
  // sorted_freq[] remains sorted by decreasing frequencies along the process.
  uint64_t sorted_freq[256 + 1];

  // Count the symbols effectively used, and pack them at the beginning of
  // the table.
  int nb_syms = 0;
  for (int i = 0; i < size; ++i) {
    const uint64_t v = freq[i];
    if (v > 0) {
      // We pack the sort key (32 bits of frequency) and the index (9 bits)
      // into a single uint64_t, so we don't have to resort to structs (and
      // we avoid tie-breaks, too).
      sorted_freq[nb_syms++] = (v << 9) | i;
    }
    codesizes[i] = 0;
    chain[i] = -1;
    chain_end[i] = &chain[i];
  }
  t->nb_syms_ = nb_syms;  // Record how many final symbols we'll have.

  // initial sort
  // TODO(skal): replace by counting-sort?? (merged with previous loop?)
  qsort(sorted_freq, nb_syms, sizeof(sorted_freq[0]), cmp);

  // Append a fake last symbol with the lowest frequency: it will be assigned
  // the forbidden all-'1' code, and eventually discarded.
  sorted_freq[nb_syms++] = (1ULL << 9) | size;
  codesizes[size] = 0;
  chain[size] = -1;
  chain_end[size] = &chain[size];

  // Merging phase
  // Repeatedly merge the two symbols with the lowest frequency. The resulting
  // super-symbol gets a longer (by 1 bit) code, since it is the least
  // frequent one.
  int nb = nb_syms;
  while (nb-- > 1) {
    // First, link the two sub-trees.
    const uint64_t s1 = sorted_freq[nb - 1];    // first symbol
    const uint64_t s2 = sorted_freq[nb];        // second symbol, appended
    // The 0x1ff masking extracts the symbol only, discarding the frequency
    // stored in the upper bits for sorting.
    int i = s1 & 0x1ff;
    const int j = s2 & 0x1ff;
    assert(i <= size && j <= size);
    *chain_end[i] = j;
    chain_end[i] = chain_end[j];

    // Then, following the chain, increase the whole sub-tree's depth by 1 bit.
    do {
      ++codesizes[i];
      i = chain[i];
    } while (i >= 0);

    // Create the new symbol, with merged frequencies. It takes s1's spot.
    // We must use 64 bits here to prevent overflow in the sum: both s1 and
    // s2 are originally 32 + 9 bits wide.
    const uint64_t new_symbol = s1 + (s2 & ~0x1ff);
    // Perform insertion sort to find the new spot of the merged symbol.
    int k = nb - 1;
    while (k > 0) {
      if (sorted_freq[k - 1] < new_symbol) {
        sorted_freq[k] = sorted_freq[k - 1];
        --k;
      } else {
        break;
      }
    }
    sorted_freq[k] = new_symbol;
  }

  // Count the bit-length distribution.
  uint8_t bits[MAX_BITS];
  memset(bits, 0, sizeof(bits));
  int max_bit_size = 0;
  for (int i = 0; i <= size; ++i) {
    int s = codesizes[i];
    assert(s <= codesizes[size]);    // symbol #size is the biggest one.
    if (s > 0) {
      // This is slightly penalizing, but only for ultra-rare symbols.
      if (s > MAX_BITS) {
        s = MAX_BITS;
        codesizes[i] = MAX_BITS;    // clamp code-size
      }
      ++bits[s - 1];
      if (s > max_bit_size) {
        max_bit_size = s;
      }
    }
  }

  // We sort symbols by slices of increasing bit-sizes, using counting sort.
  // This generates a partition of symbols in the final syms_[] array.
  int start[MAX_BITS];     // start[i] is the first code with length i+1
  int position = 0;
  for (int i = 0; i < max_bit_size; ++i) {
    start[i] = position;
    position += bits[i];
  }
  assert(position == nb_syms);

  // Now, we can dispatch the symbols directly to their final slice in the
  // partitioning, according to their bit-length.
  uint8_t* const syms = const_cast<uint8_t*>(t->syms_);
  // Note: the loop stops at symbol = size - 1, hence omitting the last,
  // fake symbol.
  for (int symbol = 0; symbol < size; ++symbol) {
    const int s = codesizes[symbol];
    if (s > 0) {
      assert(s <= MAX_BITS);
      syms[start[s - 1]++] = symbol;
    }
  }
  assert(start[max_bit_size - 1] == nb_syms - 1);

  // Fix codes with length greater than 16 bits: move too-long codes up the
  // tree, and sink one shorter code down, making the tree slightly
  // sub-optimal.
  for (int l = max_bit_size - 1; l >= MAX_CODE_SIZE; --l) {
    while (bits[l] > 0) {
      int k = l - 2;
      while (bits[k] == 0) {    // Search for a level with a leaf to split.
        --k;
      }
      /* Move up 2 symbols from the bottom-most level l, and sink down one
         from level k, like this:
                    Before:                After:
                    /  ..                 /    ..
        k bits->   c     \               /\      \
                         /\             c  b     /\
                       .. /\                   ..  a
        l bits->         a  b
        Note that by the very construction of the optimal tree, the least
        probable symbols always come in pairs with the same bit-length.
        So there's always a pair 'a' and 'b' to find.
      */
      bits[l    ] -= 2;     // remove 'a' and 'b'
      bits[l - 1] += 1;     // put 'a' one level up.
      bits[k    ] -= 1;     // remove 'c'
      bits[k + 1] += 2;     // put 'c' and 'b' one level down.
    }
  }

  // Remove the last pseudo-symbol: find the deepest populated level and
  // drop one leaf (the all-'1' code) from it.
  max_bit_size = MAX_CODE_SIZE;
  while (bits[--max_bit_size] == 0) {
    assert(max_bit_size > 0);
  }
  --bits[max_bit_size];

  // update the table with the final code book
  for (int i = 0; i < MAX_CODE_SIZE; ++i) {
    t->bits_[i] = bits[i];
  }
}
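
// Worked example (illustrative only): with frequencies {A:8, B:4, C:3, D:1}
// plus the fake lowest-frequency symbol F, the merges are (D+F)=2, (C+DF)=5,
// (CDF+B)=9, (BCDF+A)=17, yielding code sizes A:1, B:2, C:3, D:4, F:4.
// F takes the forbidden all-'1' code '1111' and is then dropped, leaving
// bits[] = {1, 1, 1, 1, 0, ...} for the final table.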

void Encoder::CompileEntropyStats() {
  // plug in and build the new tables
  for (int q_idx = 0; q_idx < (nb_comps_ == 1 ? 1 : 2); ++q_idx) {
    // DC tables
    Huffman_tables_[q_idx] = &opt_tables_dc_[q_idx];
    opt_tables_dc_[q_idx].syms_ = opt_syms_dc_[q_idx];
    BuildOptimalTable(&opt_tables_dc_[q_idx], freq_dc_[q_idx], 12);
    // AC tables
    Huffman_tables_[2 + q_idx] = &opt_tables_ac_[q_idx];
    opt_tables_ac_[q_idx].syms_ = opt_syms_ac_[q_idx];
    BuildOptimalTable(&opt_tables_ac_[q_idx], freq_ac_[q_idx], 256);
  }
}

void Encoder::StoreOptimalHuffmanTables(size_t nb_mbs,
                                        const DCTCoeffs* coeffs) {
  // optimize Huffman tables
  ResetEntropyStats();
  const RunLevel* run_levels = all_run_levels_;
  for (size_t n = 0; n < nb_mbs; ++n) {
    AddEntropyStats(&coeffs[n], run_levels);
    run_levels += coeffs[n].nb_coeffs_;
  }
  CompileEntropyStats();
}

////////////////////////////////////////////////////////////////////////////////

void Encoder::SinglePassScanOptimized() {
  const size_t nb_mbs = mb_w_ * mb_h_ * mcu_blocks_;
  DCTCoeffs* const base_coeffs =
      Alloc<DCTCoeffs>(reuse_run_levels_ ? nb_mbs : 1);
  if (base_coeffs == nullptr) return;
  DCTCoeffs* coeffs = base_coeffs;
  RunLevel base_run_levels[64];
  const QuantizeBlockFunc quantize_block = use_trellis_ ? TrellisQuantizeBlock
                                                        : quantize_block_;

  // We use the default Huffman tables as the basis for bit-rate evaluation.
  if (use_trellis_) InitCodes(true);

  ResetEntropyStats();
  ResetDCs();
  nb_run_levels_ = 0;
  int16_t* in = in_blocks_;
  const int mb_x_max = W_ / block_w_;
  const int mb_y_max = H_ / block_h_;
  for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
    const bool yclip = (mb_y == mb_y_max);
    for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
      if (!have_coeffs_) {
        in = in_blocks_;
        GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
        fDCT_(in, mcu_blocks_);
      }
      if (!CheckBuffers()) goto End;
      for (int c = 0; c < nb_comps_; ++c) {
        for (int i = 0; i < nb_blocks_[c]; ++i) {
          RunLevel* const run_levels =
              reuse_run_levels_ ? all_run_levels_ + nb_run_levels_
                                : base_run_levels;
          const int dc = quantize_block(in, c, &quants_[quant_idx_[c]],
                                        coeffs, run_levels);
          coeffs->dc_code_ = GenerateDCDiffCode(dc, &DCs_[c]);
          AddEntropyStats(coeffs, run_levels);
          if (reuse_run_levels_) {
            nb_run_levels_ += coeffs->nb_coeffs_;
            ++coeffs;
            assert(coeffs <= &base_coeffs[nb_mbs]);
          }
          in += 64;
          assert(nb_run_levels_ <= max_run_levels_);
        }
      }
    }
  }

  CompileEntropyStats();
  WriteDHT();
  WriteSOS();

  if (!reuse_run_levels_) {
    SinglePassScan();   // redo everything, but with the optimal tables now
  } else {
    // Re-use the saved run/levels for a fast second pass.
    FinalPassScan(nb_mbs, base_coeffs);
  }
 End:
  Free(base_coeffs);
}

////////////////////////////////////////////////////////////////////////////////
// main call

bool Encoder::Encode() {
  if (!ok_) return false;

  FinalizeQuantMatrix(&quants_[0], q_bias_);
  FinalizeQuantMatrix(&quants_[1], q_bias_);
  SetCostCodes(0);
  SetCostCodes(1);

  // default tables
  for (int i = 0; i < 4; ++i) Huffman_tables_[i] = &kHuffmanTables[i];

  // colorspace init
  InitComponents();
  assert(nb_comps_ <= MAX_COMP);
  assert(mcu_blocks_ <= 6);
  // validate some input parameters
  if (W_ <= 0 || H_ <= 0 || rgb_ == nullptr) return false;

  mb_w_ = (W_ + (block_w_ - 1)) / block_w_;
  mb_h_ = (H_ + (block_h_ - 1)) / block_h_;
  const size_t nb_blocks = use_extra_memory_ ? mb_w_ * mb_h_ : 1;
  if (!AllocateBlocks(nb_blocks * mcu_blocks_)) return false;

  WriteAPP0();

  // custom markers written 'as is'
  if (!WriteAPPMarkers(app_markers_)) return false;

  // metadata
  if (!WriteEXIF(exif_) || !WriteICCP(iccp_) || !WriteXMP(xmp_)) return false;

  if (passes_ > 1) {
    LoopScan();
  } else {
    if (use_adaptive_quant_) {
      // histogram analysis, then derive the optimal quant matrices
      CollectHistograms();
      AnalyseHisto();
    }

    WriteDQT();
    WriteSOF();

    if (optimize_size_) {
      SinglePassScanOptimized();
    } else {
      WriteDHT();
      WriteSOS();
      SinglePassScan();
    }
  }
  WriteEOI();
  ok_ = ok_ && bw_.Finalize();

  DesallocateBlocks();
  return ok_;
}

////////////////////////////////////////////////////////////////////////////////
// Edge replication

namespace {

int GetAverage(const int16_t* const out) {
  int DC = 0;
  for (int i = 0; i < 64; ++i) DC += out[i];
  return (DC + 32) >> 6;
}

void SetAverage(int DC, int16_t* const out) {
  for (int i = 0; i < 64; ++i) out[i] = DC;
}

}   // anonymous namespace

void Encoder::AverageExtraLuma(int sub_w, int sub_h, int16_t* out) {
  // out[] points to four 8x8 luma blocks. When one of these blocks lies
  // totally outside of the frame, we set it flat to the average value ("DC")
  // of the previous block, in order to help compressibility.
  int DC = GetAverage(out);
  if (sub_w <= 8) {   // set block #1 to block #0's average value
    SetAverage(DC, out + 1 * 64);
  }
  if (sub_h <= 8) {   // need to flatten blocks #2 and #3
    if (sub_w > 8) {  // block #1 was not flattened, so get its real DC
      DC = GetAverage(out + 1 * 64);
    }
    SetAverage(DC, out + 2 * 64);
    SetAverage(DC, out + 3 * 64);
  } else if (sub_w <= 8) {   // set block #3 to block #2's average value
    DC = GetAverage(out + 2 * 64);
    SetAverage(DC, out + 3 * 64);
  }
}
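
// For instance (illustrative): for a macroblock on the right border with only
// 8 valid columns (sub_w = 8, sub_h = 16), block #1 is flattened to block
// #0's average and block #3 to block #2's average, while blocks #0 and #2
// keep their real content.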

const uint8_t* Encoder::GetReplicatedSamples(const uint8_t* rgb,
                                             int rgb_step,
                                             int sub_w, int sub_h,
                                             int w, int h) {
  assert(sub_w > 0 && sub_h > 0);
  if (sub_w > w) {
    sub_w = w;
  }
  if (sub_h > h) {
    sub_h = h;
  }
  uint8_t* dst = replicated_buffer_;
  for (int y = 0; y < sub_h; ++y) {
    memcpy(dst, rgb, 3 * sub_w);
    const uint8_t* const src0 = &dst[3 * (sub_w - 1)];
    for (int x = 3 * sub_w; x < 3 * w; x += 3) {
      memcpy(dst + x, src0, 3);
    }
    dst += 3 * w;
    rgb += rgb_step;
  }
  const uint8_t* dst0 = dst - 3 * w;
  for (int y = sub_h; y < h; ++y) {
    memcpy(dst, dst0, 3 * w);
    dst += 3 * w;
  }
  return replicated_buffer_;
}
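
// For instance (illustrative): for a 16x16 macroblock overlapping the
// bottom-right corner with only 5 valid columns and 3 valid rows, the pixels
// of column #4 are replicated over columns 5..15 of each valid row, and the
// resulting row #2 is then replicated over rows 3..15.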

// TODO(skal): merge with the above function? Probably slower...
const uint8_t* Encoder::GetReplicatedYUVSamples(const uint8_t* in,
                                                int step,
                                                int sub_w, int sub_h,
                                                int w, int h) {
  assert(sub_w > 0 && sub_h > 0);
  if (sub_w > w) {
    sub_w = w;
  }
  if (sub_h > h) {
    sub_h = h;
  }
  uint8_t* out = replicated_buffer_;
  for (int y = 0; y < sub_h; ++y) {
    int x;
    for (x = 0; x < sub_w; ++x) {
      out[x] = in[x];
    }
    for (; x < w; ++x) {
      out[x] = out[sub_w - 1];
    }
    out += w;
    in += step;
  }
  const uint8_t* const out0 = out - w;
  for (int y = sub_h; y < h; ++y) {
    memcpy(out, out0, w);
    out += w;
  }
  return replicated_buffer_;
}

////////////////////////////////////////////////////////////////////////////////
// sub-class for the YUV 4:2:0 version

class Encoder420 : public Encoder {
 public:
  Encoder420(int W, int H, int step, const uint8_t* const rgb,
             ByteSink* const sink)
    : Encoder(W, H, step, rgb, sink) {}
  virtual ~Encoder420() {}
  virtual void InitComponents() {
    nb_comps_ = 3;

    quant_idx_[0] = 0;
    quant_idx_[1] = 1;
    quant_idx_[2] = 1;

    nb_blocks_[0] = 4;
    nb_blocks_[1] = 1;
    nb_blocks_[2] = 1;
    mcu_blocks_ = 6;

    block_w_ = 16;
    block_h_ = 16;
    block_dims_[0] = 0x22;
    block_dims_[1] = 0x11;
    block_dims_[2] = 0x11;
  }
  virtual void GetSamples(int mb_x, int mb_y, bool clipped,
                          int16_t* out_blocks) {
    const uint8_t* data = rgb_ + (3 * mb_x + mb_y * step_) * 16;
    int step = step_;
    if (clipped) {
      data = GetReplicatedSamples(data, step,
                                  W_ - mb_x * 16, H_ - mb_y * 16, 16, 16);
      step = 3 * 16;
    }
    get_yuv_block_(data, step, out_blocks);
    if (clipped) {
      AverageExtraLuma(W_ - mb_x * 16, H_ - mb_y * 16, out_blocks);
    }
  }
};

////////////////////////////////////////////////////////////////////////////////
// sub-class for the YUV 4:4:4 version

class Encoder444 : public Encoder {
 public:
  Encoder444(int W, int H, int step, const uint8_t* const rgb,
             ByteSink* const sink)
      : Encoder(W, H, step, rgb, sink) {
    SetYUVFormat(true);
  }
  virtual ~Encoder444() {}
  virtual void InitComponents() {
    nb_comps_ = 3;

    quant_idx_[0] = 0;
    quant_idx_[1] = 1;
    quant_idx_[2] = 1;

    nb_blocks_[0] = 1;
    nb_blocks_[1] = 1;
    nb_blocks_[2] = 1;
    mcu_blocks_ = 3;

    block_w_ = 8;
    block_h_ = 8;
    block_dims_[0] = 0x11;
    block_dims_[1] = 0x11;
    block_dims_[2] = 0x11;
  }
  virtual void GetSamples(int mb_x, int mb_y, bool clipped, int16_t* out) {
    const uint8_t* data = rgb_ + (3 * mb_x + mb_y * step_) * 8;
    int step = step_;
    if (clipped) {
      data = GetReplicatedSamples(data, step,
                                  W_ - mb_x * 8, H_ - mb_y * 8, 8, 8);
      step = 3 * 8;
    }
    get_yuv_block_(data, step, out);
  }
};

////////////////////////////////////////////////////////////////////////////////
// sub-class for the sharp YUV 4:2:0 version

class EncoderSharp420 : public Encoder420 {
 public:
  EncoderSharp420(int W, int H, int step, const uint8_t* const rgb,
                  ByteSink* const sink)
      : Encoder420(W, H, step, rgb, sink), yuv_memory_(nullptr) {
    const int uv_w = (W + 1) >> 1;
    const int uv_h = (H + 1) >> 1;
    yuv_memory_ = Alloc<uint8_t>(W * H + 2 * uv_w * uv_h);
    if (yuv_memory_ == nullptr) return;
    y_plane_ = yuv_memory_;
    y_step_ = W;
    u_plane_ = yuv_memory_ + W * H;
    v_plane_ = u_plane_ + uv_w * uv_h;
    uv_step_ = uv_w;
    ApplySharpYUVConversion(rgb, W, H, step, y_plane_, u_plane_, v_plane_);
  }
  virtual ~EncoderSharp420() { Free(yuv_memory_); }
  virtual void GetSamples(int mb_x, int mb_y, bool clipped, int16_t* out);

 protected:
  void GetLumaSamples(int mb_x, int mb_y, bool clipped, int16_t* out) {
    int step = y_step_;
    const uint8_t* Y1 = y_plane_ + (mb_x + mb_y * step) * 16;
    if (clipped) {
      Y1 = GetReplicatedYUVSamples(Y1, step,
                                   W_ - mb_x * 16, H_ - mb_y * 16, 16, 16);
      step = 16;
    }
    const uint8_t* Y2 = Y1 + 8 * step;
    for (int y = 8, n = 0; y > 0; --y) {
      for (int x = 0; x < 8; ++x, ++n) {
        out[n + 0 * 64] = Y1[x] - 128;
        out[n + 1 * 64] = Y1[x + 8] - 128;
        out[n + 2 * 64] = Y2[x] - 128;
        out[n + 3 * 64] = Y2[x + 8] - 128;
      }
      Y1 += step;
      Y2 += step;
    }
    if (clipped) {
      AverageExtraLuma(W_ - mb_x * 16, H_ - mb_y * 16, out);
    }
  }

 private:
  uint8_t* y_plane_;
  int y_step_;
  uint8_t* u_plane_;
  uint8_t* v_plane_;
  int uv_step_;
  uint8_t* yuv_memory_;
};

void EncoderSharp420::GetSamples(int mb_x, int mb_y,
                                 bool clipped, int16_t* out) {
  GetLumaSamples(mb_x, mb_y, clipped, out);

  // Chroma
  const uint8_t* U = u_plane_ + (mb_x + mb_y * uv_step_) * 8;
  int step = uv_step_;
  if (clipped) {
    U = GetReplicatedYUVSamples(U, step,
                                ((W_ + 1) >> 1) - mb_x * 8,
                                ((H_ + 1) >> 1) - mb_y * 8, 8, 8);
    step = 8;
  }
  for (int y = 8, n = 0; y > 0; --y, U += step) {
    for (int x = 0; x < 8; ++x, ++n) {
      out[n + 4 * 64] = U[x] - 128;
    }
  }
  const uint8_t* V = v_plane_ + (mb_x + mb_y * uv_step_) * 8;
  step = uv_step_;
  if (clipped) {
    V = GetReplicatedYUVSamples(V, step,
                                ((W_ + 1) >> 1) - mb_x * 8,
                                ((H_ + 1) >> 1) - mb_y * 8, 8, 8);
    step = 8;
  }
  for (int y = 8, n = 0; y > 0; --y, V += step) {
    for (int x = 0; x < 8; ++x, ++n) {
      out[n + 5 * 64] = V[x] - 128;
    }
  }
}

////////////////////////////////////////////////////////////////////////////////
// all-in-one factory to pick the right encoder instance

Encoder* EncoderFactory(const uint8_t* rgb,
                        int W, int H, int stride, SjpegYUVMode yuv_mode,
                        ByteSink* const sink) {
  if (yuv_mode == SJPEG_YUV_AUTO) {
    yuv_mode = SjpegRiskiness(rgb, W, H, stride, nullptr);
  }

  Encoder* enc = nullptr;
  if (yuv_mode == SJPEG_YUV_420) {
    enc = new (std::nothrow) Encoder420(W, H, stride, rgb, sink);
  } else if (yuv_mode == SJPEG_YUV_SHARP) {
    enc = new (std::nothrow) EncoderSharp420(W, H, stride, rgb, sink);
  } else {
    enc = new (std::nothrow) Encoder444(W, H, stride, rgb, sink);
  }
  if (enc == nullptr || !enc->Ok()) {
    delete enc;
    enc = nullptr;
  }
  return enc;
}

}    // namespace sjpeg

////////////////////////////////////////////////////////////////////////////////
// public plain-C functions

size_t SjpegEncode(const uint8_t* rgb, int width, int height, int stride,
                   uint8_t** out_data, float quality, int method,
                   SjpegYUVMode yuv_mode) {
  if (rgb == nullptr || out_data == nullptr) return 0;
  if (width <= 0 || height <= 0 || stride < 3 * width) return 0;
  *out_data = nullptr;  // safety

  MemorySink sink(width * height / 4);  // rough estimation of the output size
  Encoder* const enc = EncoderFactory(rgb, width, height, stride, yuv_mode,
                                      &sink);
  if (enc == nullptr) return 0;   // EncoderFactory() can fail on allocation
  enc->SetQuality(quality);
  enc->SetCompressionMethod(method);
  size_t size = 0;
  if (enc->Encode()) sink.Release(out_data, &size);
  delete enc;
  return size;
}
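
// Typical use of the call above (illustrative sketch; 'rgb', 'width' and
// 'height' are assumed to describe a packed RGB buffer):
//
//   uint8_t* jpeg_data = nullptr;
//   const size_t jpeg_size =
//       SjpegEncode(rgb, width, height, 3 * width, &jpeg_data,
//                   75.f, 4 /* default method */, SJPEG_YUV_AUTO);
//   if (jpeg_size > 0) {
//     ...   // use jpeg_data[0 .. jpeg_size - 1]
//   }
//   SjpegFreeBuffer(jpeg_data);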

////////////////////////////////////////////////////////////////////////////////

size_t SjpegCompress(const uint8_t* rgb, int width, int height, float quality,
                     uint8_t** out_data) {
  return SjpegEncode(rgb, width, height, 3 * width, out_data,
                     quality, 4, SJPEG_YUV_AUTO);
}

void SjpegFreeBuffer(const uint8_t* buffer) {
  delete[] buffer;
}

////////////////////////////////////////////////////////////////////////////////

uint32_t SjpegVersion() {
  return SJPEG_VERSION;
}

////////////////////////////////////////////////////////////////////////////////
// Parametrized call

EncoderParam::EncoderParam() : search_hook(nullptr), memory(nullptr) {
  Init(kDefaultQuality);
}

EncoderParam::EncoderParam(float quality_factor)
    : search_hook(nullptr), memory(nullptr) {
  Init(quality_factor);
}

void EncoderParam::Init(float quality_factor) {
  Huffman_compress = true;
  adaptive_quantization = true;
  use_trellis = false;
  yuv_mode = SJPEG_YUV_AUTO;
  quantization_bias = kDefaultBias;
  qdelta_max_luma = kDefaultDeltaMaxLuma;
  qdelta_max_chroma = kDefaultDeltaMaxChroma;
  adaptive_bias = false;
  SetLimitQuantization(false);
  min_quant_tolerance_ = 0;
  SetQuality(quality_factor);
  target_mode = TARGET_NONE;
  target_value = 0;
  passes = 1;
  tolerance = 1.;
  qmin = 0.;
  qmax = 100.;
}

void EncoderParam::SetQuality(float quality_factor) {
  const float q = GetQFactor(quality_factor);
  sjpeg::SetQuantMatrix(kDefaultMatrices[0], q, quant_[0]);
  sjpeg::SetQuantMatrix(kDefaultMatrices[1], q, quant_[1]);
}

void EncoderParam::SetQuantization(const uint8_t m[2][64], float reduction) {
  if (m == nullptr) return;
  if (reduction <= 1.f) reduction = 1.f;
  for (int c = 0; c < 2; ++c) {
    for (size_t i = 0; i < 64; ++i) {
      const int v = static_cast<int>(m[c][i] * 100. / reduction + .5);
      quant_[c][i] = (v > 255) ? 255u : (v < 1) ? 1u : v;  // clamp to [1, 255]
    }
  }
}
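
// For instance (illustrative): with reduction = 50, each matrix entry is
// scaled by 100 / 50 = 2x before clamping, so an entry of 16 becomes 32
// (coarser quantization, i.e. lower quality).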

void EncoderParam::SetLimitQuantization(bool limit_quantization,
                                        int min_quant_tolerance) {
  use_min_quant_ = limit_quantization;
  if (limit_quantization) SetMinQuantization(quant_, min_quant_tolerance);
}

void EncoderParam::SetMinQuantization(const uint8_t m[2][64],
                                      int min_quant_tolerance) {
  use_min_quant_ = true;
  CopyQuantMatrix(m[0], min_quant_[0]);
  CopyQuantMatrix(m[1], min_quant_[1]);
  min_quant_tolerance_ = (min_quant_tolerance < 0) ? 0
                       : (min_quant_tolerance > 100) ? 100
                       : min_quant_tolerance;
}

void EncoderParam::ResetMetadata() {
  iccp.clear();
  exif.clear();
  xmp.clear();
  app_markers.clear();
}

bool Encoder::InitFromParam(const EncoderParam& param) {
  SetQuantMatrices(param.quant_);
  if (param.use_min_quant_) {
    SetMinQuantMatrices(param.min_quant_, param.min_quant_tolerance_);
  } else {
    SetDefaultMinQuantMatrices();
  }

  // Map the boolean flags to a compression method index (methods 7 and 8
  // being the trellis-based variants).
  int method = param.Huffman_compress ? 1 : 0;
  if (param.adaptive_quantization) method += 3;
  if (param.use_trellis) {
    method = (method == 4) ? 7 : (method == 6) ? 8 : method;
  }

  SetCompressionMethod(method);
  SetQuantizationBias(param.quantization_bias, param.adaptive_bias);
  SetQuantizationDeltas(param.qdelta_max_luma, param.qdelta_max_chroma);

  SetMetadata(param.iccp, Encoder::ICC);
  SetMetadata(param.exif, Encoder::EXIF);
  SetMetadata(param.xmp, Encoder::XMP);
  SetMetadata(param.app_markers, Encoder::MARKERS);

  passes_ = (param.passes < 1) ? 1 : (param.passes > 20) ? 20 : param.passes;
  if (passes_ > 1) {
    use_extra_memory_ = true;
    reuse_run_levels_ = true;
    search_hook_ = (param.search_hook == nullptr) ? &default_hook_
                                                  : param.search_hook;
    if (!search_hook_->Setup(param)) return false;
  }

  memory_hook_ = (param.memory == nullptr) ? &kDefaultMemory : param.memory;
  return true;
}

bool sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
                   const EncoderParam& param, ByteSink* sink) {
  if (rgb == nullptr || sink == nullptr) return false;
  if (width <= 0 || height <= 0 || stride < 3 * width) return false;

  Encoder* const enc = EncoderFactory(rgb, width, height, stride,
                                      param.yuv_mode, sink);
  const bool ok = (enc != nullptr) &&
                  enc->InitFromParam(param) &&
                  enc->Encode();
  delete enc;
  return ok;
}

size_t sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
                     const EncoderParam& param, uint8_t** out_data) {
  MemorySink sink(width * height / 4);    // estimation of the output size
  if (!sjpeg::Encode(rgb, width, height, stride, param, &sink)) return 0;
  size_t size;
  sink.Release(out_data, &size);
  return size;
}

////////////////////////////////////////////////////////////////////////////////
// std::string variants

bool sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
                   const EncoderParam& param, std::string* output) {
  if (output == nullptr) return false;
  output->clear();
  output->reserve(width * height / 4);
  StringSink sink(output);
  return Encode(rgb, width, height, stride, param, &sink);
}

bool SjpegCompress(const uint8_t* rgb, int width, int height,
                   float quality, std::string* output) {
  EncoderParam param;
  param.SetQuality(quality);
  return Encode(rgb, width, height, 3 * width, param, output);
}
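
// Typical use of the std::string variant (illustrative sketch; 'rgb', 'width'
// and 'height' are assumed to describe a packed RGB buffer):
//
//   sjpeg::EncoderParam param(80.f);
//   param.use_trellis = true;     // e.g., turn on trellis quantization
//   std::string jpeg;
//   if (sjpeg::Encode(rgb, width, height, 3 * width, param, &jpeg)) {
//     ...   // 'jpeg' now holds the compressed bitstream
//   }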

////////////////////////////////////////////////////////////////////////////////

bool SjpegDimensions(const std::string& jpeg_data,
                     int* width, int* height, int* is_yuv420) {
  return SjpegDimensions(
      reinterpret_cast<const uint8_t*>(jpeg_data.data()),
      jpeg_data.size(), width, height, is_yuv420);
}

int SjpegFindQuantizer(const std::string& jpeg_data,
                       uint8_t quant[2][64]) {
  return SjpegFindQuantizer(
      reinterpret_cast<const uint8_t*>(jpeg_data.data()), jpeg_data.size(),
      quant);
}

////////////////////////////////////////////////////////////////////////////////