1 // Copyright 2017 Google Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Fast and simple JPEG encoder
16 //
17 // Author: Skal (pascal.massimino@gmail.com)
18
19 #include <stdlib.h>
20 #include <math.h>
21 #include <float.h> // for FLT_MAX
22 #include <stdint.h>
23
24 #define SJPEG_NEED_ASM_HEADERS
25 #include "sjpegi.h"
26
27 using namespace sjpeg;
28
// Some general default values:
static const float kDefaultQuality = 75.f;    // quality used by SetQuality()
static const int kDefaultMethod = 4;          // see SetCompressionMethod()
// Rounding bias for AC coefficients, as 8bit fixed point.
// A default value 0x78 leans toward filesize reduction.
static const int32_t kDefaultBias = 0x78;
// for adaptive quantization:
static const int kDefaultDeltaMaxLuma = 12;
static const int kDefaultDeltaMaxChroma = 1;

// finer tuning of perceptual optimizations:

// Minimum average number of entries per bin required for performing histogram-
// -based optimization. Below this limit, the channel's histogram is declared
// under-populated and the corresponding optimization skipped.
// NOTE(review): appears to never be modified at runtime — likely could be
// 'const'; verify against the rest of the file before changing.
static double kDensityThreshold = 0.5;
// Rejection limit on the correlation factor when extrapolating the distortion
// from histograms. If the least-square fit has a squared correlation factor
// less than this threshold, the corresponding quantization scale will be
// kept unchanged.
// NOTE(review): same remark as above — candidate for 'const'.
static double kCorrelationThreshold = 0.5;
// Bit-map of channels to omit during quantization matrix optimization.
// If the bit 'i + 8 * j' is set in this bit field, the matrix entry at
// position (i,j) will be kept unchanged during optimization.
// The default value is 0x103 = 1 + 2 + 256: the 3 entries in the top-left
// corner (with lowest-frequency) are not optimized, since it can lead to
// visual degradation of smooth gradients.
static const uint64_t kOmittedChannels = 0x0000000000000103ULL;
57
58 ////////////////////////////////////////////////////////////////////////////////
59
60 namespace sjpeg {
61
// Zigzag scan order: kZigzag[i] is the raster-order index (row * 8 + col) of
// the i-th coefficient visited when traversing an 8x8 block in zigzag order.
const uint8_t kZigzag[64] = {
  0, 1, 8, 16, 9, 2, 3, 10,
  17, 24, 32, 25, 18, 11, 4, 5,
  12, 19, 26, 33, 40, 48, 41, 34,
  27, 20, 13, 6, 7, 14, 21, 28,
  35, 42, 49, 56, 57, 50, 43, 36,
  29, 22, 15, 23, 30, 37, 44, 51,
  58, 59, 52, 45, 38, 31, 39, 46,
  53, 60, 61, 54, 47, 55, 62, 63,
};
72
// Default quantization matrices, in raster order. Index [0] is luma,
// index [1] is chroma.
const uint8_t kDefaultMatrices[2][64] = {
  // these are the default luma/chroma matrices (JPEG spec section K.1)
  { 16, 11, 10, 16, 24, 40, 51, 61,
    12, 12, 14, 19, 26, 58, 60, 55,
    14, 13, 16, 24, 40, 57, 69, 56,
    14, 17, 22, 29, 51, 87, 80, 62,
    18, 22, 37, 56, 68, 109, 103, 77,
    24, 35, 55, 64, 81, 104, 113, 92,
    49, 64, 78, 87, 103, 121, 120, 101,
    72, 92, 95, 98, 112, 100, 103, 99 },
  { 17, 18, 24, 47, 99, 99, 99, 99,
    18, 21, 26, 66, 99, 99, 99, 99,
    24, 26, 56, 99, 99, 99, 99, 99,
    47, 66, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99 }
};
92
// Maps a user-level quality (q <= 0: worst .. 100: best) to the percentage
// scaling factor applied to the quantization matrices.
float GetQFactor(float q) {
  // Same mapping as jpeg-6b, for coherency.
  float factor;
  if (q <= 0.f) {
    factor = 5000.f;
  } else if (q < 50.f) {
    factor = 5000.f / q;
  } else if (q < 100.f) {
    factor = 2.f * (100.f - q);
  } else {
    factor = 0.f;
  }
  // We floor-round to integer here just to preserve compatibility with jpeg6b.
  return floorf(factor);
}
99
// Copies the 64 entries of quantization matrix 'in' into 'out'.
void CopyQuantMatrix(const uint8_t in[64], uint8_t out[64]) {
  for (size_t i = 0; i < 64; ++i) out[i] = in[i];
}
103
// Scales matrix 'in' by 'q_factor' (a percentage, see GetQFactor()) into
// 'out', rounding to nearest and clamping each entry to the legal [1, 255]
// quantizer range. No-op if either pointer is null.
void SetQuantMatrix(const uint8_t in[64], float q_factor, uint8_t out[64]) {
  if (in == nullptr || out == nullptr) return;
  const float scale = q_factor / 100.f;
  for (size_t i = 0; i < 64; ++i) {
    const int v = static_cast<int>(in[i] * scale + .5f);
    // clamp to prevent illegal quantizer values
    if (v < 1) {
      out[i] = 1u;
    } else if (v > 255) {
      out[i] = 255u;
    } else {
      out[i] = v;
    }
  }
}
113
// Derives the minimum allowed quantization matrix 'out' from matrix 'm' by
// scaling it down by a factor (256 - tolerance) / 256, clamping the result
// to the legal [1, 255] range.
void SetMinQuantMatrix(const uint8_t m[64], uint8_t out[64], int tolerance) {
  assert(out != nullptr && m != nullptr);
  const int scale = 256 - tolerance;
  for (size_t i = 0; i < 64; ++i) {
    const int v = (m[i] * scale) >> 8;
    if (v < 1) {
      out[i] = 1u;
    } else if (v > 255) {
      out[i] = 255u;
    } else {
      out[i] = v;
    }
  }
}
121
// Fills 'out' with the most permissive minimum quantization matrix
// (all entries = 1).
void SetDefaultMinQuantMatrix(uint8_t out[64]) {
  assert(out != nullptr);
  memset(out, 1, 64 * sizeof(out[0]));
}
126
127 ////////////////////////////////////////////////////////////////////////////////
128 // Default memory manager (singleton)
129
// Default MemoryManager singleton: a thin wrapper around malloc()/free(),
// used when the client does not install a custom allocator.
static struct DefaultMemory : public MemoryManager {
 public:
  virtual ~DefaultMemory() {}
  virtual void* Alloc(size_t size) { return malloc(size); }
  virtual void Free(void* const ptr) { free(ptr); }
} kDefaultMemory;
136
137 ////////////////////////////////////////////////////////////////////////////////
138 // Encoder main class
139
// Constructs an encoder for a W x H RGB image. 'rgb' points to the top-left
// pixel, with rows spaced 'step' bytes apart; the compressed stream is
// written to 'sink'.
Encoder::Encoder(int W, int H, int step, const uint8_t* const rgb,
                 ByteSink* const sink)
    : W_(W), H_(H), step_(step),
      rgb_(rgb),
      ok_(true),
      bw_(sink),
      in_blocks_base_(nullptr),
      in_blocks_(nullptr),
      have_coeffs_(false),
      all_run_levels_(nullptr),
      nb_run_levels_(0),
      max_run_levels_(0),
      qdelta_max_luma_(kDefaultDeltaMaxLuma),
      qdelta_max_chroma_(kDefaultDeltaMaxChroma),
      passes_(1),
      search_hook_(nullptr),
      memory_hook_(&kDefaultMemory) {
  // Apply the documented defaults for method, quality, YUV mode and bias.
  SetCompressionMethod(kDefaultMethod);
  SetQuality(kDefaultQuality);
  SetYUVFormat(false);
  SetQuantizationBias(kDefaultBias, false);
  SetDefaultMinQuantMatrices();
  // One-time (lazy) selection of the C/SIMD implementations.
  InitializeStaticPointers();
  memset(dc_codes_, 0, sizeof(dc_codes_));   // safety
  memset(ac_codes_, 0, sizeof(ac_codes_));
}
166
Encoder::~Encoder() {
  Free(all_run_levels_);
  DesallocateBlocks();   // clean up leftovers in case an error occurred
}
171
172 ////////////////////////////////////////////////////////////////////////////////
173
SetQuality(float q)174 void Encoder::SetQuality(float q) {
175 q = GetQFactor(q);
176 SetQuantMatrix(kDefaultMatrices[0], q, quants_[0].quant_);
177 SetQuantMatrix(kDefaultMatrices[1], q, quants_[1].quant_);
178 }
179
SetQuantMatrices(const uint8_t m[2][64])180 void Encoder::SetQuantMatrices(const uint8_t m[2][64]) {
181 SetQuantMatrix(m[0], 100, quants_[0].quant_);
182 SetQuantMatrix(m[1], 100, quants_[1].quant_);
183 }
184
SetMinQuantMatrices(const uint8_t m[2][64],int tolerance)185 void Encoder::SetMinQuantMatrices(const uint8_t m[2][64], int tolerance) {
186 SetMinQuantMatrix(m[0], quants_[0].min_quant_, tolerance);
187 SetMinQuantMatrix(m[1], quants_[1].min_quant_, tolerance);
188 }
189
SetDefaultMinQuantMatrices()190 void Encoder::SetDefaultMinQuantMatrices() {
191 SetDefaultMinQuantMatrix(quants_[0].min_quant_);
192 SetDefaultMinQuantMatrix(quants_[1].min_quant_);
193 }
194
// Maps the public compression method (0 = fastest .. 8 = best) onto the
// internal feature flags:
//   method          0  1  2  3  4  5  6  7  8
//   adaptive quant  .  .  .  x  x  x  x  x  x
//   size optim      .  x  x  .  x  x  x  x  x
//   extra memory    .  .  .  x  x  .  .  x  .
//   reuse run/lvl   .  x  .  .  x  x  .  x  x
//   trellis quant   .  .  .  .  .  .  .  x  x
void Encoder::SetCompressionMethod(int method) {
  assert(method >= 0 && method <= 8);
  use_adaptive_quant_ = (method >= 3);
  optimize_size_ = (method != 0) && (method != 3);
  use_extra_memory_ = (method == 3) || (method == 4) || (method == 7);
  reuse_run_levels_ = (method == 1) || (method == 4) || (method == 5)
                   || (method >= 7);
  use_trellis_ = (method >= 7);
}
204
SetMetadata(const std::string & data,MetadataType type)205 void Encoder::SetMetadata(const std::string& data, MetadataType type) {
206 switch (type) {
207 case ICC: iccp_ = data; break;
208 case EXIF: exif_ = data; break;
209 case XMP: xmp_ = data; break;
210 default:
211 case MARKERS: app_markers_ = data; break;
212 }
213 }
214
SetQuantizationBias(int bias,bool use_adaptive)215 void Encoder::SetQuantizationBias(int bias, bool use_adaptive) {
216 assert(bias >= 0 && bias <= 255);
217 q_bias_ = bias;
218 adaptive_bias_ = use_adaptive;
219 }
220
SetQuantizationDeltas(int qdelta_luma,int qdelta_chroma)221 void Encoder::SetQuantizationDeltas(int qdelta_luma, int qdelta_chroma) {
222 assert(qdelta_luma >= 0 && qdelta_luma <= 255);
223 assert(qdelta_chroma >= 0 && qdelta_chroma <= 255);
224 qdelta_max_luma_ = qdelta_luma;
225 qdelta_max_chroma_ = qdelta_chroma;
226 }
227
228 ////////////////////////////////////////////////////////////////////////////////
229 // CPU support
230
// Undocumented knob (used by tests only): when set, forces the plain-C
// implementations even if SIMD (SSE2/NEON) support is compiled in.
extern bool ForceSlowCImplementation;
bool ForceSlowCImplementation = false;   // undocumented! for tests.
233
// Returns true if the SSE2 code paths can be used: SSE2 support must have
// been compiled in and the slow-C override must not be active.
bool SupportsSSE2() {
#if defined(SJPEG_USE_SSE2)
  return !ForceSlowCImplementation;
#else
  return false;
#endif
}
241
// Returns true if the NEON code paths can be used: NEON support must have
// been compiled in and the slow-C override must not be active.
bool SupportsNEON() {
#if defined(SJPEG_USE_NEON)
  return !ForceSlowCImplementation;
#else
  return false;
#endif
}
249
250 ////////////////////////////////////////////////////////////////////////////////
251 // static pointers to architecture-dependant implementation
252
// These are selected once by InitializeStaticPointers(), which picks the
// plain-C or SIMD implementation depending on CPU support.
Encoder::QuantizeErrorFunc Encoder::quantize_error_ = nullptr;
Encoder::QuantizeBlockFunc Encoder::quantize_block_ = nullptr;
void (*Encoder::fDCT_)(int16_t* in, int num_blocks) = nullptr;
Encoder::StoreHistoFunc Encoder::store_histo_ = nullptr;
RGBToYUVBlockFunc Encoder::get_yuv444_block_ = nullptr;
258
InitializeStaticPointers()259 void Encoder::InitializeStaticPointers() {
260 if (fDCT_ == nullptr) {
261 store_histo_ = GetStoreHistoFunc();
262 quantize_block_ = GetQuantizeBlockFunc();
263 quantize_error_ = GetQuantizeErrorFunc();
264 fDCT_ = GetFdct();
265 get_yuv444_block_ = GetBlockFunc(true);
266 }
267 }
268
269 ////////////////////////////////////////////////////////////////////////////////
270 // memory and internal buffers management. We grow on demand.
271
SetError()272 bool Encoder::SetError() {
273 ok_ = false;
274 return false;
275 }
276
// Ensures the byte-sink and (if run/levels are reused) the run/level storage
// can absorb one more macroblock, growing the latter geometrically on demand.
// Returns false on failure. NOTE(review): the allocation-failure path returns
// false without touching ok_ directly — presumably Alloc() flags the error
// itself; confirm against its definition.
bool Encoder::CheckBuffers() {
  // maximum macroblock size, worst-case, is 24bits*64*6 coeffs = 1152bytes
  ok_ = ok_ && bw_.Reserve(2048);
  if (!ok_) return false;

  if (reuse_run_levels_) {
    if (nb_run_levels_ + 6*64 > max_run_levels_) {
      // need to grow storage for run/levels
      const size_t new_size = max_run_levels_ ? max_run_levels_ * 2 : 8192;
      RunLevel* const new_rl = Alloc<RunLevel>(new_size);
      if (new_rl == nullptr) return false;
      if (nb_run_levels_ > 0) {
        // preserve the run/levels accumulated so far
        memcpy(new_rl, all_run_levels_,
               nb_run_levels_ * sizeof(new_rl[0]));
      }
      Free(all_run_levels_);
      all_run_levels_ = new_rl;
      max_run_levels_ = new_size;
      // doubling from >= 8192 always leaves room for one more macroblock:
      assert(nb_run_levels_ + 6 * 64 <= max_run_levels_);
    }
  }
  return true;
}
300
// Allocates storage for 'num_blocks' 8x8 coefficient blocks (int16_t each).
// in_blocks_base_ keeps the raw pointer for later Free(); in_blocks_ is the
// aligned working pointer (ALIGN_CST is assumed to be 2^k - 1 so that the
// mask below rounds up to the next multiple of ALIGN_CST + 1).
bool Encoder::AllocateBlocks(size_t num_blocks) {
  assert(in_blocks_ == nullptr);
  have_coeffs_ = false;
  const size_t size = num_blocks * 64 * sizeof(*in_blocks_);
  in_blocks_base_ = Alloc<uint8_t>(size + ALIGN_CST);
  if (in_blocks_base_ == nullptr) return false;
  in_blocks_ = reinterpret_cast<int16_t*>(
      (ALIGN_CST + reinterpret_cast<uintptr_t>(in_blocks_base_)) & ~ALIGN_CST);
  return true;
}
311
// Releases the coefficient-block storage and clears both pointers, making a
// repeated call harmless (the destructor calls this unconditionally).
void Encoder::DesallocateBlocks() {
  Free(in_blocks_base_);
  in_blocks_base_ = nullptr;
  in_blocks_ = nullptr;   // sanity
}
317
318 ////////////////////////////////////////////////////////////////////////////////
319
#define FP_BITS 16    // fractional precision for fixed-point dividors
#define AC_BITS 4     // extra precision bits from fdct's scaling
#define BIAS_DC 0x80  // neutral bias for DC (mandatory!)

// divide-by-multiply helper macros:
// MAKE_INV_QUANT(Q): rounded fixed-point reciprocal of Q in FP_BITS precision.
// DIV_BY_MULT(A, M): approximates A / Q using the precomputed M = inv(Q).
// QUANTIZE(A, M, B): quantizes the biased value (A + B) and drops the extra
//                    AC_BITS scaling coming from the fdct.
#define MAKE_INV_QUANT(Q) (((1u << FP_BITS) + (Q) / 2) / (Q))
#define DIV_BY_MULT(A, M) (((A) * (M)) >> FP_BITS)
#define QUANTIZE(A, M, B) (DIV_BY_MULT((A) + (B), (M)) >> AC_BITS)
328
// Precomputes, for each of the 64 entries of 'q', the fixed-point reciprocal
// (iquant_), rescaled rounding bias (bias_) and quantization threshold
// (qthresh_) consumed by the QUANTIZE() macro. 'q_bias' is the AC rounding
// bias in 8-bit fixed point; DC always uses the neutral BIAS_DC.
void Encoder::FinalizeQuantMatrix(Quantizer* const q, int q_bias) {
  // first, clamp the quant matrix:
  for (size_t i = 0; i < 64; ++i) {
    if (q->quant_[i] < q->min_quant_[i]) q->quant_[i] = q->min_quant_[i];
  }
  // Special case! for v=1 we can't represent the multiplier with 16b precision.
  // So, instead we max out the multiplier to 0xffffu, and twist the bias to the
  // value 0x80. The overall precision isn't affected: it's bit-exact the same
  // for our working range.
  // Note that quant=1 can start appearing at quality as low as 93.
  const uint16_t bias_1 = 0x80;
  const uint16_t iquant_1 = 0xffffu;
  for (size_t i = 0; i < 64; ++i) {
    const uint16_t v = q->quant_[i];
    const uint16_t iquant = (v == 1) ? iquant_1 : MAKE_INV_QUANT(v);
    const uint16_t bias = (v == 1) ? bias_1 : (i == 0) ? BIAS_DC : q_bias;
    // rescale the 8-bit bias to the fdct's AC_BITS precision, rounding:
    const uint16_t ibias = (((bias * v) << AC_BITS) + 128) >> 8;
    // smallest (absolute) input that quantizes to a non-zero value, as the
    // two asserts below verify:
    const uint16_t qthresh =
        ((1 << (FP_BITS + AC_BITS)) + iquant - 1) / iquant - ibias;
    q->bias_[i] = ibias;
    q->iquant_[i] = iquant;
    q->qthresh_[i] = qthresh;
    assert(QUANTIZE(qthresh, iquant, ibias) > 0);
    assert(QUANTIZE(qthresh - 1, iquant, ibias) == 0);
  }
}
355
// Connects quantizer 'idx' to its AC Huffman code table, used for bit-cost
// evaluation (e.g. by the trellis quantizer through Q->codes_).
void Encoder::SetCostCodes(int idx) {
  quants_[idx].codes_ = ac_codes_[idx];
}
359
360 ////////////////////////////////////////////////////////////////////////////////
361 // standard Huffman tables, as per JPEG standard section K.3.
362
// DC symbols (size categories 0..11), shared by luma and chroma.
static const uint8_t kDCSyms[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
// AC (run, size) symbols in increasing code-length order: [0] = luma,
// [1] = chroma (JPEG spec section K.3).
static const uint8_t kACSyms[2][162] = {
  { 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
    0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
    0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
    0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
    0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
    0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
    0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
    0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
    0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
    0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
    0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
    0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
    0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
    0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
    0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
    0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
    0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
    0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
    0xf9, 0xfa },
  { 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
    0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
    0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
    0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
    0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
    0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
    0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
    0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
    0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
    0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
    0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
    0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
    0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
    0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
    0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
    0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
    0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
    0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
    0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
    0xf9, 0xfa }
};
408
// The four standard tables: [0] = luma DC, [1] = chroma DC, [2] = luma AC,
// [3] = chroma AC. Each entry holds the per-length code counts (bits_), the
// symbol list, and the total symbol count.
static const HuffmanTable kHuffmanTables[4] = {
  { { 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 }, kDCSyms, 12 },
  { { 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 }, kDCSyms, 12 },
  { { 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 125 }, kACSyms[0], 162 },
  { { 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 119 }, kACSyms[1], 162 }
};
415
////////////////////////////////////////////////////////////////////////////////
// This function generates a map from symbols to code + len stored in a packed
// way (the lower 16 bits hold the length, the upper 16 bits hold the VLC).
// The input is a JPEG-like description of the symbols:
// - bits[i] stores the number of codes having length i + 1.
// - symbols[] contains the symbols' map, in increasing bit-length order.
// No check is performed on the validity of symbols[]'s content.
// The values of tab[] not referring to an actual symbol will remain unchanged.
// Returns the number of symbols used (that is: sum{bits[i]}).
425
// Builds the canonical Huffman code map described by 'bits' / 'symbols' into
// 'tab' (packed as (code << 16) | length). Returns the number of symbols.
static int BuildHuffmanTable(const uint8_t bits[16], const uint8_t* symbols,
                             uint32_t* const tab) {
  uint32_t vlc = 0;       // current canonical code value
  int total_syms = 0;
  for (int len = 1; len <= 16; ++len) {
    const int count = bits[len - 1];   // number of codes of length 'len'
    total_syms += count;
    for (int k = 0; k < count; ++k, ++vlc) {
      tab[*symbols++] = (vlc << 16) | len;
    }
    vlc <<= 1;   // next length: codes are twice as long
  }
  return total_syms;
}
441
442 ////////////////////////////////////////////////////////////////////////////////
443
InitCodes(bool only_ac)444 void Encoder::InitCodes(bool only_ac) {
445 const int nb_tables = (nb_comps_ == 1 ? 1 : 2);
446 for (int c = 0; c < nb_tables; ++c) { // luma, chroma
447 for (int type = (only_ac ? 1 : 0); type <= 1; ++type) {
448 const HuffmanTable* const h = Huffman_tables_[type * 2 + c];
449 const int nb_syms = BuildHuffmanTable(h->bits_, h->syms_,
450 type == 1 ? ac_codes_[c]
451 : dc_codes_[c]);
452 assert(nb_syms == h->nb_syms_);
453 (void)nb_syms;
454 }
455 }
456 }
457
458 ////////////////////////////////////////////////////////////////////////////////
459 // Quantize coefficients and pseudo-code coefficients
460
// Returns the bit-length of v (= 1 + floor(log2(v))) for v > 0.
static int CalcLog2(int v) {
#if defined(__GNUC__) && \
    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
  return 32 - __builtin_clz(v);
#else
  // table-based fallback, nibble by nibble
  static const int kLog2[16] = {
    0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4 };
  assert(v > 0 && v < (1 << 12));
  if (v & ~0xff) return 8 + kLog2[v >> 8];
  if (v & ~0x0f) return 4 + kLog2[v >> 4];
  return kLog2[v];
#endif
}
474
// Returns the packed code for the DC difference (JPEG spec section F.1.2.1):
// the lower 4 bits hold the size category 'n', the upper bits hold the 'n'
// suffix bits (this matches how dc_code_ is consumed by CodeBlock()).
// *DC_predictor is updated to the new DC value.
uint16_t Encoder::GenerateDCDiffCode(int DC, int* const DC_predictor) {
  const int diff = DC - *DC_predictor;
  *DC_predictor = DC;
  if (diff == 0) {
    return 0;
  }
  int suff, n;
  if (diff < 0) {
    n = CalcLog2(-diff);
    // negative differences are coded as their n-bit one's complement:
    suff = (diff - 1) & ((1 << n) - 1);
  } else {
    n = CalcLog2(diff);
    suff = diff;
  }
  assert((suff & 0xf000) == 0);   // suffix fits in 12 bits
  assert(n < 12);
  return n | (suff << 4);
}
493
494 ////////////////////////////////////////////////////////////////////////////////
495 // various implementation of histogram collection
496
497 #if defined(SJPEG_USE_SSE2)
498 // Load eight 16b-words from *src.
499 #define LOAD_16(src) _mm_loadu_si128(reinterpret_cast<const __m128i*>(src))
500 // Store eight 16b-words into *dst
501 #define STORE_16(V, dst) _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), (V))
502
// SSE2 version of QuantizeBlock(): first quantizes all 64 coefficients at
// once into tmp[] (absolute values) and masked[] (sign-applied values), then
// scans the AC coefficients in zigzag order to emit the run/level pairs.
// Returns the signed quantized DC value.
static int QuantizeBlockSSE2(const int16_t in[64], int idx,
                             const Quantizer* const Q,
                             DCTCoeffs* const out, RunLevel* const rl) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  int prev = 1;
  int nb = 0;
  int16_t tmp[64], masked[64];
  for (int i = 0; i < 64; i += 8) {
    const __m128i m_bias = LOAD_16(bias + i);
    const __m128i m_mult = LOAD_16(iquant + i);
    const __m128i A = LOAD_16(in + i);        // A = in[i]
    const __m128i B = _mm_srai_epi16(A, 15);  // sign extract
    const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B);   // abs(A)
    const __m128i D = _mm_adds_epi16(C, m_bias);    // v' = v + bias
    const __m128i E = _mm_mulhi_epu16(D, m_mult);   // (v' * iq) >> 16
    const __m128i F = _mm_srli_epi16(E, AC_BITS);   // = QUANTIZE(...)
    const __m128i G = _mm_xor_si128(F, B);          // v ^ mask
    STORE_16(F, tmp + i);
    STORE_16(G, masked + i);
  }
  for (int i = 1; i < 64; ++i) {
    const int j = kZigzag[i];
    const int v = tmp[j];
    if (v > 0) {
      const int n = CalcLog2(v);
      // pack suffix bits (upper) and their count n (lower 4 bits):
      const uint16_t code = masked[j] & ((1 << n) - 1);
      rl[nb].level_ = (code << 4) | n;
      rl[nb].run_ = i - prev;
      prev = i + 1;
      ++nb;
    }
  }
  const int dc = (in[0] < 0) ? -tmp[0] : tmp[0];   // re-apply the DC sign
  out->idx_ = idx;
  out->last_ = prev - 1;
  out->nb_coeffs_ = nb;
  return dc;
}
542 #undef LOAD_16
543 #undef STORE_16
544
545 #elif defined(SJPEG_USE_NEON)
// NEON version of QuantizeBlock(): same two-pass structure as the SSE2
// version — vector-quantize all coefficients into tmp[]/masked[], then scan
// the AC coefficients in zigzag order for run/level pairs. Returns the
// signed quantized DC value.
static int QuantizeBlockNEON(const int16_t in[64], int idx,
                             const Quantizer* const Q,
                             DCTCoeffs* const out, RunLevel* const rl) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  int prev = 1;
  int nb = 0;
  uint16_t tmp[64], masked[64];
  for (int i = 0; i < 64; i += 8) {
    const uint16x8_t m_bias = vld1q_u16(bias + i);
    const uint16x8_t m_mult = vld1q_u16(iquant + i);
    const int16x8_t A = vld1q_s16(in + i);                       // in[i]
    const uint16x8_t B = vreinterpretq_u16_s16(vabsq_s16(A));    // abs(in)
    const int16x8_t sign = vshrq_n_s16(A, 15);                   // sign
    const uint16x8_t C = vaddq_u16(B, m_bias);                   // + bias
    const uint32x4_t D0 = vmull_u16(vget_low_u16(C), vget_low_u16(m_mult));
    const uint32x4_t D1 = vmull_u16(vget_high_u16(C), vget_high_u16(m_mult));
    // collect hi-words of the 32b mult result using 'unzip'
    const uint16x8x2_t E = vuzpq_u16(vreinterpretq_u16_u32(D0),
                                     vreinterpretq_u16_u32(D1));
    const uint16x8_t F = vshrq_n_u16(E.val[1], AC_BITS);
    const uint16x8_t G = veorq_u16(F, vreinterpretq_u16_s16(sign));  // v ^ mask
    vst1q_u16(tmp + i, F);
    vst1q_u16(masked + i, G);
  }
  for (int i = 1; i < 64; ++i) {
    const int j = kZigzag[i];
    const int v = tmp[j];
    if (v > 0) {
      const int n = CalcLog2(v);
      // pack suffix bits (upper) and their count n (lower 4 bits):
      const uint16_t code = masked[j] & ((1 << n) - 1);
      rl[nb].level_ = (code << 4) | n;
      rl[nb].run_ = i - prev;
      prev = i + 1;
      ++nb;
    }
  }
  const int dc = (in[0] < 0) ? -tmp[0] : tmp[0];   // re-apply the DC sign
  out->idx_ = idx;
  out->last_ = prev - 1;
  out->nb_coeffs_ = nb;
  return dc;
}
589 #endif // SJPEG_USE_NEON
590
// Plain-C quantization of one 8x8 coefficient block: fills rl[] with the
// (run, level) pairs of the non-zero AC coefficients (zigzag order), fills
// 'out' with idx/last/nb_coeffs, and returns the signed quantized DC value.
static int QuantizeBlock(const int16_t in[64], int idx,
                         const Quantizer* const Q,
                         DCTCoeffs* const out, RunLevel* const rl) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  int prev = 1;
  int nb = 0;
  // This function is speed-critical, so we're using some bit mask
  // to extract absolute values, instead of sign tests.
  const uint16_t* const qthresh = Q->qthresh_;
  for (int i = 1; i < 64; ++i) {
    const int j = kZigzag[i];
    int v = in[j];
    const int32_t mask = v >> 31;     // mask = -1 if v < 0, else 0
    v = (v ^ mask) - mask;            // v = abs(in[j])
    if (v >= qthresh[j]) {            // below threshold quantizes to 0: skip
      v = QUANTIZE(v, iquant[j], bias[j]);
      assert(v > 0);
      const int n = CalcLog2(v);
      // pack the n suffix bits (upper) and their count n (lower 4 bits):
      const uint16_t code = (v ^ mask) & ((1 << n) - 1);
      rl[nb].level_ = (code << 4) | n;
      rl[nb].run_ = i - prev;
      prev = i + 1;
      ++nb;
    }
  }
  const int dc = (in[0] < 0) ? -QUANTIZE(-in[0], iquant[0], bias[0])
                             : QUANTIZE(in[0], iquant[0], bias[0]);
  out->idx_ = idx;
  out->last_ = prev - 1;
  out->nb_coeffs_ = nb;
  return dc;
}
624
625 ////////////////////////////////////////////////////////////////////////////////
626 // Trellis-based quantization
627
typedef uint32_t score_t;
static const score_t kMaxScore = 0xffffffffu;

// One node of the rate-distortion trellis: a candidate quantized level at a
// given zigzag position.
struct TrellisNode {
  uint32_t code;     // suffix bits of the candidate level
  int nbits;         // number of suffix bits (JPEG 'size' category)
  score_t score;     // cumulated score: disto + lambda * bits (kMaxScore = unset)
  uint32_t disto;    // cumulated distortion down this path
  uint32_t bits;     // bit cost of reaching this node from best_prev
  uint32_t run;      // number of zero coefficients preceding this one
  const TrellisNode* best_prev;   // best predecessor found by SearchBestPrev()
  int pos;           // zigzag position of the coefficient (0 for the sink)
  int rank;          // number of non-zero coefficients along this path

  TrellisNode() : score(kMaxScore), best_prev(nullptr) {}
  // Initializes the source ("sink") node the trellis search starts from.
  void InitSink() {
    score = 0u;
    disto = 0;
    pos = 0;
    rank = 0;
    nbits = 0;
    bits = 0;
  }
};
652
// Finds, among the already-settled nodes [nodes0 .. node-1], the predecessor
// of 'node' that minimizes the cumulated score disto + lambda * bits.
// disto0[i] is the total distortion of zeroing coefficients 1..i, so the
// cost of the zero-run between two nodes is a difference of disto0[] values.
// Returns true if 'node' was updated with a valid predecessor.
static bool SearchBestPrev(const TrellisNode* const nodes0, TrellisNode* node,
                           const uint32_t disto0[], const uint32_t codes[],
                           uint32_t lambda) {
  bool found = false;
  assert(codes[0xf0] != 0);
  const uint32_t base_disto = node->disto + disto0[node->pos - 1];
  for (const TrellisNode* cur = node - 1; cur >= nodes0; --cur) {
    const int run = node->pos - 1 - cur->pos;
    if (run < 0) continue;   // candidate shares the same position: not a prev
    uint32_t bits = node->nbits;                 // suffix bits of the level
    bits += (run >> 4) * (codes[0xf0] & 0xff);   // ZRL symbols for runs >= 16
    const uint32_t sym = ((run & 15) << 4) | node->nbits;
    assert(codes[sym] != 0);
    bits += codes[sym] & 0xff;   // code length of the (run, size) symbol
    const uint32_t disto = base_disto - disto0[cur->pos];
    const score_t score = disto + lambda * bits + cur->score;
    if (score < node->score) {
      node->score = score;
      node->disto = disto;
      node->bits = bits;
      node->best_prev = cur;
      node->rank = cur->rank + 1;
      node->run = run;
      found = true;
    }
  }
  return found;
}
681
// number of alternate levels to investigate
#define NUM_TRELLIS_NODES 2

// Rate-distortion (trellis) quantization of one 8x8 block: for each non-zero
// coefficient, up to NUM_TRELLIS_NODES candidate levels are evaluated, and
// the chain of (run, level) pairs minimizing disto + lambda * bits is kept.
// Fills 'out' and rl[] like QuantizeBlock(), and returns the quantized DC.
int Encoder::TrellisQuantizeBlock(const int16_t in[64], int idx,
                                  const Quantizer* const Q,
                                  DCTCoeffs* const out,
                                  RunLevel* const rl) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  TrellisNode nodes[1 + NUM_TRELLIS_NODES * 63];   // 1 sink + n channels
  nodes[0].InitSink();
  const uint32_t* const codes = Q->codes_;
  TrellisNode* cur_node = &nodes[1];
  uint32_t disto0[64];   // disto0[i] = sum of distortions up to i (inclusive)
  disto0[0] = 0;
  for (int i = 1; i < 64; ++i) {
    const int j = kZigzag[i];
    const uint32_t q = Q->quant_[j] << AC_BITS;
    const uint32_t lambda = q * q / 32u;   // rate-distortion multiplier
    int V = in[j];
    const int32_t mask = V >> 31;
    V = (V ^ mask) - mask;                 // V = abs(in[j])
    disto0[i] = V * V + disto0[i - 1];
    int v = QUANTIZE(V, iquant[j], bias[j]);
    if (v == 0) continue;
    int nbits = CalcLog2(v);
    for (int k = 0; k < NUM_TRELLIS_NODES; ++k) {
      const int err = V - v * q;           // reconstruction error for level v
      cur_node->code = (v ^ mask) & ((1 << nbits) - 1);
      cur_node->pos = i;
      cur_node->disto = err * err;
      cur_node->nbits = nbits;
      cur_node->score = kMaxScore;
      if (SearchBestPrev(&nodes[0], cur_node, disto0, codes, lambda)) {
        ++cur_node;
      }
      // next candidate: largest level representable with one bit fewer
      --nbits;
      if (nbits <= 0) break;
      v = (1 << nbits) - 1;
    }
  }
  // search best entry point backward
  const TrellisNode* nz = &nodes[0];
  if (cur_node != nz) {
    score_t best_score = kMaxScore;
    while (cur_node-- != &nodes[0]) {
      // account for the distortion of zeroing everything after this node
      const uint32_t disto = disto0[63] - disto0[cur_node->pos];
      // No need to incorporate EOB's bit cost (codes[0x00]), since
      // it's the same for all coeff except the last one #63.
      cur_node->disto += disto;
      cur_node->score += disto;
      if (cur_node->score < best_score) {
        nz = cur_node;
        best_score = cur_node->score;
      }
    }
  }
  int nb = nz->rank;
  out->idx_ = idx;
  out->last_ = nz->pos;
  out->nb_coeffs_ = nb;

  // walk the best chain backward, filling rl[] in forward order
  while (nb-- > 0) {
    const int32_t code = nz->code;
    const int n = nz->nbits;
    rl[nb].level_ = (code << 4) | n;
    rl[nb].run_ = nz->run;
    nz = nz->best_prev;
  }
  const int dc = (in[0] < 0) ? -QUANTIZE(-in[0], iquant[0], bias[0])
                             : QUANTIZE(in[0], iquant[0], bias[0]);
  return dc;
}
755
// Returns the best available QuantizeBlock implementation for this build
// and CPU (SIMD when compiled in and supported, plain C otherwise).
Encoder::QuantizeBlockFunc Encoder::GetQuantizeBlockFunc() {
#if defined(SJPEG_USE_SSE2)
  if (SupportsSSE2()) return QuantizeBlockSSE2;
#elif defined(SJPEG_USE_NEON)
  if (SupportsNEON()) return QuantizeBlockNEON;
#endif
  return QuantizeBlock;   // default
}
764
765 ////////////////////////////////////////////////////////////////////////////////
766
767 #if defined(SJPEG_USE_SSE2)
768 // Load eight 16b-words from *src.
769 #define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
770 #define LOAD_64(src) _mm_loadl_epi64((const __m128i*)(src))
771 // Store eight 16b-words into *dst
772 #define STORE_16(V, dst) _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), (V))
773
// SSE2 version of QuantizeError(): accumulates, over all 64 coefficients,
// the squared difference between the (AC_BITS-descaled) input and its
// quantized-then-dequantized value.
static uint32_t QuantizeErrorSSE2(const int16_t in[64],
                                  const Quantizer* const Q) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  const uint8_t* const quant = Q->quant_;
  const __m128i zero = _mm_setzero_si128();
  uint32_t tmp[32];
  for (int i = 0; i < 64; i += 8) {
    const __m128i m_bias = LOAD_16(bias + i);
    const __m128i m_iquant = LOAD_16(iquant + i);
    const __m128i m_quant = _mm_unpacklo_epi8(LOAD_64(quant + i), zero);
    const __m128i A = LOAD_16(in + i);        // v0 = in[i]
    const __m128i B = _mm_srai_epi16(A, 15);  // sign extract
    const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B);   // abs(v0)
    const __m128i D = _mm_adds_epi16(C, m_bias);     // v' = v0 + bias
    const __m128i E = _mm_mulhi_epu16(D, m_iquant);  // (v' * iq) >> 16
    const __m128i F = _mm_srai_epi16(E, AC_BITS);    // quantized value
    const __m128i G = _mm_srai_epi16(C, AC_BITS);    // descaled input
    const __m128i H = _mm_mullo_epi16(F, m_quant);   // *= quant[j]
    const __m128i I = _mm_sub_epi16(G, H);
    const __m128i J = _mm_madd_epi16(I, I);          // (v0-v) ^ 2
    STORE_16(J, tmp + i / 2);
  }
  uint32_t err = 0;
  for (int i = 0; i < 32; ++i) err += tmp[i];
  return err;
}
801 #undef LOAD_16
802 #undef LOAD_64
803 #undef STORE_16
804
805 #elif defined(SJPEG_USE_NEON)
806
// NEON version of QuantizeError(): accumulates, over all 64 coefficients,
// the squared difference between the (AC_BITS-descaled) input and its
// quantized-then-dequantized value.
static uint32_t QuantizeErrorNEON(const int16_t in[64],
                                  const Quantizer* const Q) {
  const uint16_t* const bias = Q->bias_;
  const uint16_t* const iquant = Q->iquant_;
  const uint8_t* const quant = Q->quant_;
  uint32x4_t sum1 = vdupq_n_u32(0);
  uint32x4_t sum2 = vdupq_n_u32(0);
  for (int i = 0; i < 64; i += 8) {
    const uint16x8_t m_bias = vld1q_u16(bias + i);
    const uint16x8_t m_mult = vld1q_u16(iquant + i);
    const uint16x8_t m_quant = vmovl_u8(vld1_u8(quant + i));
    const uint16x8_t A = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(in + i)));
    const uint16x8_t B = vaddq_u16(A, m_bias);
    const uint32x4_t C0 = vmull_u16(vget_low_u16(B), vget_low_u16(m_mult));
    const uint32x4_t C1 = vmull_u16(vget_high_u16(B), vget_high_u16(m_mult));
    // collect hi-words of the 32b mult result using 'unzip'
    const uint16x8x2_t D = vuzpq_u16(vreinterpretq_u16_u32(C0),
                                     vreinterpretq_u16_u32(C1));
    const uint16x8_t E = vshrq_n_u16(D.val[1], AC_BITS);
    const uint16x8_t F = vmulq_u16(E, m_quant);   // dequantized coeff
    const uint16x8_t G = vabdq_u16(F, vshrq_n_u16(A, AC_BITS));
    sum1 = vmlal_u16(sum1, vget_low_u16(G), vget_low_u16(G));
    sum2 = vmlal_u16(sum2, vget_high_u16(G), vget_high_u16(G));
  }
  // horizontal reduction of the two partial sums
  const uint32x4_t sum3 = vaddq_u32(sum1, sum2);
  const uint64x2_t sum4 = vpaddlq_u32(sum3);
  const uint64_t sum5 = vgetq_lane_u64(sum4, 0) + vgetq_lane_u64(sum4, 1);
  const uint32_t err = (uint32_t)sum5;
  return err;
}
837
838 #endif // SJPEG_USE_NEON
839
QuantizeError(const int16_t in[64],const Quantizer * const Q)840 static uint32_t QuantizeError(const int16_t in[64], const Quantizer* const Q) {
841 const uint16_t* const bias = Q->bias_;
842 const uint16_t* const iquant = Q->iquant_;
843 const uint8_t* const quant = Q->quant_;
844 uint32_t err = 0;
845 for (int j = 0; j < 64; ++j) {
846 int32_t v0 = (in[j] < 0) ? -in[j] : in[j];
847 const uint32_t v = quant[j] * QUANTIZE(v0, iquant[j], bias[j]);
848 v0 >>= AC_BITS;
849 err += (v0 - v) * (v0 - v);
850 }
851 return err;
852 }
853
Encoder::QuantizeErrorFunc Encoder::GetQuantizeErrorFunc() {
  // Returns the fastest quantization-error implementation available on
  // this CPU (SSE2 or NEON when compiled in and supported at run time).
#if defined(SJPEG_USE_SSE2)
  if (SupportsSSE2()) return QuantizeErrorSSE2;
#elif defined(SJPEG_USE_NEON)
  if (SupportsNEON()) return QuantizeErrorNEON;
#endif
  return QuantizeError;  // default
}
862
863 ////////////////////////////////////////////////////////////////////////////////
864 // Code bitstream
865
ResetDCs()866 void Encoder::ResetDCs() {
867 for (int c = 0; c < nb_comps_; ++c) {
868 DCs_[c] = 0;
869 }
870 }
871
void Encoder::CodeBlock(const DCTCoeffs* const coeffs,
                        const RunLevel* const rl) {
  // Entropy-codes one 8x8 block into the bitstream: the DC symbol first,
  // then the AC run/level pairs, then an end-of-block marker if needed.
  const int idx = coeffs->idx_;
  const int q_idx = quant_idx_[idx];

  // DC coefficient symbol
  const int dc_len = coeffs->dc_code_ & 0x0f;   // category = bit-length
  const uint32_t code = dc_codes_[q_idx][dc_len];
  bw_.PutPackedCode(code);
  if (dc_len > 0) {
    bw_.PutBits(coeffs->dc_code_ >> 4, dc_len);   // amplitude bits
  }

  // AC coeffs
  const uint32_t* const codes = ac_codes_[q_idx];
  for (int i = 0; i < coeffs->nb_coeffs_; ++i) {
    int run = rl[i].run_;
    while (run & ~15) {    // escapes: one ZRL symbol per full run of 16 zeros
      bw_.PutPackedCode(codes[0xf0]);
      run -= 16;
    }
    const uint32_t suffix = rl[i].level_;
    const int n = suffix & 0x0f;          // level's bit-length
    const int sym = (run << 4) | n;       // packed (run, size) symbol
    bw_.PutPackedCode(codes[sym]);
    bw_.PutBits(suffix >> 4, n);          // level's amplitude bits
  }
  if (coeffs->last_ < 63) {  // EOB
    bw_.PutPackedCode(codes[0x00]);
  }
}
903
904 ////////////////////////////////////////////////////////////////////////////////
905 // Histogram
906
void Encoder::ResetHisto() {
  // Clears all DCT-coefficient histograms before a new collection pass.
  memset(histos_, 0, sizeof(histos_));
}
910
911 #if defined(SJPEG_USE_SSE2)
void StoreHistoSSE2(const int16_t in[64], Histo* const histos, int nb_blocks) {
  // SSE2 variant: for each of the 64 positions, increments the histogram
  // bin min(abs(coeff) >> HSHIFT, MAX_HISTO_DCT_COEFF).
  const __m128i kMaxHisto = _mm_set1_epi16(MAX_HISTO_DCT_COEFF);
  for (int n = 0; n < nb_blocks; ++n, in += 64) {
    uint16_t tmp[64];
    for (int i = 0; i < 64; i += 8) {
      const __m128i A =
          _mm_loadu_si128(reinterpret_cast<const __m128i*>(in + i));
      const __m128i B = _mm_srai_epi16(A, 15);                 // sign extract
      const __m128i C = _mm_sub_epi16(_mm_xor_si128(A, B), B);  // abs(A)
      const __m128i D = _mm_srli_epi16(C, HSHIFT);              // >>= HSHIFT
      const __m128i E = _mm_min_epi16(D, kMaxHisto);            // clamp
      _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp + i), E);
    }
    // scatter the clamped values into the per-position histogram bins
    for (int j = 0; j < 64; ++j) {
      const int k = tmp[j];
      ++histos->counts_[j][k];
    }
  }
}
931 #elif defined(SJPEG_USE_NEON)
void StoreHistoNEON(const int16_t in[64], Histo* const histos, int nb_blocks) {
  // NEON variant of StoreHisto(): same clamped-histogram accumulation as
  // the SSE2 version above.
  const uint16x8_t kMaxHisto = vdupq_n_u16(MAX_HISTO_DCT_COEFF);
  for (int n = 0; n < nb_blocks; ++n, in += 64) {
    uint16_t tmp[64];
    for (int i = 0; i < 64; i += 8) {
      const int16x8_t A = vld1q_s16(in + i);
      const int16x8_t B = vabsq_s16(A);                // abs(in)
      const uint16x8_t C = vreinterpretq_u16_s16(B);   // signed->unsigned
      const uint16x8_t D = vshrq_n_u16(C, HSHIFT);     // >>= HSHIFT
      const uint16x8_t E = vminq_u16(D, kMaxHisto);    // min(., kMaxHisto)
      vst1q_u16(tmp + i, E);
    }
    // scatter the clamped values into the per-position histogram bins
    for (int j = 0; j < 64; ++j) {
      const int k = tmp[j];
      ++histos->counts_[j][k];
    }
  }
}
950 #endif
951
// This C-version does not produce the same counts_[] output as the
// assembly above. But the extra entry counts_[MAX_HISTO_DCT_COEFF] is
// not used for the final computation, and the global result is unchanged.
StoreHisto(const int16_t in[64],Histo * const histos,int nb_blocks)955 void StoreHisto(const int16_t in[64], Histo* const histos, int nb_blocks) {
956 for (int n = 0; n < nb_blocks; ++n, in += 64) {
957 for (int i = 0; i < 64; ++i) {
958 const int k = (in[i] < 0 ? -in[i] : in[i]) >> HSHIFT;
959 if (k < MAX_HISTO_DCT_COEFF) {
960 ++histos->counts_[i][k];
961 }
962 }
963 }
964 }
965
Encoder::StoreHistoFunc Encoder::GetStoreHistoFunc() {
  // Returns the fastest histogram-collection implementation available on
  // this CPU (SSE2 or NEON when compiled in and supported at run time).
#if defined(SJPEG_USE_SSE2)
  if (SupportsSSE2()) return StoreHistoSSE2;
#elif defined(SJPEG_USE_NEON)
  if (SupportsNEON()) return StoreHistoNEON;
#endif
  return StoreHisto;  // default
}
974
const float Encoder::kHistoWeight[QSIZE] = {
  // Gaussian window with sigma ~= 3, centered on the middle entry; used
  // to weight samples during the least-square fits in AnalyseHisto().
  0, 0, 0, 0, 0,
  1, 5, 16, 43, 94, 164, 228, 255, 228, 164, 94, 43, 16, 5, 1,
  0, 0, 0, 0, 0
};
981
void Encoder::AnalyseHisto() {
  // A bit of theory and background: for each sub-band i in [0..63], we pick a
  // quantization scale New_Qi close to the initial one Qi. We evaluate a cost
  // function associated with F({New_Qi}) = distortion + lambda . rate,
  // where rate and distortion depend on the quantizers set in a complex non-
  // analytic way. Just, for well-behaved regular histograms, we expect the
  // rate to scale as -log(Q), and the distortion as Q^2.
  // We want the cost function to be stationary around the initial {Qi} set,
  // in order to achieve the best transfer between distortion and rate when we
  // displace a little the Qi values. Mainly we want to use bits as efficiently
  // as possible, where every bit we use has maximal impact in lowering
  // distortion (and vice versa: if we spend an extra bit of coding, we want to
  // have the best bang for this buck. The optimization works up-hill too).
  //
  // Hence, lambda is picked to minimize F around {Qi}, as:
  //     lambda = -d(distortion) / d(rate)
  // where the derivatives are evaluated using a double least-square fit on
  // both the clouds of {delta, distortion} and {delta, size} points.
  //
  // Note1: The least-square fitted slope of a {x,y} cloud is expressed as:
  //    slope = (<xy> - <x><y>) / (<xx> - <x><x>) = Cov(x,y) / Cov(x,x)
  // where <.> is our gaussian-averaging operator.
  // But since we are eventually computing a quotient of such slopes, we can
  // factor out the common (<xx> - <x><x>) denominator (which is strictly
  // positive).
  // Note2: we use a Gaussian-weighted average around the center value Qi
  // instead of averaging over the whole [QDELTA_MIN, QDELTA_MAX] range.
  // This rules out fringe samples on noisy cases (like: when the source is
  // already JPEG-compressed!).
  // Note3: We fall back to some sane value HLAMBDA in case of ill-condition.
  //
  // We use the correlation coefficient
  //      r = Cov(x,y) / sqrt(Cov(x,x) * Cov(y,y))
  // to detect bad cases with poorly extrapolated distortion. In such
  // occurrence, we skip the channel. This is particularly important for
  // already-compressed JPEG sources that give treacherous comb-like
  // histograms.
  //
  // Once this particular lambda has been picked, we loop over each channel
  // and optimize them separately, locally picking the best New_Qi for each.
  // The choice of lambda ensures a good balance between size and distortion,
  // and prevents being too aggressive on file-size reduction for instance.
  //
  const double r_limit = kCorrelationThreshold;
  // Loop over chroma first (when present), then luma.
  for (int c = (nb_comps_ > 1 ? 1 : 0); c >= 0; --c) {
    const int idx = quant_idx_[c];
    const Histo* const histo = &histos_[idx];
    // For chrominance, it can be visually damaging to be too
    // aggressive on the filesize. So with the default settings we
    // restrict the algorithm to mainly try to *increase* the bitrate
    // (and quality) by using a smaller qdelta_max_chroma_.
    // delta_max is only used during the second phase, but not during
    // the first phase of deriving an optimal lambda.
    assert(QDELTA_MAX >= qdelta_max_luma_);
    assert(QDELTA_MAX >= qdelta_max_chroma_);
    const int delta_max =
        ((idx == 0) ? qdelta_max_luma_ : qdelta_max_chroma_) - QDELTA_MIN;
    assert(delta_max < QSIZE);
    float sizes[64][QSIZE];
    float distortions[64][QSIZE];
    double num = 0.;  // accumulate d(distortion) around delta_q = 0
    double den = 0.;  // accumulate d(size) around delta_q = 0
    uint64_t omit_channels = kOmittedChannels;
    for (int pos = 0; pos < 64; ++pos) {
      if (omit_channels & (1ULL << pos)) {
        continue;
      }
      const int dq0 = quants_[idx].quant_[pos];
      const int min_dq0 = quants_[idx].min_quant_[pos];
      // We should be using the exact bias:
      //    const int bias = quants_[idx].bias_[pos] << (FP_BITS - AC_BITS);
      // but this value is too precise considering the other approximations
      // we're using (namely: HSHIFT). So we better use a mid value of 0.5
      // for the bias. This has the advantage of making it possible to
      // use pre-calculated look-up tables for every quantity in the loop.
      // This is still a TODO(skal) below, though. Not sure the gain is big.
      const int bias = 1 << FP_BITS >> 1;
      const int* const h = histo->counts_[pos];
      // Find the total population and the last occupied bin.
      int total = 0;
      int last = 0;
      for (int i = 0; i < MAX_HISTO_DCT_COEFF; ++i) {
        total += h[i];
        if (h[i]) last = i + 1;
      }
      // Under-populated histogram: skip the channel (see kDensityThreshold).
      if (total < kDensityThreshold * last) {
        omit_channels |= 1ULL << pos;
        continue;
      }
      // accumulators for averaged values.
      double sw = 0., sx = 0.;
      double sxx = 0., syy1 = 0.;
      double sy1 = 0., sxy1 = 0.;    // accumulators for distortion cloud
      double sy2 = 0., sxy2 = 0.;    // accumulators for size cloud
      for (int delta = 0; delta < QSIZE; ++delta) {
        double bsum = 0., dsum = 0.;
        const int dq = dq0 + (delta + QDELTA_MIN);
        if (dq >= min_dq0 && dq <= 255) {
          // TODO(skal): pre-compute idq and use it in FinalizeQuantMatrix too
          const int idq = ((1 << FP_BITS) + dq - 1) / dq;
          for (int i = 0; i < last; ++i) {
            if (h[i]) {
              // v    = current bin's centroid in the histogram
              // qv   = quantized value for the bin's representative 'v'
              // dqv  = dequantized qv, to be compared against v (=> 'error')
              // bits = approximate bit-cost of quantized representative
              // h[i] = this bin's weight
              const int v = (i << HSHIFT) + HHALF;
              const int qv = (v * idq + bias) >> FP_BITS;
              // TODO(skal): for a given 'last' value, we know the upper limit
              // on dq that will make *all* quantized 'qv' values be zero.
              // => We can restrict the loop on 'dq' using 'last'.
              if (qv) {
                const int bits = CalcLog2(qv);
                const int dqv = qv * dq;
                const int error = (v - dqv) * (v - dqv);
                bsum += h[i] * bits;
                dsum += h[i] * error;
              } else {
                // quantized to zero: the whole centroid energy is lost
                dsum += h[i] * v * v;
              }
            }
          }  // end of 'i' loop
          distortions[pos][delta] = static_cast<float>(dsum);
          sizes[pos][delta] = static_cast<float>(bsum);
          const double w = kHistoWeight[delta];   // Gaussian weight
          if (w > 0.) {
            const double x = static_cast<double>(delta + QDELTA_MIN);
            sw += w;
            sx += w * x;
            sxx += w * x * x;
            sy1 += w * dsum;
            syy1 += w * dsum * dsum;
            sy2 += w * bsum;
            sxy1 += w * dsum * x;
            sxy2 += w * bsum * x;
          }
        } else {   // the new quantizer is out-of-range.
          distortions[pos][delta] = FLT_MAX;
          sizes[pos][delta] = 0;
        }
      }
      // filter channels according to correlation factor.
      const double cov_xy1 = sw * sxy1 - sx * sy1;
      if (cov_xy1 * cov_xy1 < r_limit *
          (sw * sxx - sx * sx) * (sw * syy1 - sy1 * sy1)) {
        omit_channels |= 1ULL << pos;
        continue;
      }
      // accumulate numerator and denominator for the derivative calculation
      num += cov_xy1;
      den += sw * sxy2 - sx * sy2;
    }

    // we evaluate lambda =~ -d(distortion)/d(size) at dq=0
    double lambda = HLAMBDA;
    // When increasing Q, size should significantly decrease and distortion
    // increase. If they don't, we are ill-conditioned and should fall back
    // to a safe value HLAMBDA.
    if (num > 1000. && den < -10.) {
      // This is our approximation of -d(Distortion) / d(Rate)
      // We limit it to 1. below, to avoid degenerated cases
      lambda = -num / den;
      if (lambda < 1.) {
        lambda = 1.;
      }
    }
    // now, optimize each channel using the optimal lambda selection
    for (int pos = 0; pos < 64; ++pos) {
      if (omit_channels & (1ULL << pos)) {
        continue;
      }
      float best_score = FLT_MAX;
      int best_dq = 0;
      for (int delta = 0; delta <= delta_max; ++delta) {
        if (distortions[pos][delta] < FLT_MAX) {
          const float score = distortions[pos][delta]
                            + lambda * sizes[pos][delta];
          if (score < best_score) {
            best_score = score;
            best_dq = delta + QDELTA_MIN;
          }
        }
      }
      quants_[idx].quant_[pos] += best_dq;
      assert(quants_[idx].quant_[pos] >= 1);
    }
    FinalizeQuantMatrix(&quants_[idx], q_bias_);
    SetCostCodes(idx);
  }
}
1172
void Encoder::CollectHistograms() {
  // Pass over the whole picture: computes the fDCT of each macroblock and
  // accumulates the coefficient histograms per quantizer channel.
  ResetHisto();
  int16_t* in = in_blocks_;
  const int mb_x_max = W_ / block_w_;
  const int mb_y_max = H_ / block_h_;
  for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
    const bool yclip = (mb_y == mb_y_max);   // bottom row may be partial
    for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
      if (!use_extra_memory_) {
        in = in_blocks_;   // no caching: reuse the same scratch block
      }
      GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
      fDCT_(in, mcu_blocks_);
      for (int c = 0; c < nb_comps_; ++c) {
        const int num_blocks = nb_blocks_[c];
        store_histo_(in, &histos_[quant_idx_[c]], num_blocks);
        in += 64 * num_blocks;
      }
    }
  }
  // if the coefficients were stored, later passes can reuse them directly
  have_coeffs_ = use_extra_memory_;
}
1195
1196 ////////////////////////////////////////////////////////////////////////////////
1197 // Perform YUV conversion and fDCT, and store the unquantized coeffs
1198
CollectCoeffs()1199 void Encoder::CollectCoeffs() {
1200 assert(use_extra_memory_);
1201 int16_t* in = in_blocks_;
1202 const int mb_x_max = W_ / block_w_;
1203 const int mb_y_max = H_ / block_h_;
1204 for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
1205 const bool yclip = (mb_y == mb_y_max);
1206 for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
1207 GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
1208 fDCT_(in, mcu_blocks_);
1209 in += 64 * mcu_blocks_;
1210 }
1211 }
1212 have_coeffs_ = true;
1213 }
1214
1215 ////////////////////////////////////////////////////////////////////////////////
1216 // 1-pass Scan
1217
void Encoder::SinglePassScan() {
  // Quantizes and entropy-codes the whole picture in a single pass,
  // writing the scan data directly to the bitstream.
  ResetDCs();

  RunLevel base_run_levels[64];
  int16_t* in = in_blocks_;
  const int mb_x_max = W_ / block_w_;
  const int mb_y_max = H_ / block_h_;
  // use the trellis-based quantizer when requested
  const QuantizeBlockFunc quantize_block = use_trellis_ ? TrellisQuantizeBlock
                                                        : quantize_block_;
  for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
    const bool yclip = (mb_y == mb_y_max);   // bottom row may be partial
    for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
      if (!CheckBuffers()) return;   // bail out on buffer/allocation error
      if (!have_coeffs_) {   // coefficients not cached: recompute the fDCT
        in = in_blocks_;
        GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
        fDCT_(in, mcu_blocks_);
      }
      for (int c = 0; c < nb_comps_; ++c) {
        DCTCoeffs base_coeffs;
        for (int i = 0; i < nb_blocks_[c]; ++i) {
          const int dc = quantize_block(in, c, &quants_[quant_idx_[c]],
                                        &base_coeffs, base_run_levels);
          base_coeffs.dc_code_ = GenerateDCDiffCode(dc, &DCs_[c]);
          CodeBlock(&base_coeffs, base_run_levels);
          in += 64;
        }
      }
    }
  }
}
1249
FinalPassScan(size_t nb_mbs,const DCTCoeffs * coeffs)1250 void Encoder::FinalPassScan(size_t nb_mbs, const DCTCoeffs* coeffs) {
1251 DesallocateBlocks(); // we can free up some coeffs memory at this point
1252 if (!CheckBuffers()) return; // call needed to finalize all_run_levels_
1253 assert(reuse_run_levels_);
1254 const RunLevel* run_levels = all_run_levels_;
1255 for (size_t n = 0; n < nb_mbs; ++n) {
1256 if (!CheckBuffers()) return;
1257 CodeBlock(&coeffs[n], run_levels);
1258 run_levels += coeffs[n].nb_coeffs_;
1259 }
1260 }
1261
1262 ////////////////////////////////////////////////////////////////////////////////
1263 // Huffman tables optimization
1264
void Encoder::ResetEntropyStats() {
  // Clears the AC/DC symbol-frequency counters before a statistics pass.
  memset(freq_ac_, 0, sizeof(freq_ac_));
  memset(freq_dc_, 0, sizeof(freq_dc_));
}
1269
AddEntropyStats(const DCTCoeffs * const coeffs,const RunLevel * const run_levels)1270 void Encoder::AddEntropyStats(const DCTCoeffs* const coeffs,
1271 const RunLevel* const run_levels) {
1272 // freq_ac_[] and freq_dc_[] cannot overflow 32bits, since the maximum
1273 // resolution allowed is 65535 * 65535. The sum of all frequencies cannot
1274 // be greater than 32bits, either.
1275 const int idx = coeffs->idx_;
1276 const int q_idx = quant_idx_[idx];
1277 for (int i = 0; i < coeffs->nb_coeffs_; ++i) {
1278 const int run = run_levels[i].run_;
1279 const int tmp = (run >> 4);
1280 if (tmp) freq_ac_[q_idx][0xf0] += tmp; // count escapes (all at once)
1281 const int suffix = run_levels[i].level_;
1282 const int sym = ((run & 0x0f) << 4) | (suffix & 0x0f);
1283 ++freq_ac_[q_idx][sym];
1284 }
1285 if (coeffs->last_ < 63) { // EOB
1286 ++freq_ac_[q_idx][0x00];
1287 }
1288 ++freq_dc_[q_idx][coeffs->dc_code_ & 0x0f];
1289 }
1290
// qsort() comparator: sorts packed (frequency << 9 | symbol) keys in
// decreasing order. Keys are unique by construction, so no tie-break.
static int cmp(const void *pa, const void *pb) {
  const uint64_t lhs = *reinterpret_cast<const uint64_t*>(pa);
  const uint64_t rhs = *reinterpret_cast<const uint64_t*>(pb);
  assert(lhs != rhs);   // tie-breaks can't happen
  if (lhs < rhs) return 1;
  return -1;
}
1297
// Fills 't' with an optimal, JPEG-legal Huffman table for the 'size'
// symbol frequencies in freq[] (freq[] itself is left untouched).
static void BuildOptimalTable(HuffmanTable* const t,
                              const uint32_t* const freq, int size) {
  enum { MAX_BITS = 32, MAX_CODE_SIZE = 16 };
  assert(size <= 256);
  assert(t != nullptr);

  // The celebrated merging algorithm from Huffman, with some restrictions:
  // * codes with all '1' are forbidden, to avoid trailing marker emulation
  // * code should be less than 16bits. So we're re-allocating them to shorter
  //   code, even if it means being suboptimal for extremely rare symbols that
  //   would eat a lot of bits.
  // This function will not touch the content of freq[].
  int codesizes[256 + 1];
  // chain[i] will hold the index of the next element in the subtree below
  // element 'i', or -1 if there's no sub-tree.
  // We use and maintain this list in order to efficiently increase the
  // codesizes by one when merging two sub-trees into one.
  // To ease the merging (by avoiding 1 loop) we store the address of the last
  // element in the chain for each symbol. This makes the process O(1).
  // It's probably better to keep the arrays separated instead of making
  // a struct, since we touch chain_end[] only once per merging, whereas
  // chain[] and codesizes[] are modified O(k) time per merging.
  int chain[256 + 1];
  int* chain_end[256 + 1];
  // sorted_freq[] remains sorted by decreasing frequencies along the process.
  uint64_t sorted_freq[256 + 1];

  // Counts and puts the symbols effectively used at the beginning of the table.
  int nb_syms = 0;
  for (int i = 0; i < size; ++i) {
    const uint64_t v = freq[i];
    if (v > 0) {
      // we pack the sorted key (32bits) and index (9bits) into a single
      // uint64_t, so we don't have to resort to structs (and we avoid
      // tie-breaks, too)
      sorted_freq[nb_syms++] = (v << 9) | i;
    }
    codesizes[i] = 0;
    chain[i] = -1;
    chain_end[i] = &chain[i];
  }
  t->nb_syms_ = nb_syms;   // Record how many final symbols we'll have.

  // initial sort
  // TODO(skal): replace by counting-sort?? (merged with previous loop?)
  qsort(sorted_freq, nb_syms, sizeof(sorted_freq[0]), cmp);

  // fake last symbol, with lowest frequency: will be assigned to the forbidden
  // code '1111...1', but will eventually be discarded.
  sorted_freq[nb_syms++] = (1ULL << 9) | size;
  codesizes[size] = 0;
  chain[size] = -1;
  chain_end[size] = &chain[size];

  // Merging phase
  // Recursively merge the two symbols with lowest frequency. The resulting
  // super-symbol will be represented by a longer (by 1bit) code, since
  // it's the least frequent one.
  int nb = nb_syms;
  while (nb-- > 1) {
    // First, link the two sub-trees.
    const uint64_t s1 = sorted_freq[nb - 1];    // first symbol
    const uint64_t s2 = sorted_freq[nb];        // second symbol, appended
    // The 0x1ff masking is for taking only the symbol, discarding the
    // frequency that we stored in the upper bits for sorting.
    int i = s1 & 0x1ff;
    const int j = s2 & 0x1ff;
    assert(i <= size && j <= size);
    *chain_end[i] = j;
    chain_end[i] = chain_end[j];

    // Then, following the chain, increase the whole sub-tree's weight by 1bit.
    do {
      ++codesizes[i];
      i = chain[i];
    } while (i >= 0);

    // Create new symbol, with merged frequencies. Will take s1's spot.
    // We must use 64bit here to prevent overflow in the sum. Both s1 and
    // s2 are originally 32 + 9 bits wide.
    const uint64_t new_symbol = s1 + (s2 & ~0x1ff);
    // Perform insertion sort to find the new spot of the merged symbol.
    int k = nb - 1;
    while (k > 0) {
      if (sorted_freq[k - 1] < new_symbol) {
        sorted_freq[k] = sorted_freq[k - 1];
        --k;
      } else {
        break;
      }
    }
    sorted_freq[k] = new_symbol;
  }

  // Count bit distribution.
  uint8_t bits[MAX_BITS];
  memset(bits, 0, sizeof(bits));
  int max_bit_size = 0;
  for (int i = 0; i <= size; ++i) {
    int s = codesizes[i];
    assert(s <= codesizes[size]);  // symbol #size is the biggest one.
    if (s > 0) {
      // This is slightly penalizing but only for ultra-rare symbol
      if (s > MAX_BITS) {
        s = MAX_BITS;
        codesizes[i] = MAX_BITS;   // clamp code-size
      }
      ++bits[s - 1];
      if (s > max_bit_size) {
        max_bit_size = s;
      }
    }
  }

  // We sort symbols by slices of increasing bitsizes, using counting sort.
  // This will generate a partition of symbols in the final syms_[] array.
  int start[MAX_BITS];    // start[i] is the first code with length i+1
  int position = 0;
  for (int i = 0; i < max_bit_size; ++i) {
    start[i] = position;
    position += bits[i];
  }
  assert(position == nb_syms);

  // Now, we can ventilate the symbols directly to their final slice in the
  // partitioning, according to the their bit-length.
  // Note: we omit the last symbol, which is fake.
  uint8_t* const syms = const_cast<uint8_t*>(t->syms_);
  // Note that we loop until symbol = size-1, hence omitting the last fake
  // symbol.
  for (int symbol = 0; symbol < size; ++symbol) {
    const int s = codesizes[symbol];
    if (s > 0) {
      assert(s <= MAX_BITS);
      syms[start[s - 1]++] = symbol;
    }
  }
  assert(start[max_bit_size - 1] == nb_syms - 1);

  // Fix codes with length greater than 16 bits. We move too long
  // codes up, and one short down, making the tree a little sub-optimal.
  for (int l = max_bit_size - 1; l >= MAX_CODE_SIZE; --l) {
    while (bits[l] > 0) {
      int k = l - 2;
      while (bits[k] == 0) {   // Search for a level with a leaf to split.
        --k;
      }
      /* Move up 2 symbols from bottom-most level l, and sink down one from
         level k, like this:
                Before:                After:
         level:
          k bits->    c                /  \
                     / \              c    b
                    ..  \                 / \
          l bits->      a b              ..  a
         Note that by the very construction of the optimal tree, the least
         probable symbols always come by pair with same bit-length.
         So there's always a pair of 'a' and 'b' to find.
      */
      bits[l    ] -= 2;    // remove 'a' and 'b'
      bits[l - 1] += 1;    // put 'a' one level up.
      bits[k    ] -= 1;    // remove 'c'
      bits[k + 1] += 2;    // put 'c' and 'b' one level down.
    }
  }

  // remove last pseudo-symbol
  max_bit_size = MAX_CODE_SIZE;
  while (bits[--max_bit_size] == 0) {
    assert(max_bit_size > 0);
  }
  --bits[max_bit_size];

  // update table with final book
  for (int i = 0; i < MAX_CODE_SIZE; ++i) {
    t->bits_[i] = bits[i];
  }
}
1475
CompileEntropyStats()1476 void Encoder::CompileEntropyStats() {
1477 // plug and build new tables
1478 for (int q_idx = 0; q_idx < (nb_comps_ == 1 ? 1 : 2); ++q_idx) {
1479 // DC tables
1480 Huffman_tables_[q_idx] = &opt_tables_dc_[q_idx];
1481 opt_tables_dc_[q_idx].syms_ = opt_syms_dc_[q_idx];
1482 BuildOptimalTable(&opt_tables_dc_[q_idx], freq_dc_[q_idx], 12);
1483 // AC tables
1484 Huffman_tables_[2 + q_idx] = &opt_tables_ac_[q_idx];
1485 opt_tables_ac_[q_idx].syms_ = opt_syms_ac_[q_idx];
1486 BuildOptimalTable(&opt_tables_ac_[q_idx], freq_ac_[q_idx], 256);
1487 }
1488 }
1489
StoreOptimalHuffmanTables(size_t nb_mbs,const DCTCoeffs * coeffs)1490 void Encoder::StoreOptimalHuffmanTables(size_t nb_mbs,
1491 const DCTCoeffs* coeffs) {
1492 // optimize Huffman tables
1493 ResetEntropyStats();
1494 const RunLevel* run_levels = all_run_levels_;
1495 for (size_t n = 0; n < nb_mbs; ++n) {
1496 AddEntropyStats(&coeffs[n], run_levels);
1497 run_levels += coeffs[n].nb_coeffs_;
1498 }
1499 CompileEntropyStats();
1500 }
1501
1502 ////////////////////////////////////////////////////////////////////////////////
1503
void Encoder::SinglePassScanOptimized() {
  // Like SinglePassScan(), but first collects entropy statistics over the
  // whole picture to build optimal Huffman tables, then emits the scan:
  // either by re-coding everything, or by replaying saved run/levels.
  const size_t nb_mbs = mb_w_ * mb_h_ * mcu_blocks_;
  DCTCoeffs* const base_coeffs =
      Alloc<DCTCoeffs>(reuse_run_levels_ ? nb_mbs : 1);
  if (base_coeffs == nullptr) return;
  DCTCoeffs* coeffs = base_coeffs;
  RunLevel base_run_levels[64];
  const QuantizeBlockFunc quantize_block = use_trellis_ ? TrellisQuantizeBlock
                                                        : quantize_block_;

  // We use the default Huffman tables as basis for bit-rate evaluation
  if (use_trellis_) InitCodes(true);

  ResetEntropyStats();
  ResetDCs();
  nb_run_levels_ = 0;
  int16_t* in = in_blocks_;
  const int mb_x_max = W_ / block_w_;
  const int mb_y_max = H_ / block_h_;
  for (int mb_y = 0; mb_y < mb_h_; ++mb_y) {
    const bool yclip = (mb_y == mb_y_max);   // bottom row may be partial
    for (int mb_x = 0; mb_x < mb_w_; ++mb_x) {
      if (!have_coeffs_) {   // coefficients not cached: recompute the fDCT
        in = in_blocks_;
        GetSamples(mb_x, mb_y, yclip | (mb_x == mb_x_max), in);
        fDCT_(in, mcu_blocks_);
      }
      if (!CheckBuffers()) goto End;
      for (int c = 0; c < nb_comps_; ++c) {
        for (int i = 0; i < nb_blocks_[c]; ++i) {
          // append to all_run_levels_ when they will be reused in the
          // final pass; otherwise use the scratch buffer
          RunLevel* const run_levels =
              reuse_run_levels_ ? all_run_levels_ + nb_run_levels_
                                : base_run_levels;
          const int dc = quantize_block(in, c, &quants_[quant_idx_[c]],
                                        coeffs, run_levels);
          coeffs->dc_code_ = GenerateDCDiffCode(dc, &DCs_[c]);
          AddEntropyStats(coeffs, run_levels);
          if (reuse_run_levels_) {
            nb_run_levels_ += coeffs->nb_coeffs_;
            ++coeffs;
            assert(coeffs <= &base_coeffs[nb_mbs]);
          }
          in += 64;
          assert(nb_run_levels_ <= max_run_levels_);
        }
      }
    }
  }

  // derive the optimal tables and emit the headers before the scan
  CompileEntropyStats();
  WriteDHT();
  WriteSOS();

  if (!reuse_run_levels_) {
    SinglePassScan();   // redo everything, but with optimal tables now.
  } else {
    // Re-use the saved run/levels for fast 2nd-pass.
    FinalPassScan(nb_mbs, base_coeffs);
  }
 End:
  Free(base_coeffs);
}
1566
1567 ////////////////////////////////////////////////////////////////////////////////
1568 // main call
1569
bool Encoder::Encode() {
  // Main entry point: writes the complete JPEG bitstream to the sink.
  // Returns false on invalid input or allocation/bitstream error.
  if (!ok_) return false;

  // finalize the quantization matrices and associated bit-cost tables
  FinalizeQuantMatrix(&quants_[0], q_bias_);
  FinalizeQuantMatrix(&quants_[1], q_bias_);
  SetCostCodes(0);
  SetCostCodes(1);

  // default tables
  for (int i = 0; i < 4; ++i) Huffman_tables_[i] = &kHuffmanTables[i];

  // colorspace init
  InitComponents();
  assert(nb_comps_ <= MAX_COMP);
  assert(mcu_blocks_ <= 6);
  // validate some input parameters
  if (W_ <= 0 || H_ <= 0 || rgb_ == nullptr) return false;

  // number of macroblocks, rounding up partial ones
  mb_w_ = (W_ + (block_w_ - 1)) / block_w_;
  mb_h_ = (H_ + (block_h_ - 1)) / block_h_;
  const size_t nb_blocks = use_extra_memory_ ? mb_w_ * mb_h_ : 1;
  if (!AllocateBlocks(nb_blocks * mcu_blocks_)) return false;

  WriteAPP0();

  // custom markers written 'as is'
  if (!WriteAPPMarkers(app_markers_)) return false;

  // metadata
  if (!WriteEXIF(exif_) || !WriteICCP(iccp_) || !WriteXMP(xmp_)) return false;

  if (passes_ > 1) {
    LoopScan();
  } else {
    if (use_adaptive_quant_) {
      // Histogram analysis + derive optimal quant matrices
      CollectHistograms();
      AnalyseHisto();
    }

    WriteDQT();
    WriteSOF();

    if (optimize_size_) {
      SinglePassScanOptimized();
    } else {
      WriteDHT();
      WriteSOS();
      SinglePassScan();
    }
  }
  WriteEOI();
  ok_ = ok_ && bw_.Finalize();

  DesallocateBlocks();
  return ok_;
}
1627
1628 ////////////////////////////////////////////////////////////////////////////////
1629 // Edge replication
1630
1631 namespace {
1632
int GetAverage(const int16_t* const out) {
  // Rounded mean of the 64 coefficients of one 8x8 block.
  int acc = 32;   // rounding term folded into the accumulator
  for (const int16_t* p = out; p < out + 64; ++p) acc += *p;
  return acc >> 6;
}
1638
void SetAverage(int DC, int16_t* const out) {
  // Flattens the whole 8x8 block to the constant value 'DC'.
  const int16_t value = static_cast<int16_t>(DC);
  for (int n = 0; n < 64; ++n) {
    out[n] = value;
  }
}
1642
1643 } // anonymous namespace
1644
AverageExtraLuma(int sub_w,int sub_h,int16_t * out)1645 void Encoder::AverageExtraLuma(int sub_w, int sub_h, int16_t* out) {
1646 // out[] points to four 8x8 blocks. When one of this block is totally
1647 // outside of the frame, we set it flat to the average value of the previous
1648 // block ("DC"), in order to help compressibility.
1649 int DC = GetAverage(out);
1650 if (sub_w <= 8) { // set block #1 to block #0's average value
1651 SetAverage(DC, out + 1 * 64);
1652 }
1653 if (sub_h <= 8) { // Need to flatten block #2 and #3
1654 if (sub_w > 8) { // block #1 was not flatten, so get its real DC
1655 DC = GetAverage(out + 1 * 64);
1656 }
1657 SetAverage(DC, out + 2 * 64);
1658 SetAverage(DC, out + 3 * 64);
1659 } else if (sub_w <= 8) { // set block #3 to the block #2's average value
1660 DC = GetAverage(out + 2 * 64);
1661 SetAverage(DC, out + 3 * 64);
1662 }
1663 }
1664
GetReplicatedSamples(const uint8_t * rgb,int rgb_step,int sub_w,int sub_h,int w,int h)1665 const uint8_t* Encoder::GetReplicatedSamples(const uint8_t* rgb,
1666 int rgb_step,
1667 int sub_w, int sub_h,
1668 int w, int h) {
1669 assert(sub_w > 0 && sub_h > 0);
1670 if (sub_w > w) {
1671 sub_w = w;
1672 }
1673 if (sub_h > h) {
1674 sub_h = h;
1675 }
1676 uint8_t* dst = replicated_buffer_;
1677 for (int y = 0; y < sub_h; ++y) {
1678 memcpy(dst, rgb, 3 * sub_w);
1679 const uint8_t* const src0 = &dst[3 * (sub_w - 1)];
1680 for (int x = 3 * sub_w; x < 3 * w; x += 3) {
1681 memcpy(dst + x, src0, 3);
1682 }
1683 dst += 3 * w;
1684 rgb += rgb_step;
1685 }
1686 const uint8_t* dst0 = dst - 3 * w;
1687 for (int y = sub_h; y < h; ++y) {
1688 memcpy(dst, dst0, 3 * w);
1689 dst += 3 * w;
1690 }
1691 return replicated_buffer_;
1692 }
1693
1694 // TODO(skal): merge with above function? Probably slower...
GetReplicatedYUVSamples(const uint8_t * in,int step,int sub_w,int sub_h,int w,int h)1695 const uint8_t* Encoder::GetReplicatedYUVSamples(const uint8_t* in,
1696 int step,
1697 int sub_w, int sub_h,
1698 int w, int h) {
1699 assert(sub_w > 0 && sub_h > 0);
1700 if (sub_w > w) {
1701 sub_w = w;
1702 }
1703 if (sub_h > h) {
1704 sub_h = h;
1705 }
1706 uint8_t* out = replicated_buffer_;
1707 for (int y = 0; y < sub_h; ++y) {
1708 int x;
1709 for (x = 0; x < sub_w; ++x)
1710 out[x] = in[x];
1711 for (; x < w; ++x) {
1712 out[x] = out[sub_w - 1];
1713 }
1714 out += w;
1715 in += step;
1716 }
1717 const uint8_t* const out0 = out - w;
1718 for (int y = sub_h; y < h; ++y) {
1719 memcpy(out, out0, w);
1720 out += w;
1721 }
1722 return replicated_buffer_;
1723 }
1724
1725 ////////////////////////////////////////////////////////////////////////////////
1726 // sub-class for YUV 4:2:0 version
1727
1728 class Encoder420 : public Encoder {
1729 public:
Encoder420(int W,int H,int step,const uint8_t * const rgb,ByteSink * const sink)1730 Encoder420(int W, int H, int step, const uint8_t* const rgb,
1731 ByteSink* const sink)
1732 : Encoder(W, H, step, rgb, sink) {}
~Encoder420()1733 virtual ~Encoder420() {}
InitComponents()1734 virtual void InitComponents() {
1735 nb_comps_ = 3;
1736
1737 quant_idx_[0] = 0;
1738 quant_idx_[1] = 1;
1739 quant_idx_[2] = 1;
1740
1741 nb_blocks_[0] = 4;
1742 nb_blocks_[1] = 1;
1743 nb_blocks_[2] = 1;
1744 mcu_blocks_ = 6;
1745
1746 block_w_ = 16;
1747 block_h_ = 16;
1748 block_dims_[0] = 0x22;
1749 block_dims_[1] = 0x11;
1750 block_dims_[2] = 0x11;
1751 }
GetSamples(int mb_x,int mb_y,bool clipped,int16_t * out_blocks)1752 virtual void GetSamples(int mb_x, int mb_y, bool clipped,
1753 int16_t* out_blocks) {
1754 const uint8_t* data = rgb_ + (3 * mb_x + mb_y * step_) * 16;
1755 int step = step_;
1756 if (clipped) {
1757 data = GetReplicatedSamples(data, step,
1758 W_ - mb_x * 16, H_ - mb_y * 16, 16, 16);
1759 step = 3 * 16;
1760 }
1761 get_yuv_block_(data, step, out_blocks);
1762 if (clipped) {
1763 AverageExtraLuma(W_ - mb_x * 16, H_ - mb_y * 16, out_blocks);
1764 }
1765 }
1766 };
1767
1768 ////////////////////////////////////////////////////////////////////////////////
1769 // sub-class for YUV 4:4:4 version
1770
1771 class Encoder444 : public Encoder {
1772 public:
Encoder444(int W,int H,int step,const uint8_t * const rgb,ByteSink * const sink)1773 Encoder444(int W, int H, int step, const uint8_t* const rgb,
1774 ByteSink* const sink)
1775 : Encoder(W, H, step, rgb, sink) {
1776 SetYUVFormat(true);
1777 }
~Encoder444()1778 virtual ~Encoder444() {}
InitComponents()1779 virtual void InitComponents() {
1780 nb_comps_ = 3;
1781
1782 quant_idx_[0] = 0;
1783 quant_idx_[1] = 1;
1784 quant_idx_[2] = 1;
1785
1786 nb_blocks_[0] = 1;
1787 nb_blocks_[1] = 1;
1788 nb_blocks_[2] = 1;
1789 mcu_blocks_ = 3;
1790
1791 block_w_ = 8;
1792 block_h_ = 8;
1793 block_dims_[0] = 0x11;
1794 block_dims_[1] = 0x11;
1795 block_dims_[2] = 0x11;
1796 }
GetSamples(int mb_x,int mb_y,bool clipped,int16_t * out)1797 virtual void GetSamples(int mb_x, int mb_y, bool clipped, int16_t* out) {
1798 const uint8_t* data = rgb_ + (3 * mb_x + mb_y * step_) * 8;
1799 int step = step_;
1800 if (clipped) {
1801 data = GetReplicatedSamples(data, step,
1802 W_ - mb_x * 8, H_ - mb_y * 8, 8, 8);
1803 step = 3 * 8;
1804 }
1805 get_yuv_block_(data, step, out);
1806 }
1807 };
1808
1809 ////////////////////////////////////////////////////////////////////////////////
1810 // sub-class for the sharp YUV 4:2:0 version
1811
// 4:2:0 encoder variant that pre-converts the whole picture to YUV planes
// using the "sharp" RGB->YUV conversion, then samples macroblocks from
// those planes instead of converting on the fly.
class EncoderSharp420 : public Encoder420 {
 public:
  EncoderSharp420(int W, int H, int step, const uint8_t* const rgb,
                  ByteSink* const sink)
      : Encoder420(W, H, step, rgb, sink), yuv_memory_(nullptr) {
    // Chroma planes are half-size in both directions (rounded up).
    const int uv_w = (W + 1) >> 1;
    const int uv_h = (H + 1) >> 1;
    // One contiguous allocation: Y plane followed by U then V planes.
    yuv_memory_ = Alloc<uint8_t>(W * H + 2 * uv_w * uv_h);
    // NOTE(review): on allocation failure the plane pointers are left
    // uninitialized — presumably Ok() reports failure before they are
    // ever used; confirm against the Encoder base class.
    if (yuv_memory_ == nullptr) return;
    y_plane_ = yuv_memory_;
    y_step_ = W;
    u_plane_ = yuv_memory_ + W * H;
    v_plane_ = u_plane_ + uv_w * uv_h;
    uv_step_ = uv_w;
    // Populate all three planes from the RGB input up front.
    ApplySharpYUVConversion(rgb, W, H, step, y_plane_, u_plane_, v_plane_);
  }
  virtual ~EncoderSharp420() { Free(yuv_memory_); }
  virtual void GetSamples(int mb_x, int mb_y, bool clipped, int16_t* out);

 protected:
  // Extracts the four 8x8 luma blocks of macroblock (mb_x, mb_y) from
  // y_plane_ into out[0..255], applying the -128 JPEG level shift.
  // When 'clipped', the source is first padded by edge replication and
  // fully off-frame blocks are flattened afterwards.
  void GetLumaSamples(int mb_x, int mb_y, bool clipped, int16_t* out) {
    int step = y_step_;
    const uint8_t* Y1 = y_plane_ + (mb_x + mb_y * step) * 16;
    if (clipped) {
      Y1 = GetReplicatedYUVSamples(Y1, step,
                                   W_ - mb_x * 16, H_ - mb_y * 16, 16, 16);
      step = 16;
    }
    // Y1 walks the top half of the macroblock (blocks #0/#1), Y2 the
    // bottom half (blocks #2/#3), 8 rows each.
    const uint8_t* Y2 = Y1 + 8 * step;
    for (int y = 8, n = 0; y > 0; --y) {
      for (int x = 0; x < 8; ++x, ++n) {
        out[n + 0 * 64] = Y1[x] - 128;
        out[n + 1 * 64] = Y1[x + 8] - 128;
        out[n + 2 * 64] = Y2[x] - 128;
        out[n + 3 * 64] = Y2[x + 8] - 128;
      }
      Y1 += step;
      Y2 += step;
    }
    if (clipped) {
      AverageExtraLuma(W_ - mb_x * 16, H_ - mb_y * 16, out);
    }
  }

 private:
  uint8_t* y_plane_;      // full-resolution luma plane
  int y_step_;            // luma row stride (= W)
  uint8_t* u_plane_;      // half-resolution chroma planes
  uint8_t* v_plane_;
  int uv_step_;           // chroma row stride (= (W + 1) >> 1)
  uint8_t* yuv_memory_;   // single owning allocation for all three planes
};
1864
GetSamples(int mb_x,int mb_y,bool clipped,int16_t * out)1865 void EncoderSharp420::GetSamples(int mb_x, int mb_y,
1866 bool clipped, int16_t* out) {
1867 GetLumaSamples(mb_x, mb_y, clipped, out);
1868
1869 // Chroma
1870 const uint8_t* U = u_plane_ + (mb_x + mb_y * uv_step_) * 8;
1871 int step = uv_step_;
1872 if (clipped) {
1873 U = GetReplicatedYUVSamples(U, step,
1874 ((W_ + 1) >> 1) - mb_x * 8,
1875 ((H_ + 1) >> 1) - mb_y * 8, 8, 8);
1876 step = 8;
1877 }
1878 for (int y = 8, n = 0; y > 0; --y, U += step) {
1879 for (int x = 0; x < 8; ++x, ++n) {
1880 out[n + 4 * 64] = U[x] - 128;
1881 }
1882 }
1883 const uint8_t* V = v_plane_ + (mb_x + mb_y * uv_step_) * 8;
1884 step = uv_step_;
1885 if (clipped) {
1886 V = GetReplicatedYUVSamples(V, step,
1887 ((W_ + 1) >> 1) - mb_x * 8,
1888 ((H_ + 1) >> 1) - mb_y * 8, 8, 8);
1889 step = 8;
1890 }
1891 for (int y = 8, n = 0; y > 0; --y, V += step) {
1892 for (int x = 0; x < 8; ++x, ++n) {
1893 out[n + 5 * 64] = V[x] - 128;
1894 }
1895 }
1896 }
1897
1898 ////////////////////////////////////////////////////////////////////////////////
1899 // all-in-one factory to pickup the right encoder instance
1900
EncoderFactory(const uint8_t * rgb,int W,int H,int stride,SjpegYUVMode yuv_mode,ByteSink * const sink)1901 Encoder* EncoderFactory(const uint8_t* rgb,
1902 int W, int H, int stride, SjpegYUVMode yuv_mode,
1903 ByteSink* const sink) {
1904 if (yuv_mode == SJPEG_YUV_AUTO) {
1905 yuv_mode = SjpegRiskiness(rgb, W, H, stride, nullptr);
1906 }
1907
1908 Encoder* enc = nullptr;
1909 if (yuv_mode == SJPEG_YUV_420) {
1910 enc = new (std::nothrow) Encoder420(W, H, stride, rgb, sink);
1911 } else if (yuv_mode == SJPEG_YUV_SHARP) {
1912 enc = new (std::nothrow) EncoderSharp420(W, H, stride, rgb, sink);
1913 } else {
1914 enc = new (std::nothrow) Encoder444(W, H, stride, rgb, sink);
1915 }
1916 if (enc == nullptr || !enc->Ok()) {
1917 delete enc;
1918 enc = nullptr;
1919 }
1920 return enc;
1921 }
1922
1923 } // namespace sjpeg
1924
1925 ////////////////////////////////////////////////////////////////////////////////
1926 // public plain-C functions
1927
SjpegEncode(const uint8_t * rgb,int width,int height,int stride,uint8_t ** out_data,float quality,int method,SjpegYUVMode yuv_mode)1928 size_t SjpegEncode(const uint8_t* rgb, int width, int height, int stride,
1929 uint8_t** out_data, float quality, int method,
1930 SjpegYUVMode yuv_mode) {
1931 if (rgb == nullptr || out_data == nullptr) return 0;
1932 if (width <= 0 || height <= 0 || stride < 3 * width) return 0;
1933 *out_data = nullptr; // safety
1934
1935 MemorySink sink(width * height / 4);
1936 Encoder* const enc = EncoderFactory(rgb, width, height, stride, yuv_mode,
1937 &sink);
1938 enc->SetQuality(quality);
1939 enc->SetCompressionMethod(method);
1940 size_t size = 0;
1941 *out_data = nullptr;
1942 if (enc->Encode()) sink.Release(out_data, &size);
1943 delete enc;
1944 return size;
1945 }
1946
1947 ////////////////////////////////////////////////////////////////////////////////
1948
SjpegCompress(const uint8_t * rgb,int width,int height,float quality,uint8_t ** out_data)1949 size_t SjpegCompress(const uint8_t* rgb, int width, int height, float quality,
1950 uint8_t** out_data) {
1951 return SjpegEncode(rgb, width, height, 3 * width, out_data,
1952 quality, 4, SJPEG_YUV_AUTO);
1953 }
1954
// Releases a buffer previously returned through out_data by SjpegEncode(),
// SjpegCompress() or sjpeg::Encode(). The buffer was allocated with new[],
// hence the matching delete[].
void SjpegFreeBuffer(const uint8_t* buffer) {
  delete[] buffer;
}
1958
1959 ////////////////////////////////////////////////////////////////////////////////
1960
// Returns the library version as the packed SJPEG_VERSION constant.
uint32_t SjpegVersion() {
  return SJPEG_VERSION;
}
1964
1965 ////////////////////////////////////////////////////////////////////////////////
1966 // Parametrized call
1967
EncoderParam()1968 EncoderParam::EncoderParam() : search_hook(nullptr), memory(nullptr) {
1969 Init(kDefaultQuality);
1970 }
1971
// Constructs parameters with all defaults at the given quality factor
// (0..100, JPEG-style). Hooks start out unset.
EncoderParam::EncoderParam(float quality_factor)
    : search_hook(nullptr), memory(nullptr) {
  Init(quality_factor);
}
1976
// Resets every field to its default value and installs the quantization
// matrices corresponding to 'quality_factor'.
void EncoderParam::Init(float quality_factor) {
  Huffman_compress = true;          // optimize Huffman tables by default
  adaptive_quantization = true;
  use_trellis = false;
  yuv_mode = SJPEG_YUV_AUTO;
  quantization_bias = kDefaultBias;
  qdelta_max_luma = kDefaultDeltaMaxLuma;
  qdelta_max_chroma = kDefaultDeltaMaxChroma;
  adaptive_bias = false;
  SetLimitQuantization(false);      // no lower bound on quantization
  min_quant_tolerance_ = 0;
  SetQuality(quality_factor);
  // No target size/psnr search by default: single pass, full q-range.
  target_mode = TARGET_NONE;
  target_value = 0;
  passes = 1;
  tolerance = 1.;
  qmin = 0.;
  qmax = 100.;
}
1996
SetQuality(float quality_factor)1997 void EncoderParam::SetQuality(float quality_factor) {
1998 const float q = GetQFactor(quality_factor);
1999 sjpeg::SetQuantMatrix(kDefaultMatrices[0], q, quant_[0]);
2000 sjpeg::SetQuantMatrix(kDefaultMatrices[1], q, quant_[1]);
2001 }
2002
SetQuantization(const uint8_t m[2][64],float reduction)2003 void EncoderParam::SetQuantization(const uint8_t m[2][64],
2004 float reduction) {
2005 if (reduction <= 1.f) reduction = 1.f;
2006 if (m == nullptr) return;
2007 for (int c = 0; c < 2; ++c) {
2008 for (size_t i = 0; i < 64; ++i) {
2009 const int v = static_cast<int>(m[c][i] * 100. / reduction + .5);
2010 quant_[c][i] = (v > 255) ? 255u : (v < 1) ? 1u : v;
2011 }
2012 }
2013 }
2014
// Enables or disables the lower bound on quantization values. When enabling,
// the current quant_[] matrices are recorded as the minimum, with the given
// tolerance.
void EncoderParam::SetLimitQuantization(bool limit_quantization,
                                        int min_quant_tolerance) {
  use_min_quant_ = limit_quantization;
  if (limit_quantization) SetMinQuantization(quant_, min_quant_tolerance);
}
2020
SetMinQuantization(const uint8_t m[2][64],int min_quant_tolerance)2021 void EncoderParam::SetMinQuantization(const uint8_t m[2][64],
2022 int min_quant_tolerance) {
2023 use_min_quant_ = true;
2024 CopyQuantMatrix(m[0], min_quant_[0]);
2025 CopyQuantMatrix(m[1], min_quant_[1]);
2026 min_quant_tolerance_ = (min_quant_tolerance < 0) ? 0
2027 : (min_quant_tolerance > 100) ? 100
2028 : min_quant_tolerance;
2029 }
2030
// Clears all metadata payloads (ICC profile, EXIF, XMP and extra app
// markers) so nothing extra is embedded in the output stream.
void EncoderParam::ResetMetadata() {
  iccp.clear();
  exif.clear();
  xmp.clear();
  app_markers.clear();
}
2037
// Transfers all user-facing settings from 'param' into the encoder.
// Returns false if the search hook refuses the setup (multi-pass only).
bool Encoder::InitFromParam(const EncoderParam& param) {
  SetQuantMatrices(param.quant_);
  if (param.use_min_quant_) {
    SetMinQuantMatrices(param.min_quant_, param.min_quant_tolerance_);
  } else {
    SetDefaultMinQuantMatrices();
  }

  // Map the boolean options onto the internal method number:
  //   0/1: base (+Huffman), 3/4: +adaptive quantization,
  //   7/8: trellis variants of methods 4 and 6.
  int method = param.Huffman_compress ? 1 : 0;
  if (param.adaptive_quantization) method += 3;
  if (param.use_trellis) {
    method = (method == 4) ? 7 : (method == 6) ? 8 : method;
  }

  SetCompressionMethod(method);
  SetQuantizationBias(param.quantization_bias, param.adaptive_bias);
  SetQuantizationDeltas(param.qdelta_max_luma, param.qdelta_max_chroma);

  SetMetadata(param.iccp, Encoder::ICC);
  SetMetadata(param.exif, Encoder::EXIF);
  SetMetadata(param.xmp, Encoder::XMP);
  SetMetadata(param.app_markers, Encoder::MARKERS);

  // Multi-pass (target size / psnr) search: clamp passes to [1, 20] and
  // install the search hook (or the default one).
  passes_ = (param.passes < 1) ? 1 : (param.passes > 20) ? 20 : param.passes;
  if (passes_ > 1) {
    use_extra_memory_ = true;     // keep coeffs around between passes
    reuse_run_levels_ = true;
    search_hook_ = (param.search_hook == nullptr) ? &default_hook_
                                                  : param.search_hook;
    if (!search_hook_->Setup(param)) return false;
  }

  memory_hook_ = (param.memory == nullptr) ? &kDefaultMemory : param.memory;
  return true;
}
2073
Encode(const uint8_t * rgb,int width,int height,int stride,const EncoderParam & param,ByteSink * sink)2074 bool sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
2075 const EncoderParam& param, ByteSink* sink) {
2076 if (rgb == nullptr || sink == nullptr) return false;
2077 if (width <= 0 || height <= 0 || stride < 3 * width) return false;
2078
2079 Encoder* const enc = EncoderFactory(rgb, width, height, stride,
2080 param.yuv_mode, sink);
2081 const bool ok = (enc != nullptr) &&
2082 enc->InitFromParam(param) &&
2083 enc->Encode();
2084 delete enc;
2085 return ok;
2086 }
2087
Encode(const uint8_t * rgb,int width,int height,int stride,const EncoderParam & param,uint8_t ** out_data)2088 size_t sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
2089 const EncoderParam& param, uint8_t** out_data) {
2090 MemorySink sink(width * height / 4); // estimation of output size
2091 if (!sjpeg::Encode(rgb, width, height, stride, param, &sink)) return 0;
2092 size_t size;
2093 sink.Release(out_data, &size);
2094 return size;
2095 }
2096
2097 ////////////////////////////////////////////////////////////////////////////////
2098 // std::string variants
2099
// std::string variant: compresses into *output (cleared first).
// Returns false on error (null output, invalid dimensions, or failure of
// the underlying encoder).
bool sjpeg::Encode(const uint8_t* rgb, int width, int height, int stride,
                   const EncoderParam& param, std::string* output) {
  if (output == nullptr) return false;
  output->clear();
  output->reserve(width * height / 4);   // rough estimate of final size
  StringSink sink(output);
  return Encode(rgb, width, height, stride, param, &sink);
}
2108
SjpegCompress(const uint8_t * rgb,int width,int height,float quality,std::string * output)2109 bool SjpegCompress(const uint8_t* rgb, int width, int height,
2110 float quality, std::string* output) {
2111 EncoderParam param;
2112 param.SetQuality(quality);
2113 return Encode(rgb, width, height, 3 * width, param, output);
2114 }
2115
2116 ////////////////////////////////////////////////////////////////////////////////
2117
SjpegDimensions(const std::string & jpeg_data,int * width,int * height,int * is_yuv420)2118 bool SjpegDimensions(const std::string& jpeg_data,
2119 int* width, int* height, int* is_yuv420) {
2120 return SjpegDimensions(
2121 reinterpret_cast<const uint8_t*>(jpeg_data.data()),
2122 jpeg_data.size(), width, height, is_yuv420);
2123 }
2124
SjpegFindQuantizer(const std::string & jpeg_data,uint8_t quant[2][64])2125 int SjpegFindQuantizer(const std::string& jpeg_data,
2126 uint8_t quant[2][64]) {
2127 return SjpegFindQuantizer(
2128 reinterpret_cast<const uint8_t*>(jpeg_data.data()), jpeg_data.size(),
2129 quant);
2130 }
2131
2132 ////////////////////////////////////////////////////////////////////////////////
2133