1 /*
2  * jcdctmgr.c
3  *
4  * This file was part of the Independent JPEG Group's software:
5  * Copyright (C) 1994-1996, Thomas G. Lane.
6  * libjpeg-turbo Modifications:
7  * Copyright (C) 1999-2006, MIYASAKA Masaru.
8  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
9  * Copyright (C) 2011, 2014-2015 D. R. Commander
10  * mozjpeg Modifications:
11  * Copyright (C) 2014, Mozilla Corporation.
12  * For conditions of distribution and use, see the accompanying README file.
13  *
14  * This file contains the forward-DCT management logic.
15  * This code selects a particular DCT implementation to be used,
16  * and it performs related housekeeping chores including coefficient
17  * quantization.
18  */
19 
20 #define JPEG_INTERNALS
21 #include "jinclude.h"
22 #include "jpeglib.h"
23 #include "jdct.h"               /* Private declarations for DCT subsystem */
24 #include "jsimddct.h"
25 #include "jchuff.h"
26 #include <assert.h>
27 #include <math.h>
28 
29 
30 /* Private subobject for this module */
31 
32 typedef void (*forward_DCT_method_ptr) (DCTELEM *data);
33 typedef void (*float_DCT_method_ptr) (FAST_FLOAT *data);
34 
35 typedef void (*preprocess_method_ptr)(DCTELEM*, const JQUANT_TBL*);
36 typedef void (*float_preprocess_method_ptr)(FAST_FLOAT*, const JQUANT_TBL*);
37 
38 typedef void (*convsamp_method_ptr) (JSAMPARRAY sample_data,
39                                      JDIMENSION start_col,
40                                      DCTELEM *workspace);
41 typedef void (*float_convsamp_method_ptr) (JSAMPARRAY sample_data,
42                                            JDIMENSION start_col,
43                                            FAST_FLOAT *workspace);
44 
45 typedef void (*quantize_method_ptr) (JCOEFPTR coef_block, DCTELEM *divisors,
46                                      DCTELEM *workspace);
47 typedef void (*float_quantize_method_ptr) (JCOEFPTR coef_block,
48                                            FAST_FLOAT *divisors,
49                                            FAST_FLOAT *workspace);
50 
51 METHODDEF(void) quantize (JCOEFPTR, DCTELEM *, DCTELEM *);
52 
53 typedef struct {
54   struct jpeg_forward_dct pub;  /* public fields */
55 
56   /* Pointer to the DCT routine actually in use */
57   forward_DCT_method_ptr dct;
58   convsamp_method_ptr convsamp;
59   preprocess_method_ptr preprocess;
60   quantize_method_ptr quantize;
61 
62   /* The actual post-DCT divisors --- not identical to the quant table
63    * entries, because of scaling (especially for an unnormalized DCT).
64    * Each table is given in normal array order.
65    */
66   DCTELEM *divisors[NUM_QUANT_TBLS];
67 
68   /* work area for FDCT subroutine */
69   DCTELEM *workspace;
70 
71 #ifdef DCT_FLOAT_SUPPORTED
72   /* Same as above for the floating-point case. */
73   float_DCT_method_ptr float_dct;
74   float_convsamp_method_ptr float_convsamp;
75   float_preprocess_method_ptr float_preprocess;
76   float_quantize_method_ptr float_quantize;
77   FAST_FLOAT *float_divisors[NUM_QUANT_TBLS];
78   FAST_FLOAT *float_workspace;
79 #endif
80 } my_fdct_controller;
81 
82 typedef my_fdct_controller *my_fdct_ptr;
83 
84 
85 #if BITS_IN_JSAMPLE == 8
86 
87 /*
88  * Find the highest bit in an integer through binary search.
89  */
90 
91 LOCAL(int)
flss(UINT16 val)92 flss (UINT16 val)
93 {
94   int bit;
95 
96   bit = 16;
97 
98   if (!val)
99     return 0;
100 
101   if (!(val & 0xff00)) {
102     bit -= 8;
103     val <<= 8;
104   }
105   if (!(val & 0xf000)) {
106     bit -= 4;
107     val <<= 4;
108   }
109   if (!(val & 0xc000)) {
110     bit -= 2;
111     val <<= 2;
112   }
113   if (!(val & 0x8000)) {
114     bit -= 1;
115     val <<= 1;
116   }
117 
118   return bit;
119 }
120 
121 
122 /*
123  * Compute values to do a division using reciprocal.
124  *
125  * This implementation is based on an algorithm described in
126  *   "How to optimize for the Pentium family of microprocessors"
127  *   (http://www.agner.org/assem/).
128  * More information about the basic algorithm can be found in
129  * the paper "Integer Division Using Reciprocals" by Robert Alverson.
130  *
131  * The basic idea is to replace x/d by x * d^-1. In order to store
132  * d^-1 with enough precision we shift it left a few places. It turns
133  * out that this algoright gives just enough precision, and also fits
134  * into DCTELEM:
135  *
136  *   b = (the number of significant bits in divisor) - 1
137  *   r = (word size) + b
138  *   f = 2^r / divisor
139  *
140  * f will not be an integer for most cases, so we need to compensate
141  * for the rounding error introduced:
142  *
143  *   no fractional part:
144  *
145  *       result = input >> r
146  *
147  *   fractional part of f < 0.5:
148  *
149  *       round f down to nearest integer
150  *       result = ((input + 1) * f) >> r
151  *
152  *   fractional part of f > 0.5:
153  *
154  *       round f up to nearest integer
155  *       result = (input * f) >> r
156  *
157  * This is the original algorithm that gives truncated results. But we
158  * want properly rounded results, so we replace "input" with
159  * "input + divisor/2".
160  *
161  * In order to allow SIMD implementations we also tweak the values to
162  * allow the same calculation to be made at all times:
163  *
164  *   dctbl[0] = f rounded to nearest integer
165  *   dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
166  *   dctbl[2] = 1 << ((word size) * 2 - r)
167  *   dctbl[3] = r - (word size)
168  *
169  * dctbl[2] is for stupid instruction sets where the shift operation
170  * isn't member wise (e.g. MMX).
171  *
172  * The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
173  * is that most SIMD implementations have a "multiply and store top
174  * half" operation.
175  *
176  * Lastly, we store each of the values in their own table instead
177  * of in a consecutive manner, yet again in order to allow SIMD
178  * routines.
179  */
180 
181 LOCAL(int)
compute_reciprocal(UINT16 divisor,DCTELEM * dtbl)182 compute_reciprocal (UINT16 divisor, DCTELEM *dtbl)
183 {
184   UDCTELEM2 fq, fr;
185   UDCTELEM c;
186   int b, r;
187 
188   if (divisor == 1) {
189     /* divisor == 1 means unquantized, so these reciprocal/correction/shift
190      * values will cause the C quantization algorithm to act like the
191      * identity function.  Since only the C quantization algorithm is used in
192      * these cases, the scale value is irrelevant.
193      */
194     dtbl[DCTSIZE2 * 0] = (DCTELEM) 1;                       /* reciprocal */
195     dtbl[DCTSIZE2 * 1] = (DCTELEM) 0;                       /* correction */
196     dtbl[DCTSIZE2 * 2] = (DCTELEM) 1;                       /* scale */
197     dtbl[DCTSIZE2 * 3] = -(DCTELEM) (sizeof(DCTELEM) * 8);  /* shift */
198     return 0;
199   }
200 
201   b = flss(divisor) - 1;
202   r  = sizeof(DCTELEM) * 8 + b;
203 
204   fq = ((UDCTELEM2)1 << r) / divisor;
205   fr = ((UDCTELEM2)1 << r) % divisor;
206 
207   c = divisor / 2; /* for rounding */
208 
209   if (fr == 0) { /* divisor is power of two */
210     /* fq will be one bit too large to fit in DCTELEM, so adjust */
211     fq >>= 1;
212     r--;
213   } else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */
214     c++;
215   } else { /* fractional part is > 0.5 */
216     fq++;
217   }
218 
219   dtbl[DCTSIZE2 * 0] = (DCTELEM) fq;      /* reciprocal */
220   dtbl[DCTSIZE2 * 1] = (DCTELEM) c;       /* correction + roundfactor */
221 #ifdef WITH_SIMD
222   dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r));  /* scale */
223 #else
224   dtbl[DCTSIZE2 * 2] = 1;
225 #endif
226   dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
227 
228   if (r <= 16) return 0;
229   else return 1;
230 }
231 
232 #endif
233 
234 
235 /*
236  * Initialize for a processing pass.
237  * Verify that all referenced Q-tables are present, and set up
238  * the divisor table for each one.
239  * In the current implementation, DCT of all components is done during
240  * the first pass, even if only some components will be output in the
241  * first scan.  Hence all components should be examined here.
242  */
243 
244 METHODDEF(void)
start_pass_fdctmgr(j_compress_ptr cinfo)245 start_pass_fdctmgr (j_compress_ptr cinfo)
246 {
247   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
248   int ci, qtblno, i;
249   jpeg_component_info *compptr;
250   JQUANT_TBL *qtbl;
251   DCTELEM *dtbl;
252 
253   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
254        ci++, compptr++) {
255     qtblno = compptr->quant_tbl_no;
256     /* Make sure specified quantization table is present */
257     if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS ||
258         cinfo->quant_tbl_ptrs[qtblno] == NULL)
259       ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
260     qtbl = cinfo->quant_tbl_ptrs[qtblno];
261     /* Compute divisors for this quant table */
262     /* We may do this more than once for same table, but it's not a big deal */
263     switch (cinfo->dct_method) {
264 #ifdef DCT_ISLOW_SUPPORTED
265     case JDCT_ISLOW:
266       /* For LL&M IDCT method, divisors are equal to raw quantization
267        * coefficients multiplied by 8 (to counteract scaling).
268        */
269       if (fdct->divisors[qtblno] == NULL) {
270         fdct->divisors[qtblno] = (DCTELEM *)
271           (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
272                                       (DCTSIZE2 * 4) * sizeof(DCTELEM));
273       }
274       dtbl = fdct->divisors[qtblno];
275       for (i = 0; i < DCTSIZE2; i++) {
276 #if BITS_IN_JSAMPLE == 8
277         if (!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]) &&
278             fdct->quantize == jsimd_quantize)
279           fdct->quantize = quantize;
280 #else
281         dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
282 #endif
283       }
284       break;
285 #endif
286 #ifdef DCT_IFAST_SUPPORTED
287     case JDCT_IFAST:
288       {
289         /* For AA&N IDCT method, divisors are equal to quantization
290          * coefficients scaled by scalefactor[row]*scalefactor[col], where
291          *   scalefactor[0] = 1
292          *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
293          * We apply a further scale factor of 8.
294          */
295 #define CONST_BITS 14
296         static const INT16 aanscales[DCTSIZE2] = {
297           /* precomputed values scaled up by 14 bits */
298           16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
299           22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270,
300           21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906,
301           19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
302           16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
303           12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
304            8867, 12299, 11585, 10426,  8867,  6967,  4799,  2446,
305            4520,  6270,  5906,  5315,  4520,  3552,  2446,  1247
306         };
307         SHIFT_TEMPS
308 
309         if (fdct->divisors[qtblno] == NULL) {
310           fdct->divisors[qtblno] = (DCTELEM *)
311             (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
312                                         (DCTSIZE2 * 4) * sizeof(DCTELEM));
313         }
314         dtbl = fdct->divisors[qtblno];
315         for (i = 0; i < DCTSIZE2; i++) {
316 #if BITS_IN_JSAMPLE == 8
317           if (!compute_reciprocal(
318                 DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
319                                       (JLONG) aanscales[i]),
320                         CONST_BITS-3), &dtbl[i]) &&
321               fdct->quantize == jsimd_quantize)
322             fdct->quantize = quantize;
323 #else
324            dtbl[i] = (DCTELEM)
325              DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
326                                    (JLONG) aanscales[i]),
327                      CONST_BITS-3);
328 #endif
329         }
330       }
331       break;
332 #endif
333 #ifdef DCT_FLOAT_SUPPORTED
334     case JDCT_FLOAT:
335       {
336         /* For float AA&N IDCT method, divisors are equal to quantization
337          * coefficients scaled by scalefactor[row]*scalefactor[col], where
338          *   scalefactor[0] = 1
339          *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
340          * We apply a further scale factor of 8.
341          * What's actually stored is 1/divisor so that the inner loop can
342          * use a multiplication rather than a division.
343          */
344         FAST_FLOAT *fdtbl;
345         int row, col;
346         static const double aanscalefactor[DCTSIZE] = {
347           1.0, 1.387039845, 1.306562965, 1.175875602,
348           1.0, 0.785694958, 0.541196100, 0.275899379
349         };
350 
351         if (fdct->float_divisors[qtblno] == NULL) {
352           fdct->float_divisors[qtblno] = (FAST_FLOAT *)
353             (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
354                                         DCTSIZE2 * sizeof(FAST_FLOAT));
355         }
356         fdtbl = fdct->float_divisors[qtblno];
357         i = 0;
358         for (row = 0; row < DCTSIZE; row++) {
359           for (col = 0; col < DCTSIZE; col++) {
360             fdtbl[i] = (FAST_FLOAT)
361               (1.0 / (((double) qtbl->quantval[i] *
362                        aanscalefactor[row] * aanscalefactor[col] * 8.0)));
363             i++;
364           }
365         }
366       }
367       break;
368 #endif
369     default:
370       ERREXIT(cinfo, JERR_NOT_COMPILED);
371       break;
372     }
373   }
374 }
375 
376 METHODDEF(float)
catmull_rom(const DCTELEM value1,const DCTELEM value2,const DCTELEM value3,const DCTELEM value4,const float t,int size)377 catmull_rom(const DCTELEM value1, const DCTELEM value2, const DCTELEM value3, const DCTELEM value4, const float t, int size)
378 {
379   const int tan1 = (value3 - value1) * size;
380   const int tan2 = (value4 - value2) * size;
381 
382   const float t2 = t * t;
383   const float t3 = t2 * t;
384 
385   const float f1 = 2.f * t3 - 3.f * t2 + 1.f;
386   const float f2 = -2.f * t3 + 3.f * t2;
387   const float f3 = t3 - 2.f * t2 + t;
388   const float f4 = t3 - t2;
389 
390   return value2 * f1 + tan1 * f3 +
391          value3 * f2 + tan2 * f4;
392 }
393 
394 /** Prevents visible ringing artifacts near hard edges on white backgrounds.
395 
396   1. JPEG can encode samples with higher values than it's possible to display (higher than 255 in RGB),
397      and the decoder will always clamp values to 0-255. To encode 255 you can use any value >= 255,
398      and distortions of the out-of-range values won't be visible as long as they decode to anything >= 255.
399 
400   2. From DCT perspective pixels in a block are a waveform. Hard edges form square waves (bad).
401      Edges with white are similar to waveform clipping, and anti-clipping algorithms can turn square waves
402      into softer ones that compress better.
403 
404  */
405 METHODDEF(void)
preprocess_deringing(DCTELEM * data,const JQUANT_TBL * quantization_table)406 preprocess_deringing(DCTELEM *data, const JQUANT_TBL *quantization_table)
407 {
408   const DCTELEM maxsample = 255 - CENTERJSAMPLE;
409   const int size = DCTSIZE * DCTSIZE;
410 
411   /* Decoders don't handle overflow of DC very well, so calculate
412      maximum overflow that is safe to do without increasing DC out of range */
413   int sum = 0;
414   int maxsample_count = 0;
415   int i;
416   DCTELEM maxovershoot;
417   int n;
418 
419   for(i=0; i < size; i++) {
420     sum += data[i];
421     if (data[i] >= maxsample) {
422       maxsample_count++;
423     }
424   }
425 
426   /* If nothing reaches max value there's nothing to overshoot
427      and if the block is completely flat, it's already the best case. */
428   if (!maxsample_count || maxsample_count == size) {
429     return;
430   }
431 
432   /* Too much overshoot is not good: increased amplitude will cost bits, and the cost is proportional to quantization (here using DC quant as a rough guide). */
433   maxovershoot = maxsample + MIN(MIN(31, 2*quantization_table->quantval[0]), (maxsample * size - sum) / maxsample_count);
434 
435   n = 0;
436   do {
437     int start, end, length;
438     DCTELEM f1, f2, l1, l2, fslope, lslope;
439     float step, position;
440 
441     /* Pixels are traversed in zig-zag order to process them as a line */
442     if (data[jpeg_natural_order[n]] < maxsample) {
443       n++;
444       continue;
445     }
446 
447     /* Find a run of maxsample pixels. Start is the first pixel inside the range, end the first pixel outside. */
448     start = n;
449     while(++n < size && data[jpeg_natural_order[n]] >= maxsample) {}
450     end = n;
451 
452     /* the run will be replaced with a catmull-rom interpolation of values from the edges */
453 
454     /* Find suitable upward slope from pixels around edges of the run.
455        Just feeding nearby pixels as catmull rom points isn't good enough,
456        as slope with one sample before the edge may have been flattened by clipping,
457        and slope of two samples before the edge could be downward. */
458     f1 = data[jpeg_natural_order[start >= 1 ? start-1 : 0]];
459     f2 = data[jpeg_natural_order[start >= 2 ? start-2 : 0]];
460 
461     l1 = data[jpeg_natural_order[end < size-1 ? end : size-1]];
462     l2 = data[jpeg_natural_order[end < size-2 ? end+1 : size-1]];
463 
464     fslope = MAX(f1-f2, maxsample-f1);
465     lslope = MAX(l1-l2, maxsample-l1);
466 
467     /* if slope at the start/end is unknown, just make the curve symmetric */
468     if (start == 0) {
469       fslope = lslope;
470     }
471     if (end == size) {
472       lslope = fslope;
473     }
474 
475     /* The curve fits better if first and last pixel is omitted */
476     length = end - start;
477     step = 1.f/(float)(length + 1);
478     position = step;
479 
480     for(i = start; i < end; i++, position += step) {
481       DCTELEM tmp = ceilf(catmull_rom(maxsample - fslope, maxsample, maxsample, maxsample - lslope, position, length));
482       data[jpeg_natural_order[i]] = MIN(tmp, maxovershoot);
483     }
484     n++;
485   }
486   while(n < size);
487 }
488 
489 /*
490   Float version of preprocess_deringing()
491  */
492 METHODDEF(void)
float_preprocess_deringing(FAST_FLOAT * data,const JQUANT_TBL * quantization_table)493 float_preprocess_deringing(FAST_FLOAT *data, const JQUANT_TBL *quantization_table)
494 {
495   const FAST_FLOAT maxsample = 255 - CENTERJSAMPLE;
496   const int size = DCTSIZE * DCTSIZE;
497 
498   FAST_FLOAT sum = 0;
499   int maxsample_count = 0;
500   int i;
501   int n;
502   FAST_FLOAT maxovershoot;
503 
504   for(i=0; i < size; i++) {
505     sum += data[i];
506     if (data[i] >= maxsample) {
507       maxsample_count++;
508     }
509   }
510 
511   if (!maxsample_count || maxsample_count == size) {
512     return;
513   }
514 
515   maxovershoot = maxsample + MIN(MIN(31, 2*quantization_table->quantval[0]), (maxsample * size - sum) / maxsample_count);
516 
517   n = 0;
518   do {
519     int start, end, length;
520     FAST_FLOAT f1, f2, l1, l2, fslope, lslope;
521     float step, position;
522 
523     if (data[jpeg_natural_order[n]] < maxsample) {
524       n++;
525       continue;
526     }
527 
528     start = n;
529     while(++n < size && data[jpeg_natural_order[n]] >= maxsample) {}
530     end = n;
531 
532     f1 = data[jpeg_natural_order[start >= 1 ? start-1 : 0]];
533     f2 = data[jpeg_natural_order[start >= 2 ? start-2 : 0]];
534 
535     l1 = data[jpeg_natural_order[end < size-1 ? end : size-1]];
536     l2 = data[jpeg_natural_order[end < size-2 ? end+1 : size-1]];
537 
538     fslope = MAX(f1-f2, maxsample-f1);
539     lslope = MAX(l1-l2, maxsample-l1);
540 
541     if (start == 0) {
542       fslope = lslope;
543     }
544     if (end == size) {
545       lslope = fslope;
546     }
547 
548     length = end - start;
549     step = 1.f/(float)(length + 1);
550     position = step;
551 
552     for(i = start; i < end; i++, position += step) {
553       FAST_FLOAT tmp = catmull_rom(maxsample - fslope, maxsample, maxsample, maxsample - lslope, position, length);
554       data[jpeg_natural_order[i]] = MIN(tmp, maxovershoot);
555     }
556     n++;
557   }
558   while(n < size);
559 }
560 
561 /*
562  * Load data into workspace, applying unsigned->signed conversion.
563  */
564 
565 METHODDEF(void)
convsamp(JSAMPARRAY sample_data,JDIMENSION start_col,DCTELEM * workspace)566 convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
567 {
568   register DCTELEM *workspaceptr;
569   register JSAMPROW elemptr;
570   register int elemr;
571 
572   workspaceptr = workspace;
573   for (elemr = 0; elemr < DCTSIZE; elemr++) {
574     elemptr = sample_data[elemr] + start_col;
575 
576 #if DCTSIZE == 8                /* unroll the inner loop */
577     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
578     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
579     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
580     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
581     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
582     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
583     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
584     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
585 #else
586     {
587       register int elemc;
588       for (elemc = DCTSIZE; elemc > 0; elemc--)
589         *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
590     }
591 #endif
592   }
593 }
594 
595 
596 /*
597  * Quantize/descale the coefficients, and store into coef_blocks[].
598  */
599 
600 METHODDEF(void)
quantize(JCOEFPTR coef_block,DCTELEM * divisors,DCTELEM * workspace)601 quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
602 {
603   int i;
604   DCTELEM temp;
605   JCOEFPTR output_ptr = coef_block;
606 
607 #if BITS_IN_JSAMPLE == 8
608 
609   UDCTELEM recip, corr;
610   int shift;
611   UDCTELEM2 product;
612 
613   for (i = 0; i < DCTSIZE2; i++) {
614     temp = workspace[i];
615     recip = divisors[i + DCTSIZE2 * 0];
616     corr =  divisors[i + DCTSIZE2 * 1];
617     shift = divisors[i + DCTSIZE2 * 3];
618 
619     if (temp < 0) {
620       temp = -temp;
621       product = (UDCTELEM2)(temp + corr) * recip;
622       product >>= shift + sizeof(DCTELEM)*8;
623       temp = (DCTELEM)product;
624       temp = -temp;
625     } else {
626       product = (UDCTELEM2)(temp + corr) * recip;
627       product >>= shift + sizeof(DCTELEM)*8;
628       temp = (DCTELEM)product;
629     }
630     output_ptr[i] = (JCOEF) temp;
631   }
632 
633 #else
634 
635   register DCTELEM qval;
636 
637   for (i = 0; i < DCTSIZE2; i++) {
638     qval = divisors[i];
639     temp = workspace[i];
640     /* Divide the coefficient value by qval, ensuring proper rounding.
641      * Since C does not specify the direction of rounding for negative
642      * quotients, we have to force the dividend positive for portability.
643      *
644      * In most files, at least half of the output values will be zero
645      * (at default quantization settings, more like three-quarters...)
646      * so we should ensure that this case is fast.  On many machines,
647      * a comparison is enough cheaper than a divide to make a special test
648      * a win.  Since both inputs will be nonnegative, we need only test
649      * for a < b to discover whether a/b is 0.
650      * If your machine's division is fast enough, define FAST_DIVIDE.
651      */
652 #ifdef FAST_DIVIDE
653 #define DIVIDE_BY(a,b)  a /= b
654 #else
655 #define DIVIDE_BY(a,b)  if (a >= b) a /= b; else a = 0
656 #endif
657     if (temp < 0) {
658       temp = -temp;
659       temp += qval>>1;  /* for rounding */
660       DIVIDE_BY(temp, qval);
661       temp = -temp;
662     } else {
663       temp += qval>>1;  /* for rounding */
664       DIVIDE_BY(temp, qval);
665     }
666     output_ptr[i] = (JCOEF) temp;
667   }
668 
669 #endif
670 
671 }
672 
673 
674 /*
675  * Perform forward DCT on one or more blocks of a component.
676  *
677  * The input samples are taken from the sample_data[] array starting at
678  * position start_row/start_col, and moving to the right for any additional
679  * blocks. The quantized coefficients are returned in coef_blocks[].
680  */
681 
682 METHODDEF(void)
forward_DCT(j_compress_ptr cinfo,jpeg_component_info * compptr,JSAMPARRAY sample_data,JBLOCKROW coef_blocks,JDIMENSION start_row,JDIMENSION start_col,JDIMENSION num_blocks,JBLOCKROW dst)683 forward_DCT (j_compress_ptr cinfo, jpeg_component_info *compptr,
684              JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
685              JDIMENSION start_row, JDIMENSION start_col,
686              JDIMENSION num_blocks, JBLOCKROW dst)
687 /* This version is used for integer DCT implementations. */
688 {
689   /* This routine is heavily used, so it's worth coding it tightly. */
690   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
691   DCTELEM *divisors = fdct->divisors[compptr->quant_tbl_no];
692   JQUANT_TBL *qtbl = cinfo->quant_tbl_ptrs[compptr->quant_tbl_no];
693   DCTELEM *workspace;
694   JDIMENSION bi;
695 
696   /* Make sure the compiler doesn't look up these every pass */
697   forward_DCT_method_ptr do_dct = fdct->dct;
698   convsamp_method_ptr do_convsamp = fdct->convsamp;
699   preprocess_method_ptr do_preprocess = fdct->preprocess;
700   quantize_method_ptr do_quantize = fdct->quantize;
701   workspace = fdct->workspace;
702 
703   sample_data += start_row;     /* fold in the vertical offset once */
704 
705   for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
706     /* Load data into workspace, applying unsigned->signed conversion */
707     (*do_convsamp) (sample_data, start_col, workspace);
708 
709     if (do_preprocess) {
710       (*do_preprocess) (workspace, qtbl);
711     }
712 
713     /* Perform the DCT */
714     (*do_dct) (workspace);
715 
716     /* Save unquantized transform coefficients for later trellis quantization */
717     if (dst) {
718       int i;
719       if (cinfo->dct_method == JDCT_IFAST) {
720         static const INT16 aanscales[DCTSIZE2] = {
721           /* precomputed values scaled up by 14 bits */
722           16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
723           22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270,
724           21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906,
725           19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
726           16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
727           12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
728           8867, 12299, 11585, 10426,  8867,  6967,  4799,  2446,
729           4520,  6270,  5906,  5315,  4520,  3552,  2446,  1247
730         };
731 
732         for (i = 0; i < DCTSIZE2; i++) {
733           int x = workspace[i];
734           int s = aanscales[i];
735           x = (x >= 0) ? (x * 32768 + s) / (2*s) : (x * 32768 - s) / (2*s);
736           dst[bi][i] = x;
737         }
738 
739       } else {
740         for (i = 0; i < DCTSIZE2; i++) {
741           dst[bi][i] = workspace[i];
742         }
743       }
744     }
745 
746     /* Quantize/descale the coefficients, and store into coef_blocks[] */
747     (*do_quantize) (coef_blocks[bi], divisors, workspace);
748 
749     if (do_preprocess) {
750       int i;
751       int maxval = (1 << MAX_COEF_BITS) - 1;
752       for (i = 0; i < 64; i++) {
753         if (coef_blocks[bi][i] < -maxval)
754           coef_blocks[bi][i] = -maxval;
755         if (coef_blocks[bi][i] > maxval)
756           coef_blocks[bi][i] = maxval;
757   }
758   }
759 }
760 }
761 
762 
763 #ifdef DCT_FLOAT_SUPPORTED
764 
765 METHODDEF(void)
convsamp_float(JSAMPARRAY sample_data,JDIMENSION start_col,FAST_FLOAT * workspace)766 convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
767                FAST_FLOAT *workspace)
768 {
769   register FAST_FLOAT *workspaceptr;
770   register JSAMPROW elemptr;
771   register int elemr;
772 
773   workspaceptr = workspace;
774   for (elemr = 0; elemr < DCTSIZE; elemr++) {
775     elemptr = sample_data[elemr] + start_col;
776 #if DCTSIZE == 8                /* unroll the inner loop */
777     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
778     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
779     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
780     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
781     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
782     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
783     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
784     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
785 #else
786     {
787       register int elemc;
788       for (elemc = DCTSIZE; elemc > 0; elemc--)
789         *workspaceptr++ = (FAST_FLOAT)
790                           (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
791     }
792 #endif
793   }
794 }
795 
796 
797 METHODDEF(void)
quantize_float(JCOEFPTR coef_block,FAST_FLOAT * divisors,FAST_FLOAT * workspace)798 quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
799                FAST_FLOAT *workspace)
800 {
801   register FAST_FLOAT temp;
802   register int i;
803   register JCOEFPTR output_ptr = coef_block;
804 
805   for (i = 0; i < DCTSIZE2; i++) {
806     /* Apply the quantization and scaling factor */
807     temp = workspace[i] * divisors[i];
808 
809     /* Round to nearest integer.
810      * Since C does not specify the direction of rounding for negative
811      * quotients, we have to force the dividend positive for portability.
812      * The maximum coefficient size is +-16K (for 12-bit data), so this
813      * code should work for either 16-bit or 32-bit ints.
814      */
815     output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
816   }
817 }
818 
819 
820 METHODDEF(void)
forward_DCT_float(j_compress_ptr cinfo,jpeg_component_info * compptr,JSAMPARRAY sample_data,JBLOCKROW coef_blocks,JDIMENSION start_row,JDIMENSION start_col,JDIMENSION num_blocks,JBLOCKROW dst)821 forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info *compptr,
822                    JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
823                    JDIMENSION start_row, JDIMENSION start_col,
824                    JDIMENSION num_blocks, JBLOCKROW dst)
825 /* This version is used for floating-point DCT implementations. */
826 {
827   /* This routine is heavily used, so it's worth coding it tightly. */
828   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
829   FAST_FLOAT *divisors = fdct->float_divisors[compptr->quant_tbl_no];
830   JQUANT_TBL *qtbl = cinfo->quant_tbl_ptrs[compptr->quant_tbl_no];
831   FAST_FLOAT *workspace;
832   JDIMENSION bi;
833   float v;
834   int x;
835 
836 
837   /* Make sure the compiler doesn't look up these every pass */
838   float_DCT_method_ptr do_dct = fdct->float_dct;
839   float_convsamp_method_ptr do_convsamp = fdct->float_convsamp;
840   float_preprocess_method_ptr do_preprocess = fdct->float_preprocess;
841   float_quantize_method_ptr do_quantize = fdct->float_quantize;
842   workspace = fdct->float_workspace;
843 
844   sample_data += start_row;     /* fold in the vertical offset once */
845 
846   for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
847     /* Load data into workspace, applying unsigned->signed conversion */
848     (*do_convsamp) (sample_data, start_col, workspace);
849 
850     if (do_preprocess) {
851       (*do_preprocess) (workspace, qtbl);
852     }
853 
854     /* Perform the DCT */
855     (*do_dct) (workspace);
856 
857     /* Save unquantized transform coefficients for later trellis quantization */
858     /* Currently save as integer values. Could save float values but would require */
859     /* modifications to memory allocation and trellis quantization */
860 
861     if (dst) {
862       int i;
863       static const double aanscalefactor[DCTSIZE] = {
864         1.0, 1.387039845, 1.306562965, 1.175875602,
865         1.0, 0.785694958, 0.541196100, 0.275899379
866       };
867 
868       for (i = 0; i < DCTSIZE2; i++) {
869         v = workspace[i];
870         v /= aanscalefactor[i%8];
871         v /= aanscalefactor[i/8];
872         x = (v >= 0.0) ? (int)(v + 0.5) : (int)(v - 0.5);
873         dst[bi][i] = x;
874       }
875     }
876 
877     /* Quantize/descale the coefficients, and store into coef_blocks[] */
878     (*do_quantize) (coef_blocks[bi], divisors, workspace);
879 
880     if (do_preprocess) {
881       int i;
882       int maxval = (1 << MAX_COEF_BITS) - 1;
883       for (i = 0; i < 64; i++) {
884         if (coef_blocks[bi][i] < -maxval)
885           coef_blocks[bi][i] = -maxval;
886         if (coef_blocks[bi][i] > maxval)
887           coef_blocks[bi][i] = maxval;
888   }
889 }
890   }
891 }
892 
893 #endif /* DCT_FLOAT_SUPPORTED */
894 
895 #include "jpeg_nbits_table.h"
896 
897 static const float jpeg_lambda_weights_flat[64] = {
898   1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
899   1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
900   1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
901   1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
902   1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
903   1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
904   1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
905   1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
906 };
907 
908 static const float jpeg_lambda_weights_csf_luma[64] = {
909   3.35630f, 3.59892f, 3.20921f, 2.28102f, 1.42378f, 0.88079f, 0.58190f, 0.43454f,
910   3.59893f, 3.21284f, 2.71282f, 1.98092f, 1.30506f, 0.83852f, 0.56346f, 0.42146f,
911   3.20921f, 2.71282f, 2.12574f, 1.48616f, 0.99660f, 0.66132f, 0.45610f, 0.34609f,
912   2.28102f, 1.98092f, 1.48616f, 0.97492f, 0.64622f, 0.43812f, 0.31074f, 0.24072f,
913   1.42378f, 1.30506f, 0.99660f, 0.64623f, 0.42051f, 0.28446f, 0.20380f, 0.15975f,
914   0.88079f, 0.83852f, 0.66132f, 0.43812f, 0.28446f, 0.19092f, 0.13635f, 0.10701f,
915   0.58190f, 0.56346f, 0.45610f, 0.31074f, 0.20380f, 0.13635f, 0.09674f, 0.07558f,
916   0.43454f, 0.42146f, 0.34609f, 0.24072f, 0.15975f, 0.10701f, 0.07558f, 0.05875f,
917 };
918 
919 #define DC_TRELLIS_MAX_CANDIDATES 9
920 
get_num_dc_trellis_candidates(int dc_quantval)921 LOCAL(int) get_num_dc_trellis_candidates(int dc_quantval) {
922   /* Higher qualities can tolerate higher DC distortion */
923   return MIN(DC_TRELLIS_MAX_CANDIDATES, (2 + 60 / dc_quantval)|1);
924 }
925 
926 GLOBAL(void)
quantize_trellis(j_compress_ptr cinfo,c_derived_tbl * dctbl,c_derived_tbl * actbl,JBLOCKROW coef_blocks,JBLOCKROW src,JDIMENSION num_blocks,JQUANT_TBL * qtbl,double * norm_src,double * norm_coef,JCOEF * last_dc_val,JBLOCKROW coef_blocks_above,JBLOCKROW src_above)927 quantize_trellis(j_compress_ptr cinfo, c_derived_tbl *dctbl, c_derived_tbl *actbl, JBLOCKROW coef_blocks, JBLOCKROW src, JDIMENSION num_blocks,
928                  JQUANT_TBL * qtbl, double *norm_src, double *norm_coef, JCOEF *last_dc_val,
929                  JBLOCKROW coef_blocks_above, JBLOCKROW src_above)
930 {
931   int i, j, k, l;
932   float accumulated_zero_dist[DCTSIZE2];
933   float accumulated_cost[DCTSIZE2];
934   int run_start[DCTSIZE2];
935   int bi;
936   float best_cost;
937   int last_coeff_idx; /* position of last nonzero coefficient */
938   float norm = 0.0;
939   float lambda_base;
940   float lambda;
941   float lambda_dc;
942   const float *lambda_tbl = (cinfo->master->use_lambda_weight_tbl) ?
943                             jpeg_lambda_weights_csf_luma :
944                             jpeg_lambda_weights_flat;
945   int Ss, Se;
946   float *accumulated_zero_block_cost = NULL;
947   float *accumulated_block_cost = NULL;
948   int *block_run_start = NULL;
949   int *requires_eob = NULL;
950   int has_eob;
951   float cost_all_zeros;
952   float best_cost_skip;
953   float cost;
954   int zero_run;
955   int run_bits;
956   int rate;
957   float *accumulated_dc_cost[DC_TRELLIS_MAX_CANDIDATES];
958   int *dc_cost_backtrack[DC_TRELLIS_MAX_CANDIDATES];
959   JCOEF *dc_candidate[DC_TRELLIS_MAX_CANDIDATES];
960   int mode = 1;
961   float lambda_table[DCTSIZE2];
962   const int dc_trellis_candidates = get_num_dc_trellis_candidates(qtbl->quantval[0]);
963 
964   Ss = cinfo->Ss;
965   Se = cinfo->Se;
966   if (Ss == 0)
967     Ss = 1;
968   if (Se < Ss)
969     return;
970   if (cinfo->master->trellis_eob_opt) {
971     accumulated_zero_block_cost = (float *)malloc((num_blocks + 1) * sizeof(float));
972     accumulated_block_cost = (float *)malloc((num_blocks + 1) * sizeof(float));
973     block_run_start = (int *)malloc(num_blocks * sizeof(int));
974     requires_eob = (int *)malloc((num_blocks + 1) * sizeof(int));
975     if (!accumulated_zero_block_cost ||
976         !accumulated_block_cost ||
977         !block_run_start ||
978         !requires_eob) {
979       ERREXIT(cinfo, JERR_OUT_OF_MEMORY);
980     }
981 
982     accumulated_zero_block_cost[0] = 0;
983     accumulated_block_cost[0] = 0;
984     requires_eob[0] = 0;
985   }
986 
987   if (cinfo->master->trellis_quant_dc) {
988     for (i = 0; i < dc_trellis_candidates; i++) {
989       accumulated_dc_cost[i] = (float *)malloc(num_blocks * sizeof(float));
990       dc_cost_backtrack[i] = (int *)malloc(num_blocks * sizeof(int));
991       dc_candidate[i] = (JCOEF *)malloc(num_blocks * sizeof(JCOEF));
992       if (!accumulated_dc_cost[i] ||
993           !dc_cost_backtrack[i] ||
994           !dc_candidate[i]) {
995         ERREXIT(cinfo, JERR_OUT_OF_MEMORY);
996       }
997     }
998   }
999 
1000   norm = 0.0;
1001   for (i = 1; i < DCTSIZE2; i++) {
1002     norm += qtbl->quantval[i] * qtbl->quantval[i];
1003   }
1004   norm /= 63.0;
1005 
1006   if (mode == 1) {
1007     lambda_base = 1.0;
1008     lambda_tbl = lambda_table;
1009     for (i = 0; i < DCTSIZE2; i++)
1010       lambda_table[i] = 1.0 / (qtbl->quantval[i] * qtbl->quantval[i]);
1011   } else
1012     lambda_base = 1.0 / norm;
1013 
1014   for (bi = 0; bi < num_blocks; bi++) {
1015 
1016     norm = 0.0;
1017     for (i = 1; i < DCTSIZE2; i++) {
1018       norm += src[bi][i] * src[bi][i];
1019     }
1020     norm /= 63.0;
1021 
1022     if (cinfo->master->lambda_log_scale2 > 0.0)
1023       lambda = pow(2.0, cinfo->master->lambda_log_scale1) * lambda_base /
1024                    (pow(2.0, cinfo->master->lambda_log_scale2) + norm);
1025     else
1026       lambda = pow(2.0, cinfo->master->lambda_log_scale1 - 12.0) * lambda_base;
1027 
1028     lambda_dc = lambda * lambda_tbl[0];
1029 
1030     accumulated_zero_dist[Ss-1] = 0.0;
1031     accumulated_cost[Ss-1] = 0.0;
1032 
1033     /* Do DC coefficient */
1034     if (cinfo->master->trellis_quant_dc) {
1035       int sign = src[bi][0] >> 31;
1036       int x = abs(src[bi][0]);
1037       int q = 8 * qtbl->quantval[0];
1038       int qval;
1039       float dc_candidate_dist;
1040 
1041       qval = (x + q/2) / q; /* quantized value (round nearest) */
1042       for (k = 0; k < dc_trellis_candidates; k++) {
1043         int delta;
1044         int dc_delta;
1045         int bits;
1046 
1047         dc_candidate[k][bi] = qval - dc_trellis_candidates/2 + k;
1048         if (dc_candidate[k][bi] >= (1<<MAX_COEF_BITS))
1049           dc_candidate[k][bi] = (1<<MAX_COEF_BITS)-1;
1050         if (dc_candidate[k][bi] <= -(1<<MAX_COEF_BITS))
1051           dc_candidate[k][bi] = -(1<<MAX_COEF_BITS)+1;
1052 
1053         delta = dc_candidate[k][bi] * q - x;
1054         dc_candidate_dist = delta * delta * lambda_dc;
1055         dc_candidate[k][bi] *= 1 + 2*sign;
1056 
1057         /* Take into account DC differences */
1058         if (coef_blocks_above && src_above && cinfo->master->trellis_delta_dc_weight > 0.0) {
1059           int dc_above_orig;
1060           int dc_above_recon;
1061           int dc_orig;
1062           int dc_recon;
1063           float vertical_dist;
1064 
1065           dc_above_orig = src_above[bi][0];
1066           dc_above_recon = coef_blocks_above[bi][0] * q;
1067           dc_orig = src[bi][0];
1068           dc_recon = dc_candidate[k][bi] * q;
1069           /* delta is difference of vertical gradients */
1070           delta = (dc_above_orig - dc_orig) - (dc_above_recon - dc_recon);
1071           vertical_dist = delta * delta * lambda_dc;
1072           dc_candidate_dist +=  cinfo->master->trellis_delta_dc_weight * (vertical_dist - dc_candidate_dist);
1073         }
1074 
1075         if (bi == 0) {
1076           dc_delta = dc_candidate[k][bi] - *last_dc_val;
1077 
1078           /* Derive number of suffix bits */
1079           bits = 0;
1080           dc_delta = abs(dc_delta);
1081           while (dc_delta) {
1082             dc_delta >>= 1;
1083             bits++;
1084           }
1085           cost = bits + dctbl->ehufsi[bits] + dc_candidate_dist;
1086           accumulated_dc_cost[k][0] = cost;
1087           dc_cost_backtrack[k][0] = -1;
1088         } else {
1089           for (l = 0; l < dc_trellis_candidates; l++) {
1090             dc_delta = dc_candidate[k][bi] - dc_candidate[l][bi-1];
1091 
1092             /* Derive number of suffix bits */
1093             bits = 0;
1094             dc_delta = abs(dc_delta);
1095             while (dc_delta) {
1096               dc_delta >>= 1;
1097               bits++;
1098             }
1099             cost = bits + dctbl->ehufsi[bits] + dc_candidate_dist + accumulated_dc_cost[l][bi-1];
1100             if (l == 0 || cost < accumulated_dc_cost[k][bi]) {
1101               accumulated_dc_cost[k][bi] = cost;
1102               dc_cost_backtrack[k][bi] = l;
1103             }
1104           }
1105         }
1106       }
1107     }
1108 
1109     /* Do AC coefficients */
1110     for (i = Ss; i <= Se; i++) {
1111       int z = jpeg_natural_order[i];
1112 
1113       int sign = src[bi][z] >> 31;
1114       int x = abs(src[bi][z]);
1115       int q = 8 * qtbl->quantval[z];
1116       int candidate[16];
1117       int candidate_bits[16];
1118       float candidate_dist[16];
1119       int num_candidates;
1120       int qval;
1121 
1122       accumulated_zero_dist[i] = x * x * lambda * lambda_tbl[z] + accumulated_zero_dist[i-1];
1123 
1124       qval = (x + q/2) / q; /* quantized value (round nearest) */
1125 
1126       if (qval == 0) {
1127         coef_blocks[bi][z] = 0;
1128         accumulated_cost[i] = 1e38; /* Shouldn't be needed */
1129         continue;
1130       }
1131 
1132       if (qval >= (1<<MAX_COEF_BITS))
1133         qval = (1<<MAX_COEF_BITS)-1;
1134 
1135       num_candidates = jpeg_nbits_table[qval];
1136       for (k = 0; k < num_candidates; k++) {
1137         int delta;
1138         candidate[k] = (k < num_candidates - 1) ? (2 << k) - 1 : qval;
1139         delta = candidate[k] * q - x;
1140         candidate_bits[k] = k+1;
1141         candidate_dist[k] = delta * delta * lambda * lambda_tbl[z];
1142       }
1143 
1144       accumulated_cost[i] = 1e38;
1145 
1146       for (j = Ss-1; j < i; j++) {
1147         int zz = jpeg_natural_order[j];
1148         if (j != Ss-1 && coef_blocks[bi][zz] == 0)
1149           continue;
1150 
1151         zero_run = i - 1 - j;
1152         if ((zero_run >> 4) && actbl->ehufsi[0xf0] == 0)
1153           continue;
1154 
1155         run_bits = (zero_run >> 4) * actbl->ehufsi[0xf0];
1156         zero_run &= 15;
1157 
1158         for (k = 0; k < num_candidates; k++) {
1159           int coef_bits = actbl->ehufsi[16 * zero_run + candidate_bits[k]];
1160           if (coef_bits == 0)
1161             continue;
1162 
1163           rate = coef_bits + candidate_bits[k] + run_bits;
1164           cost = rate + candidate_dist[k];
1165           cost += accumulated_zero_dist[i-1] - accumulated_zero_dist[j] + accumulated_cost[j];
1166 
1167           if (cost < accumulated_cost[i]) {
1168             coef_blocks[bi][z] = (candidate[k] ^ sign) - sign;
1169             accumulated_cost[i] = cost;
1170             run_start[i] = j;
1171           }
1172         }
1173       }
1174     }
1175 
1176     last_coeff_idx = Ss-1;
1177     best_cost = accumulated_zero_dist[Se] + actbl->ehufsi[0];
1178     cost_all_zeros = accumulated_zero_dist[Se];
1179     best_cost_skip = cost_all_zeros;
1180 
1181     for (i = Ss; i <= Se; i++) {
1182       int z = jpeg_natural_order[i];
1183       if (coef_blocks[bi][z] != 0) {
1184         float cost = accumulated_cost[i] + accumulated_zero_dist[Se] - accumulated_zero_dist[i];
1185         float cost_wo_eob = cost;
1186 
1187         if (i < Se)
1188           cost += actbl->ehufsi[0];
1189 
1190         if (cost < best_cost) {
1191           best_cost = cost;
1192           last_coeff_idx = i;
1193           best_cost_skip = cost_wo_eob;
1194         }
1195       }
1196     }
1197 
1198     has_eob = (last_coeff_idx < Se) + (last_coeff_idx == Ss-1);
1199 
1200     /* Zero out coefficients that are part of runs */
1201     i = Se;
1202     while (i >= Ss)
1203     {
1204       while (i > last_coeff_idx) {
1205         int z = jpeg_natural_order[i];
1206         coef_blocks[bi][z] = 0;
1207         i--;
1208       }
1209       last_coeff_idx = run_start[i];
1210       i--;
1211     }
1212 
1213     if (cinfo->master->trellis_eob_opt) {
1214       accumulated_zero_block_cost[bi+1] = accumulated_zero_block_cost[bi];
1215       accumulated_zero_block_cost[bi+1] += cost_all_zeros;
1216       requires_eob[bi+1] = has_eob;
1217 
1218       best_cost = 1e38;
1219 
1220       if (has_eob != 2) {
1221         for (i = 0; i <= bi; i++) {
1222           int zero_block_run;
1223           int nbits;
1224           float cost;
1225 
1226           if (requires_eob[i] == 2)
1227             continue;
1228 
1229           cost = best_cost_skip; /* cost of coding a nonzero block */
1230           cost += accumulated_zero_block_cost[bi];
1231           cost -= accumulated_zero_block_cost[i];
1232           cost += accumulated_block_cost[i];
1233           zero_block_run = bi - i + requires_eob[i];
1234           nbits = jpeg_nbits_table[zero_block_run];
1235           cost += actbl->ehufsi[16*nbits] + nbits;
1236 
1237           if (cost < best_cost) {
1238             block_run_start[bi] = i;
1239             best_cost = cost;
1240             accumulated_block_cost[bi+1] = cost;
1241           }
1242         }
1243       }
1244     }
1245   }
1246 
1247   if (cinfo->master->trellis_eob_opt) {
1248     int last_block = num_blocks;
1249     best_cost = 1e38;
1250 
1251     for (i = 0; i <= num_blocks; i++) {
1252       int zero_block_run;
1253       int nbits;
1254       float cost = 0.0;
1255 
1256       if (requires_eob[i] == 2)
1257         continue;
1258 
1259       cost += accumulated_zero_block_cost[num_blocks];
1260       cost -= accumulated_zero_block_cost[i];
1261       zero_block_run = num_blocks - i + requires_eob[i];
1262       nbits = jpeg_nbits_table[zero_block_run];
1263       cost += actbl->ehufsi[16*nbits] + nbits;
1264       if (cost < best_cost) {
1265         best_cost = cost;
1266         last_block = i;
1267       }
1268     }
1269     last_block--;
1270     bi = num_blocks - 1;
1271     while (bi >= 0) {
1272       while (bi > last_block) {
1273         for (j = Ss; j <= Se; j++) {
1274           int z = jpeg_natural_order[j];
1275           coef_blocks[bi][z] = 0;
1276         }
1277         bi--;
1278       }
1279       last_block = block_run_start[bi]-1;
1280       bi--;
1281     }
1282     free(accumulated_zero_block_cost);
1283     free(accumulated_block_cost);
1284     free(block_run_start);
1285     free(requires_eob);
1286   }
1287 
1288   if (cinfo->master->trellis_q_opt) {
1289     for (bi = 0; bi < num_blocks; bi++) {
1290       for (i = 1; i < DCTSIZE2; i++) {
1291         norm_src[i] += src[bi][i] * coef_blocks[bi][i];
1292         norm_coef[i] += 8 * coef_blocks[bi][i] * coef_blocks[bi][i];
1293       }
1294     }
1295   }
1296 
1297   if (cinfo->master->trellis_quant_dc) {
1298     j = 0;
1299     for (i = 1; i < dc_trellis_candidates; i++) {
1300       if (accumulated_dc_cost[i][num_blocks-1] < accumulated_dc_cost[j][num_blocks-1])
1301         j = i;
1302     }
1303     for (bi = num_blocks-1; bi >= 0; bi--) {
1304       coef_blocks[bi][0] = dc_candidate[j][bi];
1305       j = dc_cost_backtrack[j][bi];
1306     }
1307 
1308     /* Save DC predictor */
1309     *last_dc_val = coef_blocks[num_blocks-1][0];
1310 
1311     for (i = 0; i < dc_trellis_candidates; i++) {
1312       free(accumulated_dc_cost[i]);
1313       free(dc_cost_backtrack[i]);
1314       free(dc_candidate[i]);
1315     }
1316   }
1317 
1318 }
1319 
1320 #ifdef C_ARITH_CODING_SUPPORTED
1321 GLOBAL(void)
quantize_trellis_arith(j_compress_ptr cinfo,arith_rates * r,JBLOCKROW coef_blocks,JBLOCKROW src,JDIMENSION num_blocks,JQUANT_TBL * qtbl,double * norm_src,double * norm_coef,JCOEF * last_dc_val,JBLOCKROW coef_blocks_above,JBLOCKROW src_above)1322 quantize_trellis_arith(j_compress_ptr cinfo, arith_rates *r, JBLOCKROW coef_blocks, JBLOCKROW src, JDIMENSION num_blocks,
1323                  JQUANT_TBL * qtbl, double *norm_src, double *norm_coef, JCOEF *last_dc_val,
1324                  JBLOCKROW coef_blocks_above, JBLOCKROW src_above)
1325 {
1326   int i, j, k, l;
1327   float accumulated_zero_dist[DCTSIZE2];
1328   float accumulated_cost[DCTSIZE2];
1329   int run_start[DCTSIZE2];
1330   int bi;
1331   float best_cost;
1332   int last_coeff_idx; /* position of last nonzero coefficient */
1333   float norm = 0.0;
1334   float lambda_base;
1335   float lambda;
1336   float lambda_dc;
1337   const float *lambda_tbl = (cinfo->master->use_lambda_weight_tbl) ?
1338   jpeg_lambda_weights_csf_luma :
1339   jpeg_lambda_weights_flat;
1340   int Ss, Se;
1341   float cost;
1342   float run_bits;
1343   int rate;
1344   float *accumulated_dc_cost[DC_TRELLIS_MAX_CANDIDATES];
1345   int *dc_cost_backtrack[DC_TRELLIS_MAX_CANDIDATES];
1346   JCOEF *dc_candidate[DC_TRELLIS_MAX_CANDIDATES];
1347   int *dc_context[DC_TRELLIS_MAX_CANDIDATES];
1348 
1349   int mode = 1;
1350   float lambda_table[DCTSIZE2];
1351   const int dc_trellis_candidates = get_num_dc_trellis_candidates(qtbl->quantval[0]);
1352 
1353   Ss = cinfo->Ss;
1354   Se = cinfo->Se;
1355   if (Ss == 0)
1356     Ss = 1;
1357   if (Se < Ss)
1358     return;
1359 
1360   if (cinfo->master->trellis_quant_dc) {
1361     for (i = 0; i < dc_trellis_candidates; i++) {
1362       accumulated_dc_cost[i] = (float *)malloc(num_blocks * sizeof(float));
1363       dc_cost_backtrack[i] = (int *)malloc(num_blocks * sizeof(int));
1364       dc_candidate[i] = (JCOEF *)malloc(num_blocks * sizeof(JCOEF));
1365       dc_context[i] = (int *)malloc(num_blocks * sizeof(int));
1366       if (!accumulated_dc_cost[i] ||
1367           !dc_cost_backtrack[i] ||
1368           !dc_candidate[i] ||
1369           !dc_context[i]) {
1370         ERREXIT(cinfo, JERR_OUT_OF_MEMORY);
1371       }
1372     }
1373   }
1374 
1375   norm = 0.0;
1376   for (i = 1; i < DCTSIZE2; i++) {
1377     norm += qtbl->quantval[i] * qtbl->quantval[i];
1378   }
1379   norm /= 63.0;
1380 
1381   if (mode == 1) {
1382     lambda_base = 1.0;
1383     lambda_tbl = lambda_table;
1384     for (i = 0; i < DCTSIZE2; i++)
1385       lambda_table[i] = 1.0 / (qtbl->quantval[i] * qtbl->quantval[i]);
1386   } else
1387     lambda_base = 1.0 / norm;
1388 
1389   for (bi = 0; bi < num_blocks; bi++) {
1390 
1391     norm = 0.0;
1392     for (i = 1; i < DCTSIZE2; i++) {
1393       norm += src[bi][i] * src[bi][i];
1394     }
1395     norm /= 63.0;
1396 
1397     if (cinfo->master->lambda_log_scale2 > 0.0)
1398       lambda = pow(2.0, cinfo->master->lambda_log_scale1) * lambda_base /
1399       (pow(2.0, cinfo->master->lambda_log_scale2) + norm);
1400     else
1401       lambda = pow(2.0, cinfo->master->lambda_log_scale1 - 12.0) * lambda_base;
1402 
1403     lambda_dc = lambda * lambda_tbl[0];
1404 
1405     accumulated_zero_dist[Ss-1] = 0.0;
1406     accumulated_cost[Ss-1] = 0.0;
1407 
1408     /* Do DC coefficient */
1409     if (cinfo->master->trellis_quant_dc) {
1410       int sign = src[bi][0] >> 31;
1411       int x = abs(src[bi][0]);
1412       int q = 8 * qtbl->quantval[0];
1413       int qval;
1414       float dc_candidate_dist;
1415 
1416       qval = (x + q/2) / q; /* quantized value (round nearest) */
1417 
1418       /* loop over candidates in current block */
1419       for (k = 0; k < dc_trellis_candidates; k++) {
1420         int delta;
1421         int dc_delta;
1422         float bits;
1423         int m;
1424         int v2;
1425 
1426         dc_candidate[k][bi] = qval - dc_trellis_candidates/2 + k;
1427         delta = dc_candidate[k][bi] * q - x;
1428         dc_candidate_dist = delta * delta * lambda_dc;
1429         dc_candidate[k][bi] *= 1 + 2*sign;
1430 
1431         /* Take into account DC differences */
1432         if (coef_blocks_above && src_above && cinfo->master->trellis_delta_dc_weight > 0.0) {
1433           int dc_above_orig;
1434           int dc_above_recon;
1435           int dc_orig;
1436           int dc_recon;
1437           float vertical_dist;
1438 
1439           dc_above_orig = src_above[bi][0];
1440           dc_above_recon = coef_blocks_above[bi][0] * q;
1441           dc_orig = src[bi][0];
1442           dc_recon = dc_candidate[k][bi] * q;
1443           /* delta is difference of vertical gradients */
1444           delta = (dc_above_orig - dc_orig) - (dc_above_recon - dc_recon);
1445           vertical_dist = delta * delta * lambda_dc;
1446           dc_candidate_dist +=  cinfo->master->trellis_delta_dc_weight * (vertical_dist - dc_candidate_dist);
1447         }
1448 
1449         /* loop of candidates from previous block */
1450         for (l = 0; l < (bi == 0 ? 1 : dc_trellis_candidates); l++) {
1451           int dc_pred = (bi == 0 ? *last_dc_val : dc_candidate[l][bi-1]);
1452           int updated_dc_context = 0;
1453           int st = (bi == 0) ? 0 : dc_context[l][bi-1];
1454           dc_delta = dc_candidate[k][bi] - dc_pred;
1455 
1456           bits = r->rate_dc[st][dc_delta != 0];
1457 
1458           if (dc_delta != 0) {
1459             bits += r->rate_dc[st+1][dc_delta < 0];
1460             st += 2 + (dc_delta < 0);
1461             updated_dc_context = (dc_delta < 0) ? 8 : 4;
1462 
1463             dc_delta = abs(dc_delta);
1464 
1465             m = 0;
1466             if (dc_delta -= 1) {
1467               bits += r->rate_dc[st][1];
1468               st = 20;
1469               m = 1;
1470               v2 = dc_delta;
1471               while (v2 >>= 1) {
1472                 bits += r->rate_dc[st][1];
1473                 m <<= 1;
1474                 st++;
1475               }
1476             }
1477             bits += r->rate_dc[st][0];
1478 
1479             if (m < (int) ((1L << r->arith_dc_L) >> 1))
1480               updated_dc_context = 0;    /* zero diff category */
1481             else if (m > (int) ((1L << r->arith_dc_U) >> 1))
1482               updated_dc_context += 8;   /* large diff category */
1483 
1484             st += 14;
1485             while (m >>= 1)
1486               bits += r->rate_dc[st][(m & dc_delta) ? 1 : 0];
1487           }
1488 
1489           cost = bits + dc_candidate_dist;
1490           if (bi != 0)
1491             cost += accumulated_dc_cost[l][bi-1];
1492 
1493           if (l == 0 || cost < accumulated_dc_cost[k][bi]) {
1494             accumulated_dc_cost[k][bi] = cost;
1495             dc_cost_backtrack[k][bi] = (bi == 0 ? -1 : l);
1496             dc_context[k][bi] = updated_dc_context;
1497           }
1498         }
1499       }
1500     }
1501 
1502     /* Do AC coefficients */
1503     for (i = Ss; i <= Se; i++) {
1504       int z = jpeg_natural_order[i];
1505 
1506       int sign = src[bi][z] >> 31;
1507       int x = abs(src[bi][z]);
1508       int q = 8 * qtbl->quantval[z];
1509       int candidate[16];
1510       float candidate_dist[16];
1511       int num_candidates;
1512       int qval;
1513       int delta;
1514 
1515       accumulated_zero_dist[i] = x * x * lambda * lambda_tbl[z] + accumulated_zero_dist[i-1];
1516 
1517       qval = (x + q/2) / q; /* quantized value (round nearest) */
1518 
1519       if (qval == 0) {
1520         coef_blocks[bi][z] = 0;
1521         accumulated_cost[i] = 1e38; /* Shouldn't be needed */
1522         continue;
1523       }
1524 
1525       k = 0;
1526       candidate[k] = qval;
1527       delta = candidate[k] * q - x;
1528       candidate_dist[k] = delta * delta * lambda * lambda_tbl[z];
1529       k++;
1530       if (qval > 1) {
1531         candidate[k] = qval - 1;
1532         delta = candidate[k] * q - x;
1533         candidate_dist[k] = delta * delta * lambda * lambda_tbl[z];
1534         k++;
1535       }
1536       num_candidates = k;
1537 
1538       accumulated_cost[i] = 1e38;
1539 
1540       for (j = Ss-1; j < i; j++) {
1541         int zz = jpeg_natural_order[j];
1542         if (j != Ss-1 && coef_blocks[bi][zz] == 0)
1543           continue;
1544 
1545         run_bits = r->rate_ac[3*j][0]; /* EOB */
1546         for (k = j+1; k < i; k++)
1547           run_bits += r->rate_ac[3*(k-1)+1][0];
1548         run_bits += r->rate_ac[3*(i-1)+1][1];
1549 
1550         for (k = 0; k < num_candidates; k++) {
1551           float coef_bits = 1.0; /* sign bit */
1552           int v = candidate[k];
1553           int v2;
1554           int m;
1555           int st;
1556 
1557           st = 3*(i-1)+2;
1558           m = 0;
1559           if (v -= 1) {
1560             coef_bits += r->rate_ac[st][1];
1561             m = 1;
1562             v2 = v;
1563             if (v2 >>= 1) {
1564               coef_bits += r->rate_ac[st][1];
1565               m <<= 1;
1566               st = (i <= r->arith_ac_K) ? 189 : 217;
1567               while (v2 >>= 1) {
1568                 coef_bits += r->rate_ac[st][1];
1569                 m <<= 1;
1570                 st++;
1571               }
1572             }
1573           }
1574           coef_bits += r->rate_ac[st][0];
1575           st += 14;
1576           while (m >>= 1)
1577             coef_bits += r->rate_ac[st][(m & v) ? 1 : 0];
1578 
1579           rate = coef_bits + run_bits;
1580           cost = rate + candidate_dist[k];
1581           cost += accumulated_zero_dist[i-1] - accumulated_zero_dist[j] + accumulated_cost[j];
1582 
1583           if (cost < accumulated_cost[i]) {
1584             coef_blocks[bi][z] = (candidate[k] ^ sign) - sign;
1585             accumulated_cost[i] = cost;
1586             run_start[i] = j;
1587           }
1588         }
1589       }
1590     }
1591 
1592     last_coeff_idx = Ss-1;
1593     best_cost = accumulated_zero_dist[Se] + r->rate_ac[0][1];
1594 
1595     for (i = Ss; i <= Se; i++) {
1596       int z = jpeg_natural_order[i];
1597       if (coef_blocks[bi][z] != 0) {
1598         float cost = accumulated_cost[i] + accumulated_zero_dist[Se] - accumulated_zero_dist[i];
1599 
1600         if (i < Se)
1601           cost += r->rate_ac[3*(i-1)][1];
1602 
1603         if (cost < best_cost) {
1604           best_cost = cost;
1605           last_coeff_idx = i;
1606         }
1607       }
1608     }
1609 
1610     /* Zero out coefficients that are part of runs */
1611     i = Se;
1612     while (i >= Ss)
1613     {
1614       while (i > last_coeff_idx) {
1615         int z = jpeg_natural_order[i];
1616         coef_blocks[bi][z] = 0;
1617         i--;
1618       }
1619       last_coeff_idx = run_start[i];
1620       i--;
1621     }
1622 
1623   }
1624 
1625   if (cinfo->master->trellis_q_opt) {
1626     for (bi = 0; bi < num_blocks; bi++) {
1627       for (i = 1; i < DCTSIZE2; i++) {
1628         norm_src[i] += src[bi][i] * coef_blocks[bi][i];
1629         norm_coef[i] += 8 * coef_blocks[bi][i] * coef_blocks[bi][i];
1630       }
1631     }
1632   }
1633 
1634   if (cinfo->master->trellis_quant_dc) {
1635     j = 0;
1636     for (i = 1; i < dc_trellis_candidates; i++) {
1637       if (accumulated_dc_cost[i][num_blocks-1] < accumulated_dc_cost[j][num_blocks-1])
1638         j = i;
1639     }
1640     for (bi = num_blocks-1; bi >= 0; bi--) {
1641       coef_blocks[bi][0] = dc_candidate[j][bi];
1642       j = dc_cost_backtrack[j][bi];
1643     }
1644 
1645     /* Save DC predictor */
1646     *last_dc_val = coef_blocks[num_blocks-1][0];
1647 
1648     for (i = 0; i < dc_trellis_candidates; i++) {
1649       free(accumulated_dc_cost[i]);
1650       free(dc_cost_backtrack[i]);
1651       free(dc_candidate[i]);
1652       free(dc_context[i]);
1653     }
1654   }
1655 }
1656 #endif
1657 
1658 /*
1659  * Initialize FDCT manager.
1660  */
1661 
1662 GLOBAL(void)
jinit_forward_dct(j_compress_ptr cinfo)1663 jinit_forward_dct (j_compress_ptr cinfo)
1664 {
1665   my_fdct_ptr fdct;
1666   int i;
1667 
1668   fdct = (my_fdct_ptr)
1669     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
1670                                 sizeof(my_fdct_controller));
1671   cinfo->fdct = (struct jpeg_forward_dct *) fdct;
1672   fdct->pub.start_pass = start_pass_fdctmgr;
1673 
1674   /* First determine the DCT... */
1675   switch (cinfo->dct_method) {
1676 #ifdef DCT_ISLOW_SUPPORTED
1677   case JDCT_ISLOW:
1678     fdct->pub.forward_DCT = forward_DCT;
1679     if (jsimd_can_fdct_islow())
1680       fdct->dct = jsimd_fdct_islow;
1681     else
1682       fdct->dct = jpeg_fdct_islow;
1683     break;
1684 #endif
1685 #ifdef DCT_IFAST_SUPPORTED
1686   case JDCT_IFAST:
1687     fdct->pub.forward_DCT = forward_DCT;
1688     if (jsimd_can_fdct_ifast())
1689       fdct->dct = jsimd_fdct_ifast;
1690     else
1691       fdct->dct = jpeg_fdct_ifast;
1692     break;
1693 #endif
1694 #ifdef DCT_FLOAT_SUPPORTED
1695   case JDCT_FLOAT:
1696     fdct->pub.forward_DCT = forward_DCT_float;
1697     if (jsimd_can_fdct_float())
1698       fdct->float_dct = jsimd_fdct_float;
1699     else
1700       fdct->float_dct = jpeg_fdct_float;
1701     break;
1702 #endif
1703   default:
1704     ERREXIT(cinfo, JERR_NOT_COMPILED);
1705     break;
1706   }
1707 
1708   /* ...then the supporting stages. */
1709   switch (cinfo->dct_method) {
1710 #ifdef DCT_ISLOW_SUPPORTED
1711   case JDCT_ISLOW:
1712 #endif
1713 #ifdef DCT_IFAST_SUPPORTED
1714   case JDCT_IFAST:
1715 #endif
1716 #if defined(DCT_ISLOW_SUPPORTED) || defined(DCT_IFAST_SUPPORTED)
1717     if (jsimd_can_convsamp())
1718       fdct->convsamp = jsimd_convsamp;
1719     else
1720       fdct->convsamp = convsamp;
1721 
1722     if (cinfo->master->overshoot_deringing) {
1723       fdct->preprocess = preprocess_deringing;
1724     } else {
1725       fdct->preprocess = NULL;
1726     }
1727 
1728     if (jsimd_can_quantize())
1729       fdct->quantize = jsimd_quantize;
1730     else
1731       fdct->quantize = quantize;
1732     break;
1733 #endif
1734 #ifdef DCT_FLOAT_SUPPORTED
1735   case JDCT_FLOAT:
1736     if (jsimd_can_convsamp_float())
1737       fdct->float_convsamp = jsimd_convsamp_float;
1738     else
1739       fdct->float_convsamp = convsamp_float;
1740 
1741     if (cinfo->master->overshoot_deringing) {
1742       fdct->float_preprocess = float_preprocess_deringing;
1743     } else {
1744       fdct->float_preprocess = NULL;
1745     }
1746 
1747     if (jsimd_can_quantize_float())
1748       fdct->float_quantize = jsimd_quantize_float;
1749     else
1750       fdct->float_quantize = quantize_float;
1751     break;
1752 #endif
1753   default:
1754     ERREXIT(cinfo, JERR_NOT_COMPILED);
1755     break;
1756   }
1757 
1758   /* Allocate workspace memory */
1759 #ifdef DCT_FLOAT_SUPPORTED
1760   if (cinfo->dct_method == JDCT_FLOAT)
1761     fdct->float_workspace = (FAST_FLOAT *)
1762       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
1763                                   sizeof(FAST_FLOAT) * DCTSIZE2);
1764   else
1765 #endif
1766     fdct->workspace = (DCTELEM *)
1767       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
1768                                   sizeof(DCTELEM) * DCTSIZE2);
1769 
1770   /* Mark divisor tables unallocated */
1771   for (i = 0; i < NUM_QUANT_TBLS; i++) {
1772     fdct->divisors[i] = NULL;
1773 #ifdef DCT_FLOAT_SUPPORTED
1774     fdct->float_divisors[i] = NULL;
1775 #endif
1776   }
1777 }
1778