1 /* Copyright  (C) 2010-2018 The RetroArch team
2  *
3  * ---------------------------------------------------------------------------------------
4  * The following license statement only applies to this file (rjpeg.c).
5  * ---------------------------------------------------------------------------------------
6  *
7  * Permission is hereby granted, free of charge,
8  * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation the rights to
10  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11  * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16  * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 /* Modified version of stb_image's JPEG sources. */
24 
25 #include <stdint.h>
26 #include <stdarg.h>
27 #include <stddef.h> /* ptrdiff_t on osx */
28 #include <stdlib.h>
29 #include <string.h>
30 
31 #include <retro_assert.h>
32 #include <retro_inline.h>
33 #include <boolean.h>
34 #include <formats/image.h>
35 #include <formats/rjpeg.h>
36 #include <features/features_cpu.h>
37 
/* Component-count request codes (req_comp): how many channels the
 * caller wants in the decoded output. */
enum
{
   RJPEG_DEFAULT = 0, /* only used for req_comp: keep the file's own channel count */
   RJPEG_GREY,        /* 1 channel */
   RJPEG_GREY_ALPHA,  /* 2 channels */
   RJPEG_RGB,         /* 3 channels */
   RJPEG_RGB_ALPHA    /* 4 channels */
};
46 
/* Scan modes: fully decode pixels, identify the file type only,
 * or parse just the header. */
enum
{
   RJPEG_SCAN_LOAD = 0,
   RJPEG_SCAN_TYPE,
   RJPEG_SCAN_HEADER
};
53 
/* Produces one output row of upsampled chroma from up to two input rows. */
typedef uint8_t *(*rjpeg_resample_row_func)(uint8_t *out, uint8_t *in0, uint8_t *in1,
                                    int w, int hs);

/* Per-component chroma upsampling state. */
typedef struct
{
   rjpeg_resample_row_func resample;
   uint8_t *line0;  /* "near" input scanline */
   uint8_t *line1;  /* "far" input scanline */
   int hs,vs;   /* expansion factor in each axis */
   int w_lores; /* horizontal pixels pre-expansion */
   int ystep;   /* how far through vertical expansion we are */
   int ypos;    /* which pre-expansion row we're on */
} rjpeg__resample;
67 
/* Public decoder handle: holds the caller-supplied file buffer. */
struct rjpeg
{
   uint8_t *buff_data;
};
72 
73 #ifdef _MSC_VER
74 #define RJPEG_HAS_LROTL
75 #endif
76 
77 #ifdef RJPEG_HAS_LROTL
78    #define rjpeg_lrot(x,y)  _lrotl(x,y)
79 #else
80    #define rjpeg_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
81 #endif
82 
83 /* x86/x64 detection */
84 #if defined(__x86_64__) || defined(_M_X64)
85 #define RJPEG__X64_TARGET
86 #elif defined(__i386) || defined(_M_IX86)
87 #define RJPEG__X86_TARGET
88 #endif
89 
90 #if defined(__GNUC__) && (defined(RJPEG__X86_TARGET) || defined(RJPEG__X64_TARGET)) && !defined(__SSE2__) && !defined(RJPEG_NO_SIMD)
91 /* NOTE: not clear do we actually need this for the 64-bit path?
92  * gcc doesn't support sse2 intrinsics unless you compile with -msse2,
93  * (but compiling with -msse2 allows the compiler to use SSE2 everywhere;
94  * this is just broken and gcc are jerks for not fixing it properly
95  * http://www.virtualdub.org/blog/pivot/entry.php?id=363 )
96  */
97 #define RJPEG_NO_SIMD
98 #endif
99 
100 #if defined(__MINGW32__) && defined(RJPEG__X86_TARGET) && !defined(RJPEG_MINGW_ENABLE_SSE2) && !defined(RJPEG_NO_SIMD)
101 /* Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid RJPEG__X64_TARGET
102  *
103  * 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
104  * Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
105  * As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
106  * simultaneously enabling "-mstackrealign".
107  *
108  * See https://github.com/nothings/stb/issues/81 for more information.
109  *
110  * So default to no SSE2 on 32-bit MinGW. If you've read this far and added
111  * -mstackrealign to your build settings, feel free to #define RJPEG_MINGW_ENABLE_SSE2.
112  */
113 #define RJPEG_NO_SIMD
114 #endif
115 
116 #if defined(__SSE2__)
117 #include <emmintrin.h>
118 
119 #ifdef _MSC_VER
120 #define RJPEG_SIMD_ALIGN(type, name) __declspec(align(16)) type name
121 #else
122 #define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
123 #endif
124 
125 #endif
126 
127 /* ARM NEON */
128 #if defined(RJPEG_NO_SIMD) && defined(RJPEG_NEON)
129 #undef RJPEG_NEON
130 #endif
131 
132 #ifdef RJPEG_NEON
133 #include <arm_neon.h>
134 /* assume GCC or Clang on ARM targets */
135 #define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
136 #endif
137 
138 #ifndef RJPEG_SIMD_ALIGN
139 #define RJPEG_SIMD_ALIGN(type, name) type name
140 #endif
141 
/* Input context: the in-memory JPEG byte stream being decoded. */
typedef struct
{
   uint32_t img_x;               /* image width in pixels */
   uint32_t img_y;               /* image height in pixels */
   int      img_n;               /* number of components in the file */
   int      img_out_n;           /* number of components in the output */

   int      buflen;
   uint8_t  buffer_start[128];

   uint8_t *img_buffer;          /* current read position */
   uint8_t *img_buffer_end;      /* one past the last valid input byte */
   uint8_t *img_buffer_original; /* original start of the input buffer */
} rjpeg__context;
156 
rjpeg__get8(rjpeg__context * s)157 static INLINE uint8_t rjpeg__get8(rjpeg__context *s)
158 {
159    if (s->img_buffer < s->img_buffer_end)
160       return *s->img_buffer++;
161 
162    return 0;
163 }
164 
165 #define RJPEG__AT_EOF(s)    ((s)->img_buffer >= (s)->img_buffer_end)
166 
167 #define RJPEG__GET16BE(s)   ((rjpeg__get8((s)) << 8) + rjpeg__get8((s)))
168 
169 #define RJPEG__BYTECAST(x)  ((uint8_t) ((x) & 255))  /* truncate int to byte without warnings */
170 
171 /* huffman decoding acceleration */
172 #define FAST_BITS   9  /* larger handles more cases; smaller stomps less cache */
173 
/* Huffman decode table with a FAST_BITS-wide acceleration lookup. */
typedef struct
{
   uint8_t  fast[1 << FAST_BITS]; /* symbol index for short codes; 255 = not accelerated */
   /* weirdly, repacking this into AoS is a 10% speed loss, instead of a win */
   uint16_t code[256];            /* canonical code for each symbol */
   uint8_t  values[256];          /* symbol value for each code */
   uint8_t  size[257];            /* code length in bits per symbol, 0-terminated */
   unsigned int maxcode[18];      /* (largest code + 1) per length, preshifted to 16 bits */
   int    delta[17];   /* old 'firstsymbol' - old 'firstcode' */
} rjpeg__huffman;
184 
/* Complete decoder state for one JPEG image. */
typedef struct
{
   rjpeg__context *s;            /* input byte stream */
   rjpeg__huffman huff_dc[4];    /* DC Huffman tables */
   rjpeg__huffman huff_ac[4];    /* AC Huffman tables */
   uint8_t dequant[4][64];       /* dequantization tables */
   int16_t fast_ac[4][1 << FAST_BITS]; /* combined run/magnitude fast-AC tables */

   /* sizes for components, interleaved MCUs */
   int img_h_max, img_v_max;
   int img_mcu_x, img_mcu_y;
   int img_mcu_w, img_mcu_h;

   /* definition of jpeg image component */
   struct
   {
      int id;
      int h,v;     /* sampling factors */
      int tq;      /* quantization table index */
      int hd,ha;   /* DC / AC Huffman table indices */
      int dc_pred; /* DC predictor (previous block's DC value) */

      int x,y,w2,h2;
      uint8_t *data;
      void *raw_data, *raw_coeff;
      uint8_t *linebuf;
      short   *coeff;            /* progressive only */
      int      coeff_w;          /* number of 8x8 coefficient blocks */
      int      coeff_h;          /* number of 8x8 coefficient blocks */
   } img_comp[4];

   uint32_t       code_buffer;   /* jpeg entropy-coded buffer */
   int            code_bits;     /* number of valid bits */
   unsigned char  marker;        /* marker seen while filling entropy buffer */
   int            nomore;        /* flag if we saw a marker so must stop */

   int            progressive;   /* non-zero for progressive (SOF2) streams */
   int            spec_start;    /* spectral selection range of current scan */
   int            spec_end;
   int            succ_high;     /* successive-approximation bit positions */
   int            succ_low;
   int            eob_run;       /* pending end-of-band run (progressive AC) */

   int scan_n, order[4];         /* component count and order for current scan */
   int restart_interval, todo;   /* MCUs between restarts / MCUs left before next */

   /* kernels */
   void (*idct_block_kernel)(uint8_t *out, int out_stride, short data[64]);
   void (*YCbCr_to_RGB_kernel)(uint8_t *out, const uint8_t *y, const uint8_t *pcb,
         const uint8_t *pcr, int count, int step);
   uint8_t *(*resample_row_hv_2_kernel)(uint8_t *out, uint8_t *in_near,
         uint8_t *in_far, int w, int hs);
} rjpeg__jpeg;
238 
239 #define rjpeg__f2f(x)  ((int) (((x) * 4096 + 0.5)))
240 #define rjpeg__fsh(x)  ((x) << 12)
241 
242 #define RJPEG__MARKER_NONE  0xff
243 /* if there's a pending marker from the entropy stream, return that
244  * otherwise, fetch from the stream and get a marker. if there's no
245  * marker, return 0xff, which is never a valid marker value
246  */
247 
248 /* in each scan, we'll have scan_n components, and the order
249  * of the components is specified by order[]
250  */
251 #define RJPEG__RESTART(x)     ((x) >= 0xd0 && (x) <= 0xd7)
252 
253 #define JPEG_MARKER           0xFF
254 #define JPEG_MARKER_SOI       0xD8
255 #define JPEG_MARKER_SOS       0xDA
256 #define JPEG_MARKER_EOI       0xD9
257 #define JPEG_MARKER_APP1      0xE1
258 #define JPEG_MARKER_APP2      0xE2
259 
260 /* use comparisons since in some cases we handle more than one case (e.g. SOF) */
261 #define rjpeg__SOF(x)         ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
262 
263 #define rjpeg__SOF_progressive(x)   ((x) == 0xc2)
264 #define rjpeg__div4(x)              ((uint8_t) ((x) >> 2))
265 #define rjpeg__div16(x)             ((uint8_t) ((x) >> 4))
266 
/* Build canonical Huffman decode tables from the 16 per-length symbol
 * counts of a DHT segment, per the JPEG spec algorithm.
 * Returns 1 on success, 0 if the code lengths are inconsistent. */
static int rjpeg__build_huffman(rjpeg__huffman *h, int *count)
{
   int i,j,k=0,code;

   /* build size list for each symbol (from JPEG spec) */
   for (i=0; i < 16; ++i)
      for (j=0; j < count[i]; ++j)
         h->size[k++] = (uint8_t) (i+1);

   h->size[k] = 0;
   /* compute actual symbols (from jpeg spec) */
   code       = 0;
   k          = 0;

   for(j=1; j <= 16; ++j)
   {
      /* compute delta to add to code to compute symbol id */
      h->delta[j] = k - code;
      if (h->size[k] == j)
      {
         while (h->size[k] == j)
            h->code[k++] = (uint16_t) (code++);

         /* Bad code lengths, corrupt JPEG? */
         if (code-1 >= (1 << j))
            return 0;
      }
      /* compute largest code + 1 for this size, preshifted as needed later */
      h->maxcode[j] = code << (16-j);
      code <<= 1;
   }
   h->maxcode[j] = 0xffffffff; /* sentinel: every 16-bit prefix compares below this */

   /* build non-spec acceleration table; 255 is flag for not-accelerated */
   memset(h->fast, 255, 1 << FAST_BITS);
   for (i=0; i < k; ++i)
   {
      int s = h->size[i];
      if (s <= FAST_BITS)
      {
         /* fill every fast-table slot whose leading bits match this code */
         int c = h->code[i] << (FAST_BITS-s);
         int m = 1 << (FAST_BITS-s);
         for (j=0; j < m; ++j)
            h->fast[c+j] = (uint8_t) i;
      }
   }
   return 1;
}
315 
316 /* build a table that decodes both magnitude and value of small ACs in
317  * one go. */
/* Build a table that decodes both magnitude and value of small ACs in
 * one go: entry packs (value << 8) | (run << 4) | total bit length,
 * or 0 when the combination doesn't fit in FAST_BITS. */
static void rjpeg__build_fast_ac(int16_t *fast_ac, rjpeg__huffman *h)
{
   int i;

   for (i=0; i < (1 << FAST_BITS); ++i)
   {
      uint8_t fast = h->fast[i];

      fast_ac[i] = 0;

      if (fast < 255)
      {
         int rs      = h->values[fast];
         int run     = (rs >> 4) & 15; /* leading zero run */
         int magbits = rs & 15;        /* magnitude category */
         int len     = h->size[fast];  /* Huffman code length */

         if (magbits && len + magbits <= FAST_BITS)
         {
            /* magnitude code followed by receive_extend code */
            int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
            int m = 1 << (magbits - 1);
            /* JPEG 'extend': values below the midpoint are negative.
             * Was k += (-1 << magbits) + 1, which left-shifts a negative
             * value (undefined behavior); this form is equivalent and
             * well-defined: (-1 << n) + 1 == -((1 << n) - 1). */
            if (k < m)
               k -= (1 << magbits) - 1;

            /* if the result is small enough, we can fit it in fast_ac table */
            if (k >= -128 && k <= 127)
               fast_ac[i] = (int16_t) ((k << 8) + (run << 4) + (len + magbits));
         }
      }
   }
}
350 
/* Refill the entropy bit buffer to at least 25 valid bits.
 * A 0xFF byte followed by a non-zero byte is a marker: record it and
 * set 'nomore' so subsequent refills feed zero bits. A 0xFF followed
 * by 0x00 is a stuffed data byte and passes through as 0xFF. */
static void rjpeg__grow_buffer_unsafe(rjpeg__jpeg *j)
{
   do
   {
      int b = j->nomore ? 0 : rjpeg__get8(j->s);
      if (b == 0xff)
      {
         int c = rjpeg__get8(j->s);

         if (c != 0)
         {
            j->marker = (unsigned char) c;
            j->nomore = 1;
            return;
         }
      }
      j->code_buffer |= b << (24 - j->code_bits);
      j->code_bits   += 8;
   } while (j->code_bits <= 24);
}
371 
/* rjpeg__bmask[n] == (1 << n) - 1: mask selecting the n low bits */
static uint32_t rjpeg__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
374 
375 /* decode a JPEG huffman value from the bitstream */
static INLINE int rjpeg__jpeg_huff_decode(rjpeg__jpeg *j, rjpeg__huffman *h)
{
   unsigned int temp;
   int c,k;

   if (j->code_bits < 16)
      rjpeg__grow_buffer_unsafe(j);

   /* look at the top FAST_BITS and determine what symbol ID it is,
    * if the code is <= FAST_BITS */
   c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
   k = h->fast[c];

   if (k < 255)
   {
      /* accelerated path: symbol index found directly in the fast table */
      int s = h->size[k];
      if (s > j->code_bits)
         return -1;
      j->code_buffer <<= s;
      j->code_bits -= s;
      return h->values[k];
   }

   /* naive test is to shift the code_buffer down so k bits are
    * valid, then test against maxcode. To speed this up, we've
    * preshifted maxcode left so that it has (16-k) 0s at the
    * end; in other words, regardless of the number of bits, it
    * wants to be compared against something shifted to have 16;
    * that way we don't need to shift inside the loop. */
   temp = j->code_buffer >> 16;
   for (k=FAST_BITS+1 ; ; ++k)
      if (temp < h->maxcode[k])
         break;

   if (k == 17)
   {
      /* error! code not found */
      j->code_bits -= 16;
      return -1;
   }

   if (k > j->code_bits)
      return -1;

   /* convert the huffman code to the symbol id */
   c = ((j->code_buffer >> (32 - k)) & rjpeg__bmask[k]) + h->delta[k];
   retro_assert((((j->code_buffer) >> (32 - h->size[c])) & rjpeg__bmask[h->size[c]]) == h->code[c]);

   /* convert the id to a symbol */
   j->code_bits -= k;
   j->code_buffer <<= k;
   return h->values[c];
}
429 
/* bias[n] = (-1<<n) + 1: sign-extension bias used by rjpeg__extend_receive */
static int const rjpeg__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
432 
433 /* combined JPEG 'receive' and JPEG 'extend', since baseline
434  * always extends everything it receives. */
static INLINE int rjpeg__extend_receive(rjpeg__jpeg *j, int n)
{
   unsigned int k;
   int sgn;
   if (j->code_bits < n)
      rjpeg__grow_buffer_unsafe(j);

   /* arithmetic shift yields 0 if positive, all-ones if negative
    * (two's-complement targets) */
   sgn = (int32_t)j->code_buffer >> 31; /* sign bit is always in MSB */
   k = rjpeg_lrot(j->code_buffer, n);
   retro_assert(n >= 0 && n < (int) (sizeof(rjpeg__bmask)/sizeof(*rjpeg__bmask)));
   j->code_buffer = k & ~rjpeg__bmask[n];
   k &= rjpeg__bmask[n];
   j->code_bits -= n;
   /* apply the extend bias only when the sign bit was clear */
   return k + (rjpeg__jbias[n] & ~sgn);
}
450 
451 /* get some unsigned bits */
rjpeg__jpeg_get_bits(rjpeg__jpeg * j,int n)452 static INLINE int rjpeg__jpeg_get_bits(rjpeg__jpeg *j, int n)
453 {
454    unsigned int k;
455    if (j->code_bits < n)
456       rjpeg__grow_buffer_unsafe(j);
457    k = rjpeg_lrot(j->code_buffer, n);
458    j->code_buffer = k & ~rjpeg__bmask[n];
459    k &= rjpeg__bmask[n];
460    j->code_bits -= n;
461    return k;
462 }
463 
rjpeg__jpeg_get_bit(rjpeg__jpeg * j)464 static INLINE int rjpeg__jpeg_get_bit(rjpeg__jpeg *j)
465 {
466    unsigned int k;
467    if (j->code_bits < 1)
468       rjpeg__grow_buffer_unsafe(j);
469 
470    k = j->code_buffer;
471    j->code_buffer <<= 1;
472    --j->code_bits;
473    return k & 0x80000000;
474 }
475 
476 /* given a value that's at position X in the zigzag stream,
477  * where does it appear in the 8x8 matrix coded as row-major? */
static uint8_t rjpeg__jpeg_dezigzag[64+15] =
{
    0,  1,  8, 16,  9,  2,  3, 10,
   17, 24, 32, 25, 18, 11,  4,  5,
   12, 19, 26, 33, 40, 48, 41, 34,
   27, 20, 13,  6,  7, 14, 21, 28,
   35, 42, 49, 56, 57, 50, 43, 36,
   29, 22, 15, 23, 30, 37, 44, 51,
   58, 59, 52, 45, 38, 31, 39, 46,
   53, 60, 61, 54, 47, 55, 62, 63,
   /* let corrupt input sample past end: 15 extra entries all map to
    * coefficient 63 so out-of-range runs stay in bounds */
   63, 63, 63, 63, 63, 63, 63, 63,
   63, 63, 63, 63, 63, 63, 63
};
492 
493 /* decode one 64-entry block-- */
/* Decode one baseline 8x8 block into 'data' (dequantized, dezigzagged).
 * Returns 1 on success, 0 on a corrupt bitstream. */
static int rjpeg__jpeg_decode_block(
      rjpeg__jpeg *j, short data[64],
      rjpeg__huffman *hdc,
      rjpeg__huffman *hac,
      int16_t *fac,
      int b,
      uint8_t *dequant)
{
   int diff,dc,k;
   int t;

   if (j->code_bits < 16)
      rjpeg__grow_buffer_unsafe(j);
   t = rjpeg__jpeg_huff_decode(j, hdc);

   /* Bad huffman code. Corrupt JPEG? */
   if (t < 0)
      return 0;

   /* 0 all the ac values now so we can do it 32-bits at a time */
   memset(data,0,64*sizeof(data[0]));

   /* DC is coded as a difference from the previous block's DC */
   diff = t ? rjpeg__extend_receive(j, t) : 0;
   dc = j->img_comp[b].dc_pred + diff;
   j->img_comp[b].dc_pred = dc;
   data[0] = (short) (dc * dequant[0]);

   /* decode AC components, see JPEG spec */
   k = 1;
   do
   {
      unsigned int zig;
      int c,r,s;
      if (j->code_bits < 16)
         rjpeg__grow_buffer_unsafe(j);
      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
      r = fac[c];
      if (r)
      {
         /* fast-AC path: run, length and value come from one table hit */
         k += (r >> 4) & 15; /* run */
         s = r & 15;         /* combined length */
         j->code_buffer <<= s;
         j->code_bits -= s;
         /* decode into unzigzag'd location */
         zig = rjpeg__jpeg_dezigzag[k++];
         data[zig] = (short) ((r >> 8) * dequant[zig]);
      }
      else
      {
         int rs = rjpeg__jpeg_huff_decode(j, hac);

         /* Bad huffman code. Corrupt JPEG? */
         if (rs < 0)
            return 0;

         s = rs & 15;
         r = rs >> 4;
         if (s == 0)
         {
            if (rs != 0xf0)
               break; /* end block */
            k += 16;  /* ZRL: run of 16 zero coefficients */
         }
         else
         {
            k += r; /* skip r zero coefficients */
            /* decode into unzigzag'd location */
            zig = rjpeg__jpeg_dezigzag[k++];
            data[zig] = (short) (rjpeg__extend_receive(j,s) * dequant[zig]);
         }
      }
   } while (k < 64);
   return 1;
}
569 
/* Decode the DC coefficient of one block in a progressive scan.
 * First scan (succ_high == 0) reads the DC difference; refinement
 * scans add one successive-approximation bit.
 * Returns 1 on success, 0 on a corrupt bitstream. */
static int rjpeg__jpeg_decode_block_prog_dc(
      rjpeg__jpeg *j,
      short data[64],
      rjpeg__huffman *hdc,
      int b)
{
   /* Can't merge DC and AC. Corrupt JPEG? */
   if (j->spec_end != 0)
      return 0;

   if (j->code_bits < 16)
      rjpeg__grow_buffer_unsafe(j);

   if (j->succ_high == 0)
   {
      int t;
      int diff,dc;

      /* first scan for DC coefficient, must be first */
      memset(data,0,64*sizeof(data[0])); /* 0 all the ac values now */
      t = rjpeg__jpeg_huff_decode(j, hdc);

      /* Bad huffman code or invalid DC magnitude category. Corrupt JPEG?
       * Without this check a corrupt stream sends t = -1 (or t > 15)
       * into rjpeg__extend_receive, indexing rjpeg__bmask out of bounds. */
      if (t < 0 || t > 15)
         return 0;

      diff = t ? rjpeg__extend_receive(j, t) : 0;

      dc = j->img_comp[b].dc_pred + diff;
      j->img_comp[b].dc_pred = dc;
      data[0] = (short) (dc << j->succ_low);
   }
   else
   {
      /* refinement scan for DC coefficient */
      if (rjpeg__jpeg_get_bit(j))
         data[0] += (short) (1 << j->succ_low);
   }
   return 1;
}
605 
/* Decode the AC coefficients of one block in a progressive scan.
 * First scan (succ_high == 0) writes new coefficients; refinement
 * scans add one successive-approximation bit to already-nonzero
 * coefficients and may create new ones. eob_run counts blocks whose
 * remaining band is entirely zero.
 * Returns 1 on success, 0 on a corrupt bitstream. */
static int rjpeg__jpeg_decode_block_prog_ac(
      rjpeg__jpeg *j,
      short data[64],
      rjpeg__huffman *hac,
      int16_t *fac)
{
   int k;

   /* Can't merge DC and AC. Corrupt JPEG? */
   if (j->spec_start == 0)
      return 0;

   if (j->succ_high == 0)
   {
      int shift = j->succ_low;

      /* a pending end-of-band run covers this whole block */
      if (j->eob_run)
      {
         --j->eob_run;
         return 1;
      }

      k = j->spec_start;
      do
      {
         unsigned int zig;
         int c,r,s;
         if (j->code_bits < 16) rjpeg__grow_buffer_unsafe(j);
         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
         r = fac[c];
         if (r)
         { /* fast-AC path */
            k += (r >> 4) & 15; /* run */
            s = r & 15;         /* combined length */
            j->code_buffer <<= s;
            j->code_bits -= s;
            zig = rjpeg__jpeg_dezigzag[k++];
            data[zig] = (short) ((r >> 8) << shift);
         }
         else
         {
            int rs = rjpeg__jpeg_huff_decode(j, hac);

            /* Bad huffman code. Corrupt JPEG? */
            if (rs < 0)
               return 0;

            s = rs & 15;
            r = rs >> 4;
            if (s == 0)
            {
               if (r < 15)
               {
                  /* EOBn: end-of-band run of (1 << r) + extra blocks */
                  j->eob_run = (1 << r);
                  if (r)
                     j->eob_run += rjpeg__jpeg_get_bits(j, r);
                  --j->eob_run;
                  break;
               }
               k += 16; /* ZRL: run of 16 zero coefficients */
            }
            else
            {
               k += r;
               zig = rjpeg__jpeg_dezigzag[k++];
               data[zig] = (short) (rjpeg__extend_receive(j,s) << shift);
            }
         }
      } while (k <= j->spec_end);
   }
   else
   {
      /* refinement scan for these AC coefficients */

      short bit = (short) (1 << j->succ_low);

      if (j->eob_run)
      {
         /* inside an end-of-band run: only refine existing coefficients */
         --j->eob_run;
         for (k = j->spec_start; k <= j->spec_end; ++k)
         {
            short *p = &data[rjpeg__jpeg_dezigzag[k]];
            if (*p != 0)
               if (rjpeg__jpeg_get_bit(j))
                  if ((*p & bit)==0)
                  {
                     /* move magnitude away from zero by one refinement bit */
                     if (*p > 0)
                        *p += bit;
                     else
                        *p -= bit;
                  }
         }
      }
      else
      {
         k = j->spec_start;
         do
         {
            int r,s;
            int rs = rjpeg__jpeg_huff_decode(j, hac);

            /* Bad huffman code. Corrupt JPEG? */
            if (rs < 0)
               return 0;

            s = rs & 15;
            r = rs >> 4;
            if (s == 0)
            {
               if (r < 15)
               {
                  /* EOBn: start an end-of-band run after this block */
                  j->eob_run = (1 << r) - 1;
                  if (r)
                     j->eob_run += rjpeg__jpeg_get_bits(j, r);
                  r = 64; /* force end of block */
               }
               else
               {
                  /* r=15 s=0 should write 16 0s, so we just do
                   * a run of 15 0s and then write s (which is 0),
                   * so we don't have to do anything special here */
               }
            }
            else
            {
               /* Bad huffman code. Corrupt JPEG? */
               if (s != 1)
                  return 0;

               /* sign bit */
               if (rjpeg__jpeg_get_bit(j))
                  s = bit;
               else
                  s = -bit;
            }

            /* advance by r, refining nonzero coefficients along the way */
            while (k <= j->spec_end)
            {
               short *p = &data[rjpeg__jpeg_dezigzag[k++]];
               if (*p != 0)
               {
                  if (rjpeg__jpeg_get_bit(j))
                     if ((*p & bit)==0)
                     {
                        if (*p > 0)
                           *p += bit;
                        else
                           *p -= bit;
                     }
               }
               else
               {
                  if (r == 0)
                  {
                     /* run exhausted: place the newly nonzero coefficient */
                     *p = (short) s;
                     break;
                  }
                  --r;
               }
            }
         } while (k <= j->spec_end);
      }
   }
   return 1;
}
772 
773 /* take a -128..127 value and rjpeg__clamp it and convert to 0..255 */
rjpeg__clamp(int x)774 static INLINE uint8_t rjpeg__clamp(int x)
775 {
776    /* trick to use a single test to catch both cases */
777    if ((unsigned int) x > 255)
778       return 255;
779    return (uint8_t) x;
780 }
781 
782 /* derived from jidctint -- DCT_ISLOW */
/* One 8-point 1D IDCT pass in fixed point (constants scaled by 1<<12
 * via rjpeg__f2f). Declares locals t0..t3, p1..p5, x0..x3 in the
 * enclosing scope; results are left in x0..x3 (even half) and
 * t0..t3 (odd half) for the caller to combine. */
#define RJPEG__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
   p2 = s2;                                    \
   p3 = s6;                                    \
   p1 = (p2+p3) * rjpeg__f2f(0.5411961f);       \
   t2 = p1 + p3*rjpeg__f2f(-1.847759065f);      \
   t3 = p1 + p2*rjpeg__f2f( 0.765366865f);      \
   p2 = s0;                                    \
   p3 = s4;                                    \
   t0 = rjpeg__fsh(p2+p3);                      \
   t1 = rjpeg__fsh(p2-p3);                      \
   x0 = t0+t3;                                 \
   x3 = t0-t3;                                 \
   x1 = t1+t2;                                 \
   x2 = t1-t2;                                 \
   t0 = s7;                                    \
   t1 = s5;                                    \
   t2 = s3;                                    \
   t3 = s1;                                    \
   p3 = t0+t2;                                 \
   p4 = t1+t3;                                 \
   p1 = t0+t3;                                 \
   p2 = t1+t2;                                 \
   p5 = (p3+p4)*rjpeg__f2f( 1.175875602f);      \
   t0 = t0*rjpeg__f2f( 0.298631336f);           \
   t1 = t1*rjpeg__f2f( 2.053119869f);           \
   t2 = t2*rjpeg__f2f( 3.072711026f);           \
   t3 = t3*rjpeg__f2f( 1.501321110f);           \
   p1 = p5 + p1*rjpeg__f2f(-0.899976223f);      \
   p2 = p5 + p2*rjpeg__f2f(-2.562915447f);      \
   p3 = p3*rjpeg__f2f(-1.961570560f);           \
   p4 = p4*rjpeg__f2f(-0.390180644f);           \
   t3 += p1+p4;                                \
   t2 += p2+p3;                                \
   t1 += p2+p4;                                \
   t0 += p1+p3;
819 
/* Generic (non-SIMD) 2D 8x8 inverse DCT: one 1D pass down the
 * columns into 'val', then one pass across the rows producing
 * clamped 0..255 output pixels at 'out'. */
static void rjpeg__idct_block(uint8_t *out, int out_stride, short data[64])
{
   int i,val[64],*v=val;
   uint8_t   *o = NULL;
   int16_t   *d = data;

   /* columns */
   for (i=0; i < 8; ++i,++d, ++v)
   {
      /* if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing */
      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
           && d[40]==0 && d[48]==0 && d[56]==0)
      {
         /*    no shortcut                 0     seconds
          *    (1|2|3|4|5|6|7)==0          0     seconds
          *    all separate               -0.047 seconds
          *    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds */
         int dcterm = d[0] << 2;
         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
      }
      else
      {
         RJPEG__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
         /* constants scaled things up by 1<<12; let's bring them back
          * down, but keep 2 extra bits of precision */
         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
         v[ 0] = (x0+t3) >> 10;
         v[56] = (x0-t3) >> 10;
         v[ 8] = (x1+t2) >> 10;
         v[48] = (x1-t2) >> 10;
         v[16] = (x2+t1) >> 10;
         v[40] = (x2-t1) >> 10;
         v[24] = (x3+t0) >> 10;
         v[32] = (x3-t0) >> 10;
      }
   }

   /* rows */
   for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride)
   {
      /* no fast case since the first 1D IDCT spread components out */
      RJPEG__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
         /* constants scaled things up by 1<<12, plus we had 1<<2 from first
          * loop, plus horizontal and vertical each scale by sqrt(8) so together
          * we've got an extra 1<<3, so 1<<17 total we need to remove.
          * so we want to round that, which means adding 0.5 * 1<<17,
          * aka 65536. Also, we'll end up with -128 to 127 that we want
          * to encode as 0..255 by adding 128, so we'll add that before the shift */
         x0 += 65536 + (128<<17);
      x1 += 65536 + (128<<17);
      x2 += 65536 + (128<<17);
      x3 += 65536 + (128<<17);
      /* tried computing the shifts into temps, or'ing the temps to see
       * if any were out of range, but that was slower */
      o[0] = rjpeg__clamp((x0+t3) >> 17);
      o[7] = rjpeg__clamp((x0-t3) >> 17);
      o[1] = rjpeg__clamp((x1+t2) >> 17);
      o[6] = rjpeg__clamp((x1-t2) >> 17);
      o[2] = rjpeg__clamp((x2+t1) >> 17);
      o[5] = rjpeg__clamp((x2-t1) >> 17);
      o[3] = rjpeg__clamp((x3+t0) >> 17);
      o[4] = rjpeg__clamp((x3-t0) >> 17);
   }
}
883 
#if defined(__SSE2__)
/* sse2 integer IDCT. not the fastest possible implementation but it
 * produces bit-identical results to the generic C version so it's
 * fully "transparent".
 *
 * out        : top-left byte of the 8x8 destination block; 8 bytes are
 *              written per scanline.
 * out_stride : byte distance between destination scanlines.
 * data       : 64 dequantized coefficients, 16-byte aligned (loaded
 *              with _mm_load_si128).
 */
static void rjpeg__idct_simd(uint8_t *out, int out_stride, short data[64])
{
   /* This is constructed to match our regular (generic) integer IDCT exactly. */
   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
   __m128i tmp;

   /* dot product constant: even elems=x, odd elems=y */
   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))

   /* out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
    * out(1) = c1[even]*x + c1[odd]*y
    * _mm_madd_epi16 multiplies 16-bit lane pairs and sums adjacent
    * products, computing both 2-term dot products in one step.
    */
   #define dct_rot(out0,out1, x,y,c0,c1) \
      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)

   /* out = in << 12  (in 16-bit, out 32-bit)
    * interleaving "in" above zeros places it in the high 16 bits of each
    * 32-bit lane (an effective << 16); the arithmetic >> 4 leaves << 12. */
   #define dct_widen(out, in) \
      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)

   /* wide add */
   #define dct_wadd(out, a, b) \
      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)

   /* wide sub */
   #define dct_wsub(out, a, b) \
      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)

   /* butterfly a/b, add bias, then shift by "s" and pack */
   #define dct_bfly32o(out0, out1, a,b,bias,s) \
      { \
         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
         dct_wadd(sum, abiased, b); \
         dct_wsub(dif, abiased, b); \
         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
      }

   /* 8-bit interleave step (for transposes) */
   #define dct_interleave8(a, b) \
      tmp = a; \
      a = _mm_unpacklo_epi8(a, b); \
      b = _mm_unpackhi_epi8(tmp, b)

   /* 16-bit interleave step (for transposes) */
   #define dct_interleave16(a, b) \
      tmp = a; \
      a = _mm_unpacklo_epi16(a, b); \
      b = _mm_unpackhi_epi16(tmp, b)

   /* one full 1-D IDCT pass over all eight rows; "bias" and "shift"
    * select column-pass vs row-pass rounding (see bias_0/bias_1 below). */
   #define dct_pass(bias,shift) \
      { \
         /* even part */ \
         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
         __m128i sum04 = _mm_add_epi16(row0, row4); \
         __m128i dif04 = _mm_sub_epi16(row0, row4); \
         dct_widen(t0e, sum04); \
         dct_widen(t1e, dif04); \
         dct_wadd(x0, t0e, t3e); \
         dct_wsub(x3, t0e, t3e); \
         dct_wadd(x1, t1e, t2e); \
         dct_wsub(x2, t1e, t2e); \
         /* odd part */ \
         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
         __m128i sum17 = _mm_add_epi16(row1, row7); \
         __m128i sum35 = _mm_add_epi16(row3, row5); \
         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
         dct_wadd(x4, y0o, y4o); \
         dct_wadd(x5, y1o, y5o); \
         dct_wadd(x6, y2o, y5o); \
         dct_wadd(x7, y3o, y4o); \
         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
      }

   /* rotation constants; each dct_const packs the two fixed-point
    * coefficients of one 2-term dot product (even/odd lanes). */
   __m128i rot0_0 = dct_const(rjpeg__f2f(0.5411961f), rjpeg__f2f(0.5411961f) + rjpeg__f2f(-1.847759065f));
   __m128i rot0_1 = dct_const(rjpeg__f2f(0.5411961f) + rjpeg__f2f( 0.765366865f), rjpeg__f2f(0.5411961f));
   __m128i rot1_0 = dct_const(rjpeg__f2f(1.175875602f) + rjpeg__f2f(-0.899976223f), rjpeg__f2f(1.175875602f));
   __m128i rot1_1 = dct_const(rjpeg__f2f(1.175875602f), rjpeg__f2f(1.175875602f) + rjpeg__f2f(-2.562915447f));
   __m128i rot2_0 = dct_const(rjpeg__f2f(-1.961570560f) + rjpeg__f2f( 0.298631336f), rjpeg__f2f(-1.961570560f));
   __m128i rot2_1 = dct_const(rjpeg__f2f(-1.961570560f), rjpeg__f2f(-1.961570560f) + rjpeg__f2f( 3.072711026f));
   __m128i rot3_0 = dct_const(rjpeg__f2f(-0.390180644f) + rjpeg__f2f( 2.053119869f), rjpeg__f2f(-0.390180644f));
   __m128i rot3_1 = dct_const(rjpeg__f2f(-0.390180644f), rjpeg__f2f(-0.390180644f) + rjpeg__f2f( 1.501321110f));

   /* rounding biases in column/row passes, see rjpeg__idct_block for explanation.
    * bias_1's (128<<17) term, combined with the >> 17 in the row pass,
    * folds the +128 pixel offset into the butterfly. */
   __m128i bias_0 = _mm_set1_epi32(512);
   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));

   /* load */
   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
   row7 = _mm_load_si128((const __m128i *) (data + 7*8));

   /* column pass */
   dct_pass(bias_0, 10);

   {
      /* 16bit 8x8 transpose pass 1 */
      dct_interleave16(row0, row4);
      dct_interleave16(row1, row5);
      dct_interleave16(row2, row6);
      dct_interleave16(row3, row7);

      /* transpose pass 2 */
      dct_interleave16(row0, row2);
      dct_interleave16(row1, row3);
      dct_interleave16(row4, row6);
      dct_interleave16(row5, row7);

      /* transpose pass 3 */
      dct_interleave16(row0, row1);
      dct_interleave16(row2, row3);
      dct_interleave16(row4, row5);
      dct_interleave16(row6, row7);
   }

   /* row pass */
   dct_pass(bias_1, 17);

   {
      /* pack to unsigned bytes with saturation */
      __m128i p0 = _mm_packus_epi16(row0, row1); /* a0a1a2a3...a7b0b1b2b3...b7 */
      __m128i p1 = _mm_packus_epi16(row2, row3);
      __m128i p2 = _mm_packus_epi16(row4, row5);
      __m128i p3 = _mm_packus_epi16(row6, row7);

      /* 8bit 8x8 transpose pass 1 */
      dct_interleave8(p0, p2); /* a0e0a1e1... */
      dct_interleave8(p1, p3); /* c0g0c1g1... */

      /* transpose pass 2 */
      dct_interleave8(p0, p1); /* a0c0e0g0... */
      dct_interleave8(p2, p3); /* b0d0f0h0... */

      /* transpose pass 3 */
      dct_interleave8(p0, p2); /* a0b0c0d0... */
      dct_interleave8(p1, p3); /* a4b4c4d4... */

      /* store: each register holds two output scanlines (low/high 8 bytes);
       * the 0x4e shuffle swaps the 64-bit halves to reach the high one. */
      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
   }

#undef dct_const
#undef dct_rot
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_interleave8
#undef dct_interleave16
#undef dct_pass
}

#endif
1066 
#ifdef RJPEG_NEON

/* NEON integer IDCT. should produce bit-identical
 * results to the generic C version.
 *
 * out        : top-left byte of the 8x8 destination block; 8 bytes are
 *              written per scanline.
 * out_stride : byte distance between destination scanlines.
 * data       : 64 dequantized coefficients.
 */
static void rjpeg__idct_simd(uint8_t *out, int out_stride, short data[64])
{
   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;

   /* fixed-point rotation coefficients, one per vector; applied with
    * widening multiply / multiply-accumulate below. */
   int16x4_t rot0_0 = vdup_n_s16(rjpeg__f2f(0.5411961f));
   int16x4_t rot0_1 = vdup_n_s16(rjpeg__f2f(-1.847759065f));
   int16x4_t rot0_2 = vdup_n_s16(rjpeg__f2f( 0.765366865f));
   int16x4_t rot1_0 = vdup_n_s16(rjpeg__f2f( 1.175875602f));
   int16x4_t rot1_1 = vdup_n_s16(rjpeg__f2f(-0.899976223f));
   int16x4_t rot1_2 = vdup_n_s16(rjpeg__f2f(-2.562915447f));
   int16x4_t rot2_0 = vdup_n_s16(rjpeg__f2f(-1.961570560f));
   int16x4_t rot2_1 = vdup_n_s16(rjpeg__f2f(-0.390180644f));
   int16x4_t rot3_0 = vdup_n_s16(rjpeg__f2f( 0.298631336f));
   int16x4_t rot3_1 = vdup_n_s16(rjpeg__f2f( 2.053119869f));
   int16x4_t rot3_2 = vdup_n_s16(rjpeg__f2f( 3.072711026f));
   int16x4_t rot3_3 = vdup_n_s16(rjpeg__f2f( 1.501321110f));

/* out = inq * coeff  (16-bit in, widened to 32-bit out) */
#define dct_long_mul(out, inq, coeff) \
   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)

/* out = acc + inq * coeff  (widening multiply-accumulate) */
#define dct_long_mac(out, acc, inq, coeff) \
   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)

/* out = inq << 12  (16-bit in, 32-bit out) */
#define dct_widen(out, inq) \
   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)

/* wide add */
#define dct_wadd(out, a, b) \
   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)

/* wide sub */
#define dct_wsub(out, a, b) \
   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)

/* butterfly a/b, then shift using "shiftop" by "s" and pack */
#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
   { \
      dct_wadd(sum, a, b); \
      dct_wsub(dif, a, b); \
      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
   }

/* one full 1-D IDCT pass over all eight rows; "shiftop"/"shift"
 * select the narrowing shift used when packing back to 16 bits. */
#define dct_pass(shiftop, shift) \
   { \
      /* even part */ \
      int16x8_t sum26 = vaddq_s16(row2, row6); \
      dct_long_mul(p1e, sum26, rot0_0); \
      dct_long_mac(t2e, p1e, row6, rot0_1); \
      dct_long_mac(t3e, p1e, row2, rot0_2); \
      int16x8_t sum04 = vaddq_s16(row0, row4); \
      int16x8_t dif04 = vsubq_s16(row0, row4); \
      dct_widen(t0e, sum04); \
      dct_widen(t1e, dif04); \
      dct_wadd(x0, t0e, t3e); \
      dct_wsub(x3, t0e, t3e); \
      dct_wadd(x1, t1e, t2e); \
      dct_wsub(x2, t1e, t2e); \
      /* odd part */ \
      int16x8_t sum15 = vaddq_s16(row1, row5); \
      int16x8_t sum17 = vaddq_s16(row1, row7); \
      int16x8_t sum35 = vaddq_s16(row3, row5); \
      int16x8_t sum37 = vaddq_s16(row3, row7); \
      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
      dct_long_mul(p5o, sumodd, rot1_0); \
      dct_long_mac(p1o, p5o, sum17, rot1_1); \
      dct_long_mac(p2o, p5o, sum35, rot1_2); \
      dct_long_mul(p3o, sum37, rot2_0); \
      dct_long_mul(p4o, sum15, rot2_1); \
      dct_wadd(sump13o, p1o, p3o); \
      dct_wadd(sump24o, p2o, p4o); \
      dct_wadd(sump23o, p2o, p3o); \
      dct_wadd(sump14o, p1o, p4o); \
      dct_long_mac(x4, sump13o, row7, rot3_0); \
      dct_long_mac(x5, sump24o, row5, rot3_1); \
      dct_long_mac(x6, sump23o, row3, rot3_2); \
      dct_long_mac(x7, sump14o, row1, rot3_3); \
      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
   }

   /* load */
   row0 = vld1q_s16(data + 0*8);
   row1 = vld1q_s16(data + 1*8);
   row2 = vld1q_s16(data + 2*8);
   row3 = vld1q_s16(data + 3*8);
   row4 = vld1q_s16(data + 4*8);
   row5 = vld1q_s16(data + 5*8);
   row6 = vld1q_s16(data + 6*8);
   row7 = vld1q_s16(data + 7*8);

   /* add DC bias (added to lane 0 of row0 only) */
   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));

   /* column pass */
   dct_pass(vrshrn_n_s32, 10);

   /* 16bit 8x8 transpose */
   {
/* these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
 * whether compilers actually get this is another story, sadly. */
#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }

      /* pass 1 */
      dct_trn16(row0, row1); /* a0b0a2b2a4b4a6b6 */
      dct_trn16(row2, row3);
      dct_trn16(row4, row5);
      dct_trn16(row6, row7);

      /* pass 2 */
      dct_trn32(row0, row2); /* a0b0c0d0a4b4c4d4 */
      dct_trn32(row1, row3);
      dct_trn32(row4, row6);
      dct_trn32(row5, row7);

      /* pass 3 */
      dct_trn64(row0, row4); /* a0b0c0d0e0f0g0h0 */
      dct_trn64(row1, row5);
      dct_trn64(row2, row6);
      dct_trn64(row3, row7);

#undef dct_trn16
#undef dct_trn32
#undef dct_trn64
   }

   /* row pass
    * vrshrn_n_s32 only supports shifts up to 16, we need
    * 17. so do a non-rounding shift of 16 first then follow
    * up with a rounding shift by 1. */
   dct_pass(vshrn_n_s32, 16);

   {
      /* pack and round (saturating narrow to unsigned bytes,
       * with the remaining rounding shift by 1) */
      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);

      /* again, these can translate into one instruction, but often don't. */
#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }

      /* sadly can't use interleaved stores here since we only write
       * 8 bytes to each scan line! */

      /* 8x8 8-bit transpose pass 1 */
      dct_trn8_8(p0, p1);
      dct_trn8_8(p2, p3);
      dct_trn8_8(p4, p5);
      dct_trn8_8(p6, p7);

      /* pass 2 */
      dct_trn8_16(p0, p2);
      dct_trn8_16(p1, p3);
      dct_trn8_16(p4, p6);
      dct_trn8_16(p5, p7);

      /* pass 3 */
      dct_trn8_32(p0, p4);
      dct_trn8_32(p1, p5);
      dct_trn8_32(p2, p6);
      dct_trn8_32(p3, p7);

      /* store one 8-byte scanline per register */
      vst1_u8(out, p0); out += out_stride;
      vst1_u8(out, p1); out += out_stride;
      vst1_u8(out, p2); out += out_stride;
      vst1_u8(out, p3); out += out_stride;
      vst1_u8(out, p4); out += out_stride;
      vst1_u8(out, p5); out += out_stride;
      vst1_u8(out, p6); out += out_stride;
      vst1_u8(out, p7);

#undef dct_trn8_8
#undef dct_trn8_16
#undef dct_trn8_32
   }

#undef dct_long_mul
#undef dct_long_mac
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_pass
}

#endif /* RJPEG_NEON */
1274 
/* Return the next JPEG marker code, or RJPEG__MARKER_NONE if the
 * stream position does not hold a marker. */
static uint8_t rjpeg__get_marker(rjpeg__jpeg *j)
{
   uint8_t m = j->marker;

   /* a marker may already be buffered (seen early by the entropy
    * decoder); consume it first */
   if (m != RJPEG__MARKER_NONE)
   {
      j->marker = RJPEG__MARKER_NONE;
      return m;
   }

   /* a marker must begin with at least one 0xFF byte */
   m = rjpeg__get8(j->s);
   if (m != 0xff)
      return RJPEG__MARKER_NONE;

   /* skip any run of 0xFF fill bytes; the byte after is the code */
   do
   {
      m = rjpeg__get8(j->s);
   } while (m == 0xff);

   return m;
}
1293 
1294 /* after a restart interval, rjpeg__jpeg_reset the entropy decoder and
1295  * the dc prediction
1296  */
rjpeg__jpeg_reset(rjpeg__jpeg * j)1297 static void rjpeg__jpeg_reset(rjpeg__jpeg *j)
1298 {
1299    j->code_bits           = 0;
1300    j->code_buffer         = 0;
1301    j->nomore              = 0;
1302    j->img_comp[0].dc_pred = 0;
1303    j->img_comp[1].dc_pred = 0;
1304    j->img_comp[2].dc_pred = 0;
1305    j->marker              = RJPEG__MARKER_NONE;
1306    j->todo                = j->restart_interval ? j->restart_interval : 0x7fffffff;
1307    j->eob_run             = 0;
1308 
1309    /* no more than 1<<31 MCUs if no restart_interal? that's plenty safe,
1310     * since we don't even allow 1<<30 pixels */
1311 }
1312 
/* Decode the entropy-coded data for one scan.
 *
 * Baseline scans decode each 8x8 block and IDCT it straight into the
 * component's pixel plane; progressive scans only accumulate
 * coefficients (the IDCT is deferred to rjpeg__jpeg_finish).
 * Handles both non-interleaved (scan_n == 1) and interleaved scans,
 * honoring restart intervals.
 *
 * Returns 1 on success -- including the case where an expected restart
 * marker is missing, so corrupt streams still yield partial output --
 * and 0 on a hard decode error. */
static int rjpeg__parse_entropy_coded_data(rjpeg__jpeg *z)
{
   rjpeg__jpeg_reset(z);

   if (z->scan_n == 1)
   {
      int i,j;
      int n = z->order[0];
      /* component size in 8x8 blocks, rounded up */
      int w = (z->img_comp[n].x+7) >> 3;
      int h = (z->img_comp[n].y+7) >> 3;

      /* non-interleaved data, we just need to process one block at a time,
       * in trivial scanline order
       * number of blocks to do just depends on how many actual "pixels" this
       * component has, independent of interleaved MCU blocking and such */

      if (z->progressive)
      {
         for (j=0; j < h; ++j)
         {
            for (i=0; i < w; ++i)
            {
               /* coefficients accumulate in the per-component buffer */
               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);

               if (z->spec_start == 0)
               {
                  /* DC-only scan (first or refinement) */
                  if (!rjpeg__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
                     return 0;
               }
               else
               {
                  /* AC scan for the spectral band spec_start..spec_end */
                  int ha = z->img_comp[n].ha;
                  if (!rjpeg__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
                     return 0;
               }

               /* every data block is an MCU, so countdown the restart interval */
               if (--z->todo <= 0)
               {
                  if (z->code_bits < 24)
                     rjpeg__grow_buffer_unsafe(z);

                  if (!RJPEG__RESTART(z->marker))
                     return 1;
                  rjpeg__jpeg_reset(z);
               }
            }
         }
      }
      else
      {
         /* baseline: decode and IDCT one block at a time from a
          * SIMD-aligned scratch buffer */
         RJPEG_SIMD_ALIGN(short, data[64]);

         for (j=0; j < h; ++j)
         {
            for (i=0; i < w; ++i)
            {
               int ha = z->img_comp[n].ha;
               if (!rjpeg__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd,
                        z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq]))
                  return 0;

               /* write 8x8 pixels into the plane at block (i, j) */
               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8,
                     z->img_comp[n].w2, data);

               /* every data block is an MCU, so countdown the restart interval */
               if (--z->todo <= 0)
               {
                  if (z->code_bits < 24)
                     rjpeg__grow_buffer_unsafe(z);

                  /* if it's NOT a restart, then just bail,
                   * so we get corrupt data rather than no data */
                  if (!RJPEG__RESTART(z->marker))
                     return 1;
                  rjpeg__jpeg_reset(z);
               }
            }
         }
      }
   }
   else
   {
      /* interleaved */
      int i,j,k,x,y;

      if (z->progressive)
      {
         for (j=0; j < z->img_mcu_y; ++j)
         {
            for (i=0; i < z->img_mcu_x; ++i)
            {
               /* scan an interleaved MCU... process scan_n components in order */
               for (k=0; k < z->scan_n; ++k)
               {
                  int n = z->order[k];
                  /* scan out an MCU's worth of this component; that's just determined
                   * by the basic H and V specified for the component */
                  for (y=0; y < z->img_comp[n].v; ++y)
                  {
                     for (x=0; x < z->img_comp[n].h; ++x)
                     {
                        /* (x2, y2) is the block position in the coefficient grid */
                        int x2 = (i*z->img_comp[n].h + x);
                        int y2 = (j*z->img_comp[n].v + y);
                        short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
                        /* interleaved progressive scans carry DC only */
                        if (!rjpeg__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
                           return 0;
                     }
                  }
               }

               /* after all interleaved components, that's an interleaved MCU,
                * so now count down the restart interval */
               if (--z->todo <= 0)
               {
                  if (z->code_bits < 24)
                     rjpeg__grow_buffer_unsafe(z);
                  if (!RJPEG__RESTART(z->marker))
                     return 1;
                  rjpeg__jpeg_reset(z);
               }
            }
         }
      }
      else
      {
         RJPEG_SIMD_ALIGN(short, data[64]);

         for (j=0; j < z->img_mcu_y; ++j)
         {
            for (i=0; i < z->img_mcu_x; ++i)
            {
               /* scan an interleaved MCU... process scan_n components in order */
               for (k=0; k < z->scan_n; ++k)
               {
                  int n = z->order[k];
                  /* scan out an MCU's worth of this component; that's just determined
                   * by the basic H and V specified for the component */
                  for (y=0; y < z->img_comp[n].v; ++y)
                  {
                     for (x=0; x < z->img_comp[n].h; ++x)
                     {
                        /* (x2, y2) is the pixel position of the block in the plane */
                        int x2 = (i*z->img_comp[n].h + x)*8;
                        int y2 = (j*z->img_comp[n].v + y)*8;
                        int ha = z->img_comp[n].ha;

                        if (!rjpeg__jpeg_decode_block(z, data,
                                 z->huff_dc+z->img_comp[n].hd,
                                 z->huff_ac+ha, z->fast_ac[ha],
                                 n, z->dequant[z->img_comp[n].tq]))
                           return 0;

                        z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2,
                              z->img_comp[n].w2, data);
                     }
                  }
               }

               /* after all interleaved components, that's an interleaved MCU,
                * so now count down the restart interval */
               if (--z->todo <= 0)
               {
                  if (z->code_bits < 24)
                     rjpeg__grow_buffer_unsafe(z);
                  if (!RJPEG__RESTART(z->marker))
                     return 1;
                  rjpeg__jpeg_reset(z);
               }
            }
         }
      }
   }

   return 1;
}
1488 
/* Multiply each of the 64 block coefficients by its quantization
 * table entry, in place. */
static void rjpeg__jpeg_dequantize(short *data, uint8_t *dequant)
{
   short *end = data + 64;
   while (data < end)
      *data++ *= *dequant++;
}
1495 
/* Finish a progressive image: dequantize the accumulated coefficients
 * and IDCT every block into the component pixel planes. Baseline
 * images are IDCT'd during the scan and need no work here. */
static void rjpeg__jpeg_finish(rjpeg__jpeg *z)
{
   int comp;

   if (!z->progressive)
      return;

   for (comp = 0; comp < z->s->img_n; ++comp)
   {
      int bx, by;
      /* component size in 8x8 blocks, rounded up */
      int blocks_w = (z->img_comp[comp].x + 7) >> 3;
      int blocks_h = (z->img_comp[comp].y + 7) >> 3;

      for (by = 0; by < blocks_h; ++by)
      {
         for (bx = 0; bx < blocks_w; ++bx)
         {
            short *data = z->img_comp[comp].coeff
                  + 64 * (bx + by * z->img_comp[comp].coeff_w);

            rjpeg__jpeg_dequantize(data, z->dequant[z->img_comp[comp].tq]);
            z->idct_block_kernel(
                  z->img_comp[comp].data + z->img_comp[comp].w2 * by * 8 + bx * 8,
                  z->img_comp[comp].w2, data);
         }
      }
   }
}
1520 
/* Process one miscellaneous marker segment (everything except SOF/SOS).
 * "m" is the marker code byte. Handles DRI (0xDD), DQT (0xDB) and
 * DHT (0xC4); APPn (0xE0-0xEF) and COM (0xFE) segments are skipped.
 * Returns 1 on success, 0 on corrupt or unrecognized data. */
static int rjpeg__process_marker(rjpeg__jpeg *z, int m)
{
   int L;
   switch (m)
   {
      case RJPEG__MARKER_NONE: /* no marker found */
         /* Expected marker. Corrupt JPEG? */
         return 0;

      case 0xDD: /* DRI - specify restart interval */

         /* Bad DRI length. Corrupt JPEG? */
         if (RJPEG__GET16BE(z->s) != 4)
            return 0;

         z->restart_interval = RJPEG__GET16BE(z->s);
         return 1;

      case 0xDB: /* DQT - define quantization table */
         L = RJPEG__GET16BE(z->s)-2;
         while (L > 0)
         {
            int i;
            int q = rjpeg__get8(z->s);
            int p = q >> 4;  /* entry precision: 0 = 8-bit */
            int t = q & 15;  /* table index */

            /* Bad DQT type (only 8-bit tables supported). Corrupt JPEG? */
            if (p != 0)
               return 0;

            /* Bad DQT table. Corrupt JPEG? */
            if (t > 3)
               return 0;

            /* table entries arrive in zigzag order */
            for (i=0; i < 64; ++i)
               z->dequant[t][rjpeg__jpeg_dezigzag[i]] = rjpeg__get8(z->s);
            L -= 65;
         }
         return L==0;

      case 0xC4: /* DHT - define huffman table */
         L = RJPEG__GET16BE(z->s)-2;
         while (L > 0)
         {
            int sizes[16],i,n=0;
            uint8_t *v = NULL;
            int q      = rjpeg__get8(z->s);
            int tc     = q >> 4;  /* table class: 0 = DC, 1 = AC */
            int th     = q & 15;  /* table index */

            /* Bad DHT header. Corrupt JPEG? */
            if (tc > 1 || th > 3)
               return 0;

            for (i=0; i < 16; ++i)
            {
               sizes[i] = rjpeg__get8(z->s);
               n += sizes[i];
            }

            /* A Huffman table holds at most 256 symbols; a larger count
             * (the 16 size bytes can sum to 16*255) would overflow the
             * values array. Corrupt JPEG? */
            if (n > 256)
               return 0;

            L -= 17;

            if (tc == 0)
            {
               if (!rjpeg__build_huffman(z->huff_dc+th, sizes))
                  return 0;
               v = z->huff_dc[th].values;
            }
            else
            {
               if (!rjpeg__build_huffman(z->huff_ac+th, sizes))
                  return 0;
               v = z->huff_ac[th].values;
            }
            for (i=0; i < n; ++i)
               v[i] = rjpeg__get8(z->s);
            /* AC tables also get the fast-decode acceleration table */
            if (tc != 0)
               rjpeg__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
            L -= n;
         }
         return L==0;
   }

   /* check for comment block or APP blocks */
   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE)
   {
      int n = RJPEG__GET16BE(z->s)-2;

      /* Skip the payload, clamping so the read cursor never advances
       * past the end of the input buffer (a bogus declared length must
       * not produce an out-of-range pointer). */
      if (n < 0 || n > (int)(z->s->img_buffer_end - z->s->img_buffer))
         z->s->img_buffer = z->s->img_buffer_end;
      else
         z->s->img_buffer += n;

      return 1;
   }
   return 0;
}
1617 
1618 /* after we see SOS */
/* Parse an SOS (start of scan) segment: component list with their
 * DC/AC Huffman table selectors, plus the progressive spectral-band
 * and successive-approximation parameters.
 * Returns 1 on success, 0 on corrupt data. */
static int rjpeg__process_scan_header(rjpeg__jpeg *z)
{
   int k;
   int Ls = RJPEG__GET16BE(z->s);

   z->scan_n = rjpeg__get8(z->s);

   /* component count must be 1..4 and no more than the frame declares */
   if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n)
      return 0;

   /* segment length is fully determined by the component count */
   if (Ls != 6 + 2 * z->scan_n)
      return 0;

   for (k = 0; k < z->scan_n; ++k)
   {
      int c;
      int id = rjpeg__get8(z->s);
      int q  = rjpeg__get8(z->s);

      /* match this scan component to a frame component by id */
      for (c = 0; c < z->s->img_n; ++c)
      {
         if (z->img_comp[c].id == id)
            break;
      }
      if (c == z->s->img_n)
         return 0; /* no match */

      /* DC table selector (high nibble) must be 0..3 */
      z->img_comp[c].hd = q >> 4;
      if (z->img_comp[c].hd > 3)
         return 0;

      /* AC table selector (low nibble) must be 0..3 */
      z->img_comp[c].ha = q & 15;
      if (z->img_comp[c].ha > 3)
         return 0;

      z->order[k] = c;
   }

   {
      int aa;
      z->spec_start = rjpeg__get8(z->s);
      z->spec_end   = rjpeg__get8(z->s); /* should be 63, but might be 0 */
      aa            = rjpeg__get8(z->s);
      z->succ_high  = aa >> 4;
      z->succ_low   = aa & 15;
   }

   if (z->progressive)
   {
      /* Bad SOS. Corrupt JPEG? */
      if (z->spec_start > 63 ||
          z->spec_end   > 63 ||
          z->spec_start > z->spec_end ||
          z->succ_high  > 13 ||
          z->succ_low   > 13)
         return 0;
   }
   else
   {
      /* Bad SOS. Corrupt JPEG? */
      if (z->spec_start != 0)
         return 0;
      if (z->succ_high != 0 || z->succ_low != 0)
         return 0;

      /* baseline always decodes the full 0..63 spectral band */
      z->spec_end = 63;
   }

   return 1;
}
1687 
rjpeg__process_frame_header(rjpeg__jpeg * z,int scan)1688 static int rjpeg__process_frame_header(rjpeg__jpeg *z, int scan)
1689 {
1690    rjpeg__context *s = z->s;
1691    int Lf,p,i,q, h_max=1,v_max=1,c;
1692    Lf = RJPEG__GET16BE(s);
1693 
1694    /* JPEG */
1695 
1696    /* Bad SOF len. Corrupt JPEG? */
1697    if (Lf < 11)
1698       return 0;
1699 
1700    p  = rjpeg__get8(s);
1701 
1702    /* JPEG baseline */
1703 
1704    /* Only 8-bit. JPEG format not supported? */
1705    if (p != 8)
1706       return 0;
1707 
1708    s->img_y = RJPEG__GET16BE(s);
1709 
1710    /* Legal, but we don't handle it--but neither does IJG */
1711 
1712    /* No header height, JPEG format not supported? */
1713    if (s->img_y == 0)
1714       return 0;
1715 
1716    s->img_x = RJPEG__GET16BE(s);
1717 
1718    /* No header width. Corrupt JPEG? */
1719    if (s->img_x == 0)
1720       return 0;
1721 
1722    c = rjpeg__get8(s);
1723 
1724    /* JFIF requires */
1725 
1726    /* Bad component count. Corrupt JPEG? */
1727    if (c != 3 && c != 1)
1728       return 0;
1729 
1730    s->img_n = c;
1731 
1732    for (i=0; i < c; ++i)
1733    {
1734       z->img_comp[i].data = NULL;
1735       z->img_comp[i].linebuf = NULL;
1736    }
1737 
1738    /* Bad SOF length. Corrupt JPEG? */
1739    if (Lf != 8+3*s->img_n)
1740       return 0;
1741 
1742    for (i=0; i < s->img_n; ++i)
1743    {
1744       z->img_comp[i].id = rjpeg__get8(s);
1745       if (z->img_comp[i].id != i+1)   /* JFIF requires */
1746          if (z->img_comp[i].id != i)  /* some version of jpegtran outputs non-JFIF-compliant files! */
1747             return 0;
1748 
1749       q = rjpeg__get8(s);
1750       z->img_comp[i].h = (q >> 4);
1751 
1752       /* Bad H. Corrupt JPEG? */
1753       if (!z->img_comp[i].h || z->img_comp[i].h > 4)
1754          return 0;
1755 
1756       z->img_comp[i].v = q & 15;
1757 
1758       /* Bad V. Corrupt JPEG? */
1759       if (!z->img_comp[i].v || z->img_comp[i].v > 4)
1760          return 0;
1761 
1762       z->img_comp[i].tq = rjpeg__get8(s);
1763 
1764       /* Bad TQ. Corrupt JPEG? */
1765       if (z->img_comp[i].tq > 3)
1766          return 0;
1767    }
1768 
1769    if (scan != RJPEG_SCAN_LOAD)
1770       return 1;
1771 
1772    /* Image too large to decode? */
1773    if ((1 << 30) / s->img_x / s->img_n < s->img_y)
1774       return 0;
1775 
1776    for (i=0; i < s->img_n; ++i)
1777    {
1778       if (z->img_comp[i].h > h_max)
1779          h_max = z->img_comp[i].h;
1780       if (z->img_comp[i].v > v_max)
1781          v_max = z->img_comp[i].v;
1782    }
1783 
1784    /* compute interleaved MCU info */
1785    z->img_h_max = h_max;
1786    z->img_v_max = v_max;
1787    z->img_mcu_w = h_max * 8;
1788    z->img_mcu_h = v_max * 8;
1789    z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
1790    z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
1791 
1792    if (z->progressive)
1793    {
1794       for (i=0; i < s->img_n; ++i)
1795       {
1796          /* number of effective pixels (e.g. for non-interleaved MCU) */
1797          z->img_comp[i].x        = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
1798          z->img_comp[i].y        = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
1799 
1800          /* to simplify generation, we'll allocate enough memory to decode
1801           * the bogus oversized data from using interleaved MCUs and their
1802           * big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
1803           * discard the extra data until colorspace conversion */
1804          z->img_comp[i].w2       = z->img_mcu_x * z->img_comp[i].h * 8;
1805          z->img_comp[i].h2       = z->img_mcu_y * z->img_comp[i].v * 8;
1806          z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
1807 
1808          /* Out of memory? */
1809          if (!z->img_comp[i].raw_data)
1810          {
1811             for(--i; i >= 0; --i)
1812             {
1813                free(z->img_comp[i].raw_data);
1814                z->img_comp[i].data = NULL;
1815             }
1816 
1817             return 0;
1818          }
1819 
1820          /* align blocks for IDCT using MMX/SSE */
1821          z->img_comp[i].data      = (uint8_t*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
1822          z->img_comp[i].linebuf   = NULL;
1823          z->img_comp[i].coeff_w   = (z->img_comp[i].w2 + 7) >> 3;
1824          z->img_comp[i].coeff_h   = (z->img_comp[i].h2 + 7) >> 3;
1825          z->img_comp[i].raw_coeff = malloc(z->img_comp[i].coeff_w *
1826                                     z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);
1827          z->img_comp[i].coeff     = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
1828       }
1829    }
1830    else
1831    {
1832       for (i=0; i < s->img_n; ++i)
1833       {
1834          /* number of effective pixels (e.g. for non-interleaved MCU) */
1835          z->img_comp[i].x        = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
1836          z->img_comp[i].y        = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
1837 
1838          /* to simplify generation, we'll allocate enough memory to decode
1839           * the bogus oversized data from using interleaved MCUs and their
1840           * big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
1841           * discard the extra data until colorspace conversion */
1842          z->img_comp[i].w2       = z->img_mcu_x * z->img_comp[i].h * 8;
1843          z->img_comp[i].h2       = z->img_mcu_y * z->img_comp[i].v * 8;
1844          z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
1845 
1846          /* Out of memory? */
1847          if (!z->img_comp[i].raw_data)
1848          {
1849             for(--i; i >= 0; --i)
1850             {
1851                free(z->img_comp[i].raw_data);
1852                z->img_comp[i].data = NULL;
1853             }
1854          }
1855 
1856          /* align blocks for IDCT using MMX/SSE */
1857          z->img_comp[i].data      = (uint8_t*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
1858          z->img_comp[i].linebuf   = NULL;
1859          z->img_comp[i].coeff     = 0;
1860          z->img_comp[i].raw_coeff = 0;
1861       }
1862    }
1863 
1864    return 1;
1865 }
1866 
/* Parse from SOI up to and including the frame header (SOF).
 * Returns 1 on success, 0 on corrupt or truncated data.  When scan is
 * RJPEG_SCAN_TYPE, only the SOI signature is verified. */
static int rjpeg__decode_jpeg_header(rjpeg__jpeg *z, int scan)
{
   int marker;

   z->marker = RJPEG__MARKER_NONE; /* initialize cached marker to empty */

   /* a valid stream opens with an SOI marker */
   if (rjpeg__get_marker(z) != JPEG_MARKER_SOI)
      return 0;

   if (scan == RJPEG_SCAN_TYPE)
      return 1;

   /* consume table/misc segments until we reach a start-of-frame */
   marker = rjpeg__get_marker(z);
   while (!rjpeg__SOF(marker))
   {
      if (!rjpeg__process_marker(z, marker))
         return 0;

      /* some files pad after their blocks; poll until a real marker shows up */
      do
      {
         marker = rjpeg__get_marker(z);
      } while (marker == RJPEG__MARKER_NONE && !RJPEG__AT_EOF(z->s));

      if (marker == RJPEG__MARKER_NONE)
         return 0; /* hit EOF without an SOF: corrupt JPEG */
   }

   z->progressive = rjpeg__SOF_progressive(marker);

   return rjpeg__process_frame_header(z, scan) ? 1 : 0;
}
1902 
1903 /* decode image to YCbCr format */
/* Decode the entropy-coded image data into the per-component planes in
 * j->img_comp[], walking every marker segment from after SOI through EOI
 * and running one scan pass per SOS marker.
 * Returns 1 on success, 0 on corrupt or truncated data. */
static int rjpeg__decode_jpeg_image(rjpeg__jpeg *j)
{
   int m;
   /* clear component buffers first so cleanup is safe on any failure path */
   for (m = 0; m < 4; m++)
   {
      j->img_comp[m].raw_data = NULL;
      j->img_comp[m].raw_coeff = NULL;
   }
   j->restart_interval = 0;
   if (!rjpeg__decode_jpeg_header(j, RJPEG_SCAN_LOAD))
      return 0;
   m = rjpeg__get_marker(j);

   while (m != JPEG_MARKER_EOI)
   {
      if (m == JPEG_MARKER_SOS)
      {
         if (!rjpeg__process_scan_header(j))
            return 0;
         if (!rjpeg__parse_entropy_coded_data(j))
            return 0;

         if (j->marker == RJPEG__MARKER_NONE )
         {
            /* handle 0s at the end of image data from IP Kamera 9060 */

            while (!RJPEG__AT_EOF(j->s))
            {
               int x = rjpeg__get8(j->s);
               if (x == 255)
               {
                  /* 0xFF introduces a marker; cache its second byte */
                  j->marker = rjpeg__get8(j->s);
                  break;
               }
               else if (x != 0) /* Junk before marker. Corrupt JPEG? */
                  return 0;
            }

            /* if we reach eof without hitting a marker,
             * rjpeg__get_marker() below will fail and we'll eventually return 0 */
         }
      }
      else
      {
         /* non-SOS segment (tables, restart interval, APPn, ...) between scans */
         if (!rjpeg__process_marker(j, m))
            return 0;
      }
      m = rjpeg__get_marker(j);
   }

   /* progressive images accumulate coefficients across scans;
    * run the final IDCT pass now that all scans are in */
   if (j->progressive)
      rjpeg__jpeg_finish(j);
   return 1;
}
1958 
1959 /* static jfif-centered resampling (across block boundaries) */
1960 
/* 1:1 "resampler": the component is already at full resolution, so no
 * work is needed — hand the near row back untouched (out is unused). */
static uint8_t *rjpeg_resample_row_1(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   /* silence unused-parameter warnings; only in_near matters here */
   (void)out;
   (void)in_far;
   (void)w;
   (void)hs;

   return in_near;
}
1970 
/* Vertical 2x upsample: every output sample is a 3:1 weighted blend of
 * the nearer and farther source rows, ((3*near + far + 2) / 4). */
static uint8_t* rjpeg__resample_row_v_2(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   int col;

   (void)hs;

   for (col = 0; col < w; col++)
      out[col] = rjpeg__div4(in_near[col] * 3 + in_far[col] + 2);

   return out;
}
1981 
/* Horizontal 2x upsample: each source pixel produces two outputs, each a
 * 3:1 blend with the appropriate neighbour; the row edges replicate. */
static uint8_t*  rjpeg__resample_row_h_2(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   int i;
   uint8_t *src = in_near;

   (void)in_far;
   (void)hs;

   if (w == 1)
   {
      /* a single sample: nothing to interpolate with */
      out[0] = out[1] = src[0];
      return out;
   }

   /* left edge: copy, then blend toward the second sample */
   out[0] = src[0];
   out[1] = rjpeg__div4(src[0] * 3 + src[1] + 2);

   /* interior: blend each pixel 3:1 with its left and right neighbours */
   for (i = 1; i < w - 1; i++)
   {
      int acc      = src[i] * 3 + 2;
      out[2*i]     = rjpeg__div4(acc + src[i - 1]);
      out[2*i + 1] = rjpeg__div4(acc + src[i + 1]);
   }

   /* right edge mirrors the left-edge treatment */
   out[2*(w - 1)]     = rjpeg__div4(src[w - 2] * 3 + src[w - 1] + 2);
   out[2*(w - 1) + 1] = src[w - 1];

   return out;
}
2013 
/* Combined 2x2 upsample: vertical 3:1 blends feed a horizontal 3:1
 * polyphase filter, producing a 2x2 output block per input pixel. */
static uint8_t *rjpeg__resample_row_hv_2(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   int i;
   int prev, cur;

   (void)hs;

   if (w == 1)
   {
      /* single column: vertical blend only, duplicated horizontally */
      out[0] = out[1] = rjpeg__div4(in_near[0] * 3 + in_far[0] + 2);
      return out;
   }

   /* rolling vertical blends: prev/cur hold 3*near + far for columns i-1, i */
   cur    = in_near[0] * 3 + in_far[0];
   out[0] = rjpeg__div4(cur + 2);

   for (i = 1; i < w; i++)
   {
      prev         = cur;
      cur          = in_near[i] * 3 + in_far[i];
      out[2*i - 1] = rjpeg__div16(prev * 3 + cur + 8);
      out[2*i]     = rjpeg__div16(cur * 3 + prev + 8);
   }
   out[2*w - 1] = rjpeg__div4(cur + 2);

   return out;
}
2040 
#if defined(__SSE2__) || defined(RJPEG_NEON)
/* SIMD variant of rjpeg__resample_row_hv_2: same 2x2 upsample filter,
 * but processing 8 input pixels per iteration with SSE2 or NEON, then
 * finishing the row with scalar code (boundary pixels can't go through
 * the vector loop). */
static uint8_t *rjpeg__resample_row_hv_2_simd(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   /* need to generate 2x2 samples for every one in input */
   int i=0,t0,t1;

   if (w == 1)
   {
      /* single column: vertical blend only, duplicated horizontally */
      out[0] = out[1] = rjpeg__div4(3*in_near[0] + in_far[0] + 2);
      return out;
   }

   /* t1 carries the vertical blend (3*near + far) of the previous column */
   t1 = 3*in_near[0] + in_far[0];
   /* process groups of 8 pixels for as long as we can.
    * note we can't handle the last pixel in a row in this loop
    * because we need to handle the filter boundary conditions.
    */
   for (; i < ((w-1) & ~7); i += 8)
   {
#if defined(__SSE2__)
      /* load and perform the vertical filtering pass
       * this uses 3*x + y = 4*x + (y - x) */
      __m128i zero  = _mm_setzero_si128();
      __m128i farb  = _mm_loadl_epi64((__m128i *) (in_far + i));
      __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
      __m128i farw  = _mm_unpacklo_epi8(farb, zero);
      __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
      __m128i diff  = _mm_sub_epi16(farw, nearw);
      __m128i nears = _mm_slli_epi16(nearw, 2);
      __m128i curr  = _mm_add_epi16(nears, diff); /* current row */

      /* horizontal filter works the same based on shifted vers of current
       * row. "prev" is current row shifted right by 1 pixel; we need to
       * insert the previous pixel value (from t1).
       * "next" is current row shifted left by 1 pixel, with first pixel
       * of next block of 8 pixels added in.
       */
      __m128i prv0 = _mm_slli_si128(curr, 2);
      __m128i nxt0 = _mm_srli_si128(curr, 2);
      __m128i prev = _mm_insert_epi16(prv0, t1, 0);
      __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);

      /* horizontal filter, polyphase implementation since it's convenient:
       * even pixels = 3*cur + prev = cur*4 + (prev - cur)
       * odd  pixels = 3*cur + next = cur*4 + (next - cur)
       * note the shared term. */
      __m128i bias = _mm_set1_epi16(8);
      __m128i curs = _mm_slli_epi16(curr, 2);
      __m128i prvd = _mm_sub_epi16(prev, curr);
      __m128i nxtd = _mm_sub_epi16(next, curr);
      __m128i curb = _mm_add_epi16(curs, bias);
      __m128i even = _mm_add_epi16(prvd, curb);
      __m128i odd  = _mm_add_epi16(nxtd, curb);

      /* interleave even and odd pixels, then undo scaling. */
      __m128i int0 = _mm_unpacklo_epi16(even, odd);
      __m128i int1 = _mm_unpackhi_epi16(even, odd);
      __m128i de0  = _mm_srli_epi16(int0, 4);
      __m128i de1  = _mm_srli_epi16(int1, 4);

      /* pack and write output */
      __m128i outv = _mm_packus_epi16(de0, de1);
      _mm_storeu_si128((__m128i *) (out + i*2), outv);
#elif defined(RJPEG_NEON)
      /* load and perform the vertical filtering pass
       * this uses 3*x + y = 4*x + (y - x) */
      uint8x8_t farb  = vld1_u8(in_far + i);
      uint8x8_t nearb = vld1_u8(in_near + i);
      int16x8_t diff  = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
      int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
      int16x8_t curr  = vaddq_s16(nears, diff); /* current row */

      /* horizontal filter works the same based on shifted vers of current
       * row. "prev" is current row shifted right by 1 pixel; we need to
       * insert the previous pixel value (from t1).
       * "next" is current row shifted left by 1 pixel, with first pixel
       * of next block of 8 pixels added in. */
      int16x8_t prv0 = vextq_s16(curr, curr, 7);
      int16x8_t nxt0 = vextq_s16(curr, curr, 1);
      int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
      int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);

      /* horizontal filter, polyphase implementation since it's convenient:
       * even pixels = 3*cur + prev = cur*4 + (prev - cur)
       * odd  pixels = 3*cur + next = cur*4 + (next - cur)
       * note the shared term.
       */
      int16x8_t curs = vshlq_n_s16(curr, 2);
      int16x8_t prvd = vsubq_s16(prev, curr);
      int16x8_t nxtd = vsubq_s16(next, curr);
      int16x8_t even = vaddq_s16(curs, prvd);
      int16x8_t odd  = vaddq_s16(curs, nxtd);

      /* undo scaling and round, then store with even/odd phases interleaved */
      uint8x8x2_t o;
      o.val[0] = vqrshrun_n_s16(even, 4);
      o.val[1] = vqrshrun_n_s16(odd,  4);
      vst2_u8(out + i*2, o);
#endif

      /* "previous" value for next iteration */
      t1 = 3*in_near[i+7] + in_far[i+7];
   }

   /* scalar tail: handle the remaining (w - i) columns, including the
    * row boundary the vector loop had to skip */
   t0       = t1;
   t1       = 3*in_near[i] + in_far[i];
   out[i*2] = rjpeg__div16(3*t1 + t0 + 8);

   for (++i; i < w; ++i)
   {
      t0         = t1;
      t1         = 3*in_near[i]+in_far[i];
      out[i*2-1] = rjpeg__div16(3*t0 + t1 + 8);
      out[i*2  ] = rjpeg__div16(3*t1 + t0 + 8);
   }
   out[w*2-1] = rjpeg__div4(t1+2);

   (void)hs;

   return out;
}
#endif
2164 
/* Fallback resampler: nearest-neighbour horizontal upsample by factor hs
 * (the vertical "far" row is ignored entirely). */
static uint8_t *rjpeg__resample_row_generic(uint8_t *out,
      uint8_t *in_near, uint8_t *in_far, int w, int hs)
{
   int col, rep;

   (void)in_far;

   for (col = 0; col < w; col++)
   {
      uint8_t v = in_near[col];
      for (rep = 0; rep < hs; rep++)
         out[col * hs + rep] = v;
   }
   return out;
}
2177 
/* this is a reduced-precision calculation of YCbCr-to-RGB introduced
 * to make sure the code produces the same results in both SIMD and scalar */
#ifndef float2fixed
#define float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
#endif

/* Convert one row of YCbCr samples to interleaved RGBx bytes.
 * out   : destination, advanced 'step' bytes per pixel (byte 3 set to 255)
 * y     : luma row
 * pcb   : Cb (blue-difference chroma) row
 * pcr   : Cr (red-difference chroma) row
 * count : number of pixels
 * step  : bytes per output pixel */
static void rjpeg__YCbCr_to_RGB_row(uint8_t *out, const uint8_t *y,
      const uint8_t *pcb, const uint8_t *pcr, int count, int step)
{
   int i;
   for (i=0; i < count; ++i)
   {
      int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
      int cr = pcr[i] - 128;
      int cb = pcb[i] - 128;
      int r = y_fixed +  cr* float2fixed(1.40200f);
      int g = y_fixed + (cr*-float2fixed(0.71414f)) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
      int b = y_fixed                               +   cb* float2fixed(1.77200f);
      r >>= 20;
      g >>= 20;
      b >>= 20;
      /* clamp to [0,255]: out-of-range chroma can push values below zero,
       * which must clamp to 0, not 255 (fixes under-range pixels turning
       * white; matches upstream stb_image behavior) */
      if ((unsigned) r > 255)
         r = (r < 0) ? 0 : 255;
      if ((unsigned) g > 255)
         g = (g < 0) ? 0 : 255;
      if ((unsigned) b > 255)
         b = (b < 0) ? 0 : 255;
      out[0] = (uint8_t)r;
      out[1] = (uint8_t)g;
      out[2] = (uint8_t)b;
      out[3] = 255;
      out += step;
   }
}
2212 
2213 #if defined(__SSE2__) || defined(RJPEG_NEON)
rjpeg__YCbCr_to_RGB_simd(uint8_t * out,const uint8_t * y,const uint8_t * pcb,const uint8_t * pcr,int count,int step)2214 static void rjpeg__YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
2215       const uint8_t *pcb, const uint8_t *pcr, int count, int step)
2216 {
2217    int i = 0;
2218 
2219 #if defined(__SSE2__)
2220    /* step == 3 is pretty ugly on the final interleave, and i'm not convinced
2221     * it's useful in practice (you wouldn't use it for textures, for example).
2222     * so just accelerate step == 4 case.
2223     */
2224    if (step == 4)
2225    {
2226       /* this is a fairly straightforward implementation and not super-optimized. */
2227       __m128i signflip  = _mm_set1_epi8(-0x80);
2228       __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
2229       __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
2230       __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
2231       __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
2232       __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
2233       __m128i xw = _mm_set1_epi16(255); /* alpha channel */
2234 
2235       for (; i+7 < count; i += 8)
2236       {
2237          /* load */
2238          __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
2239          __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
2240          __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
2241          __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); /* -128 */
2242          __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); /* -128 */
2243 
2244          /* unpack to short (and left-shift cr, cb by 8) */
2245          __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
2246          __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
2247          __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
2248 
2249          /* color transform */
2250          __m128i yws = _mm_srli_epi16(yw, 4);
2251          __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
2252          __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
2253          __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
2254          __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
2255          __m128i rws = _mm_add_epi16(cr0, yws);
2256          __m128i gwt = _mm_add_epi16(cb0, yws);
2257          __m128i bws = _mm_add_epi16(yws, cb1);
2258          __m128i gws = _mm_add_epi16(gwt, cr1);
2259 
2260          /* descale */
2261          __m128i rw = _mm_srai_epi16(rws, 4);
2262          __m128i bw = _mm_srai_epi16(bws, 4);
2263          __m128i gw = _mm_srai_epi16(gws, 4);
2264 
2265          /* back to byte, set up for transpose */
2266          __m128i brb = _mm_packus_epi16(rw, bw);
2267          __m128i gxb = _mm_packus_epi16(gw, xw);
2268 
2269          /* transpose to interleave channels */
2270          __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
2271          __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
2272          __m128i o0 = _mm_unpacklo_epi16(t0, t1);
2273          __m128i o1 = _mm_unpackhi_epi16(t0, t1);
2274 
2275          /* store */
2276          _mm_storeu_si128((__m128i *) (out + 0), o0);
2277          _mm_storeu_si128((__m128i *) (out + 16), o1);
2278          out += 32;
2279       }
2280    }
2281 #endif
2282 
2283 #ifdef RJPEG_NEON
2284    /* in this version, step=3 support would be easy to add. but is there demand? */
2285    if (step == 4)
2286    {
2287       /* this is a fairly straightforward implementation and not super-optimized. */
2288       uint8x8_t signflip = vdup_n_u8(0x80);
2289       int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
2290       int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
2291       int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
2292       int16x8_t cb_const1 = vdupq_n_s16(   (short) ( 1.77200f*4096.0f+0.5f));
2293 
2294       for (; i+7 < count; i += 8)
2295       {
2296          uint8x8x4_t o;
2297 
2298          /* load */
2299          uint8x8_t y_bytes  = vld1_u8(y + i);
2300          uint8x8_t cr_bytes = vld1_u8(pcr + i);
2301          uint8x8_t cb_bytes = vld1_u8(pcb + i);
2302          int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
2303          int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
2304 
2305          /* expand to s16 */
2306          int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
2307          int16x8_t crw = vshll_n_s8(cr_biased, 7);
2308          int16x8_t cbw = vshll_n_s8(cb_biased, 7);
2309 
2310          /* color transform */
2311          int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
2312          int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
2313          int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
2314          int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
2315          int16x8_t rws = vaddq_s16(yws, cr0);
2316          int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
2317          int16x8_t bws = vaddq_s16(yws, cb1);
2318 
2319          /* undo scaling, round, convert to byte */
2320          o.val[0] = vqrshrun_n_s16(rws, 4);
2321          o.val[1] = vqrshrun_n_s16(gws, 4);
2322          o.val[2] = vqrshrun_n_s16(bws, 4);
2323          o.val[3] = vdup_n_u8(255);
2324 
2325          /* store, interleaving r/g/b/a */
2326          vst4_u8(out, o);
2327          out += 8*4;
2328       }
2329    }
2330 #endif
2331 
2332    for (; i < count; ++i)
2333    {
2334       int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
2335       int cr      = pcr[i] - 128;
2336       int cb      = pcb[i] - 128;
2337       int r       = y_fixed + cr* float2fixed(1.40200f);
2338       int g       = y_fixed + cr*-float2fixed(0.71414f) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
2339       int b       = y_fixed                             +   cb* float2fixed(1.77200f);
2340       r >>= 20;
2341       g >>= 20;
2342       b >>= 20;
2343       if ((unsigned) r > 255)
2344          r = 255;
2345       if ((unsigned) g > 255)
2346          g = 255;
2347       if ((unsigned) b > 255)
2348          b = 255;
2349       out[0] = (uint8_t)r;
2350       out[1] = (uint8_t)g;
2351       out[2] = (uint8_t)b;
2352       out[3] = 255;
2353       out += step;
2354    }
2355 }
2356 #endif
2357 
2358 /* set up the kernels */
/* Install the scalar kernels, then upgrade to SIMD implementations when
 * available: SSE2 is gated on a runtime CPU-feature check, NEON is
 * selected unconditionally at build time. */
static void rjpeg__setup_jpeg(rjpeg__jpeg *j)
{
   uint64_t mask = cpu_features_get();

   (void)mask; /* unused unless a SIMD path below is compiled in */

   /* scalar defaults */
   j->idct_block_kernel        = rjpeg__idct_block;
   j->YCbCr_to_RGB_kernel      = rjpeg__YCbCr_to_RGB_row;
   j->resample_row_hv_2_kernel = rjpeg__resample_row_hv_2;

#if defined(__SSE2__)
   /* only take the SSE2 path when the running CPU reports support */
   if (mask & RETRO_SIMD_SSE2)
   {
      j->idct_block_kernel        = rjpeg__idct_simd;
      j->YCbCr_to_RGB_kernel      = rjpeg__YCbCr_to_RGB_simd;
      j->resample_row_hv_2_kernel = rjpeg__resample_row_hv_2_simd;
   }
#endif

#ifdef RJPEG_NEON
   /* NEON builds always use the SIMD kernels (no runtime check) */
   j->idct_block_kernel           = rjpeg__idct_simd;
   j->YCbCr_to_RGB_kernel         = rjpeg__YCbCr_to_RGB_simd;
   j->resample_row_hv_2_kernel    = rjpeg__resample_row_hv_2_simd;
#endif
}
2384 
2385 /* clean up the temporary component buffers */
/* Free every per-component scratch buffer owned by the decoder and reset
 * the associated pointers, so a second call (e.g. from both the error and
 * success paths) is harmless.
 * Note: free(NULL) is a well-defined no-op, so no guards are needed; the
 * old `if (ptr) free(ptr)` pattern also left the derived data/coeff
 * pointers dangling whenever the paired raw buffer was already NULL. */
static void rjpeg__cleanup_jpeg(rjpeg__jpeg *j)
{
   int i;
   for (i=0; i < j->s->img_n; ++i)
   {
      free(j->img_comp[i].raw_data);
      j->img_comp[i].raw_data  = NULL;
      j->img_comp[i].data      = NULL; /* aligned view into raw_data */

      free(j->img_comp[i].raw_coeff);
      j->img_comp[i].raw_coeff = NULL;
      j->img_comp[i].coeff     = NULL; /* aligned view into raw_coeff */

      free(j->img_comp[i].linebuf);
      j->img_comp[i].linebuf   = NULL;
   }
}
2412 
/* Run the full decode pipeline: decode to per-component YCbCr planes,
 * upsample each component to full resolution row by row, then
 * color-convert into an interleaved 8-bit buffer with req_comp channels.
 * Returns a malloc'd buffer owned by the caller (or NULL on failure);
 * *out_x/*out_y receive the image dimensions and *comp (if non-NULL) the
 * component count of the SOURCE image, not of the output. */
static uint8_t *rjpeg_load_jpeg_image(rjpeg__jpeg *z,
      unsigned *out_x, unsigned *out_y, int *comp, int req_comp)
{
   int n, decode_n;
   int k;
   unsigned int i,j;
   rjpeg__resample res_comp[4];
   uint8_t *coutput[4] = {0};
   uint8_t *output     = NULL;
   z->s->img_n         = 0; /* so cleanup frees nothing if decoding fails early */

   /* load a jpeg image from whichever source, but leave in YCbCr format */
   if (!rjpeg__decode_jpeg_image(z))
      goto error;

   /* determine actual number of components to generate */
   n = req_comp ? req_comp : z->s->img_n;

   /* grey output from a color image only needs the luma plane decoded */
   if (z->s->img_n == 3 && n < 3)
      decode_n = 1;
   else
      decode_n = z->s->img_n;

   /* resample and color-convert */
   for (k=0; k < decode_n; ++k)
   {
      rjpeg__resample *r = &res_comp[k];

      /* allocate line buffer big enough for upsampling off the edges
       * with upsample factor of 4 */
      z->img_comp[k].linebuf = (uint8_t *) malloc(z->s->img_x + 3);
      if (!z->img_comp[k].linebuf)
         goto error;

      /* per-component horizontal/vertical upsample factors and state */
      r->hs       = z->img_h_max / z->img_comp[k].h;
      r->vs       = z->img_v_max / z->img_comp[k].v;
      r->ystep    = r->vs >> 1;
      r->w_lores  = (z->s->img_x + r->hs-1) / r->hs;
      r->ypos     = 0;
      r->line0    = r->line1 = z->img_comp[k].data;
      r->resample = rjpeg__resample_row_generic;

      /* pick a specialized kernel for the common subsampling ratios */
      if      (r->hs == 1 && r->vs == 1)
         r->resample = rjpeg_resample_row_1;
      else if (r->hs == 1 && r->vs == 2)
         r->resample = rjpeg__resample_row_v_2;
      else if (r->hs == 2 && r->vs == 1)
         r->resample = rjpeg__resample_row_h_2;
      else if (r->hs == 2 && r->vs == 2)
         r->resample = z->resample_row_hv_2_kernel;
   }

   /* can't error after this so, this is safe */
   output = (uint8_t *) malloc(n * z->s->img_x * z->s->img_y + 1);

   if (!output)
      goto error;

   /* now go ahead and resample */
   for (j=0; j < z->s->img_y; ++j)
   {
      uint8_t *out = output + n * z->s->img_x * j;
      for (k=0; k < decode_n; ++k)
      {
         rjpeg__resample *r = &res_comp[k];
         /* y_bot selects which of the two buffered source rows is "nearer" */
         int         y_bot  = r->ystep >= (r->vs >> 1);

         coutput[k]         = r->resample(z->img_comp[k].linebuf,
               y_bot ? r->line1 : r->line0,
               y_bot ? r->line0 : r->line1,
               r->w_lores, r->hs);

         /* advance vertical phase; move to the next source row once all
          * output rows derived from the current pair have been emitted */
         if (++r->ystep >= r->vs)
         {
            r->ystep = 0;
            r->line0 = r->line1;
            if (++r->ypos < z->img_comp[k].y)
               r->line1 += z->img_comp[k].w2;
         }
      }

      if (n >= 3)
      {
         uint8_t *y = coutput[0];
         if (y)
         {
            if (z->s->img_n == 3)
               z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
            else
               /* greyscale source: replicate luma into R, G and B */
               for (i=0; i < z->s->img_x; ++i)
               {
                  out[0] = out[1] = out[2] = y[i];
                  out[3] = 255; /* not used if n==3 */
                  out += n;
               }
         }
      }
      else
      {
         uint8_t *y = coutput[0];
         if (n == 1)
            for (i=0; i < z->s->img_x; ++i)
               out[i] = y[i];
         else
            /* n == 2: luma plus opaque alpha */
            for (i=0; i < z->s->img_x; ++i)
            {
               *out++ = y[i];
               *out++ = 255;
            }
      }
   }

   rjpeg__cleanup_jpeg(z);
   *out_x = z->s->img_x;
   *out_y = z->s->img_y;

   if (comp)
      *comp  = z->s->img_n; /* report original components, not output */
   return output;

error:
   rjpeg__cleanup_jpeg(z);
   return NULL;
}
2537 
/* Decode the JPEG previously attached via rjpeg_set_buf_ptr() into a newly
 * allocated ARGB8888 buffer stored in *buf_data (caller frees).  Width and
 * height are written to *width / *height.  Returns IMAGE_PROCESS_END on
 * success, IMAGE_PROCESS_ERROR on failure. */
int rjpeg_process_image(rjpeg_t *rjpeg, void **buf_data,
      size_t size, unsigned *width, unsigned *height)
{
   rjpeg__jpeg j;
   rjpeg__context s;
   int comp;
   unsigned size_tex = 0;
   uint32_t *pixels  = NULL;
   uint32_t *img     = NULL;

   if (!rjpeg)
      return IMAGE_PROCESS_ERROR;

   /* wrap the caller's byte buffer in a decode context */
   s.img_buffer          = (uint8_t*)rjpeg->buff_data;
   s.img_buffer_original = (uint8_t*)rjpeg->buff_data;
   s.img_buffer_end      = (uint8_t*)rjpeg->buff_data + (int)size;

   j.s                   = &s;

   rjpeg__setup_jpeg(&j);

   /* decode straight to RGBA (4 components) */
   img = (uint32_t*)rjpeg_load_jpeg_image(&j, width, height, &comp, 4);
   if (!img)
      return IMAGE_PROCESS_ERROR;

   size_tex = (*width) * (*height);
   pixels   = (uint32_t*)malloc(size_tex * sizeof(uint32_t));

   if (!pixels)
   {
      free(img);
      return IMAGE_PROCESS_ERROR;
   }

   *buf_data = pixels;

   /* Convert RGBA to ARGB */
   while (size_tex--)
   {
      uint32_t rgba = img[size_tex];
      uint32_t a    = rgba & 0xFF000000;
      uint32_t b    = rgba & 0x00FF0000;
      uint32_t g    = rgba & 0x0000FF00;
      uint32_t r    = rgba & 0x000000FF;

      pixels[size_tex] = a | (r << 16) | g | (b >> 16);
   }

   free(img);

   return IMAGE_PROCESS_END;
}
2590 
/* Attach a caller-owned buffer holding the JPEG stream.  The data is not
 * copied; it must remain valid until processing is done.  Returns false
 * only when rjpeg itself is NULL. */
bool rjpeg_set_buf_ptr(rjpeg_t *rjpeg, void *data)
{
   if (!rjpeg)
      return false;

   rjpeg->buff_data = (uint8_t*)data;
   return true;
}
2600 
/* Release an instance returned by rjpeg_alloc().  Accepts NULL. */
void rjpeg_free(rjpeg_t *rjpeg)
{
   /* free(NULL) is a well-defined no-op, so the old NULL guard
    * was redundant */
   free(rjpeg);
}
2608 
rjpeg_alloc(void)2609 rjpeg_t *rjpeg_alloc(void)
2610 {
2611    rjpeg_t *rjpeg = (rjpeg_t*)calloc(1, sizeof(*rjpeg));
2612    if (!rjpeg)
2613       return NULL;
2614    return rjpeg;
2615 }
2616