1 /* Copyright  (C) 2010-2020 The RetroArch team
2  *
3  * ---------------------------------------------------------------------------------------
4  * The following license statement only applies to this file (rjpeg.c).
5  * ---------------------------------------------------------------------------------------
6  *
7  * Permission is hereby granted, free of charge,
8  * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation the rights to
10  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11  * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16  * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 /* Modified version of stb_image's JPEG sources. */
24 
25 #include <stdint.h>
26 #include <stdarg.h>
27 #include <stddef.h> /* ptrdiff_t on osx */
28 #include <stdlib.h>
29 #include <string.h>
30 
31 #include <retro_assert.h>
32 #include <retro_inline.h>
33 #include <boolean.h>
34 #include <formats/image.h>
35 #include <formats/rjpeg.h>
36 #include <features/features_cpu.h>
37 
/* Pixel component layouts. The numeric value of each layout other
 * than RJPEG_DEFAULT equals its number of channels. */
enum
{
   RJPEG_DEFAULT    = 0, /* only used for req_comp */
   RJPEG_GREY       = 1,
   RJPEG_GREY_ALPHA = 2,
   RJPEG_RGB        = 3,
   RJPEG_RGB_ALPHA  = 4
};
46 
/* Scan modes (how they are consumed is outside this part of the file). */
enum
{
   RJPEG_SCAN_LOAD   = 0,
   RJPEG_SCAN_TYPE   = 1,
   RJPEG_SCAN_HEADER = 2
};
53 
/* Signature shared by all row resamplers: takes an output row, two
 * input rows, a width and a horizontal scale, and returns a row pointer. */
typedef uint8_t *(*rjpeg_resample_row_func)(
      uint8_t *out,
      uint8_t *in0,
      uint8_t *in1,
      int w,
      int hs);

/* Per-component resampling state. */
typedef struct
{
   rjpeg_resample_row_func resample; /* row resampler to call */
   uint8_t *line0;
   uint8_t *line1;
   int hs;      /* expansion factor, horizontal axis */
   int vs;      /* expansion factor, vertical axis */
   int w_lores; /* horizontal pixels pre-expansion */
   int ystep;   /* how far through vertical expansion we are */
   int ypos;    /* which pre-expansion row we're on */
} rjpeg_resample;
67 
/* Decoder handle. Only holds a byte pointer here.
 * NOTE(review): presumably the encoded input buffer -- confirm
 * against the code that fills it in. */
struct rjpeg
{
   uint8_t *buff_data;
};
72 
/* RJPEG_LROT(x, y): rotate the 32-bit unsigned value x left by y bits.
 *
 * MSVC provides the _lrotl intrinsic. The portable fallback masks the
 * right-shift count with (-y & 31) instead of using (32 - y), so that
 * a rotate by 0 does not shift by the full width of the type, which
 * would be undefined behaviour. For y in 1..31 both forms are identical.
 *
 * Both arguments are evaluated more than once; do not pass expressions
 * with side effects. */
#ifdef _MSC_VER
#define RJPEG_HAS_LROTL
#endif

#ifdef RJPEG_HAS_LROTL
   #define RJPEG_LROT(x,y)  _lrotl(x,y)
#else
   #define RJPEG_LROT(x,y)  (((x) << (y)) | ((x) >> (-(y) & 31)))
#endif
82 
/* x86/x64 detection: defines at most one of RJPEG_X64_TARGET or
 * RJPEG_X86_TARGET. */
#if defined(__x86_64__) || defined(_M_X64)
#define RJPEG_X64_TARGET
#elif defined(__i386) || defined(_M_IX86)
#define RJPEG_X86_TARGET
#endif

#if defined(__GNUC__) && (defined(RJPEG_X86_TARGET) || defined(RJPEG_X64_TARGET)) && !defined(__SSE2__) && !defined(RJPEG_NO_SIMD)
/* NOTE: it is not clear whether this is actually needed for the 64-bit path.
 * GCC doesn't support SSE2 intrinsics unless you compile with -msse2,
 * but compiling with -msse2 allows the compiler to use SSE2 everywhere,
 * so SIMD is disabled when the build did not opt in to SSE2.
 * See http://www.virtualdub.org/blog/pivot/entry.php?id=363
 */
#define RJPEG_NO_SIMD
#endif

#if defined(__MINGW32__) && defined(RJPEG_X86_TARGET) && !defined(RJPEG_MINGW_ENABLE_SSE2) && !defined(RJPEG_NO_SIMD)
/* Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid RJPEG_X64_TARGET
 *
 * 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
 * Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
 * As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
 * simultaneously enabling "-mstackrealign".
 *
 * See https://github.com/nothings/stb/issues/81 for more information.
 *
 * So default to no SSE2 on 32-bit MinGW. If you've read this far and added
 * -mstackrealign to your build settings, feel free to #define RJPEG_MINGW_ENABLE_SSE2.
 */
#define RJPEG_NO_SIMD
#endif

/* SSE2 builds: pull in the intrinsics header and define
 * RJPEG_SIMD_ALIGN(type, name) to declare a 16-byte aligned variable. */
#if defined(__SSE2__)
#include <emmintrin.h>

#ifdef _MSC_VER
#define RJPEG_SIMD_ALIGN(type, name) __declspec(align(16)) type name
#else
#define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
#endif

#endif

/* ARM NEON: RJPEG_NO_SIMD overrides a requested RJPEG_NEON. */
#if defined(RJPEG_NO_SIMD) && defined(RJPEG_NEON)
#undef RJPEG_NEON
#endif

#ifdef RJPEG_NEON
#include <arm_neon.h>
/* assume GCC or Clang on ARM targets */
#define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
#endif

/* No SIMD path selected: RJPEG_SIMD_ALIGN is a plain declaration
 * with no alignment requirement. */
#ifndef RJPEG_SIMD_ALIGN
#define RJPEG_SIMD_ALIGN(type, name) type name
#endif
141 
/* Input stream state for the decoder.
 *
 * Only img_buffer and img_buffer_end are used in this part of the
 * file (by rjpeg_get8 and RJPEG_AT_EOF); the remaining fields are
 * managed elsewhere. */
typedef struct
{
   uint8_t *img_buffer;          /* read cursor; advanced by rjpeg_get8 */
   uint8_t *img_buffer_end;      /* reads stop once img_buffer reaches this */
   uint8_t *img_buffer_original; /* NOTE(review): presumably the initial cursor, kept for rewinding -- confirm */
   int      img_n;               /* NOTE(review): presumably component count of the source image -- confirm */
   int      img_out_n;           /* NOTE(review): presumably component count of the output -- confirm */
   int      buflen;
   uint32_t img_x;               /* NOTE(review): presumably image width in pixels -- confirm */
   uint32_t img_y;               /* NOTE(review): presumably image height in pixels -- confirm */
   uint8_t  buffer_start[128];
} rjpeg_context;
154 
rjpeg_get8(rjpeg_context * s)155 static INLINE uint8_t rjpeg_get8(rjpeg_context *s)
156 {
157    if (s->img_buffer < s->img_buffer_end)
158       return *s->img_buffer++;
159 
160    return 0;
161 }
162 
163 #define RJPEG_AT_EOF(s)     ((s)->img_buffer >= (s)->img_buffer_end)
164 
165 #define RJPEG_GET16BE(s)    ((rjpeg_get8((s)) << 8) + rjpeg_get8((s)))
166 
/* huffman decoding acceleration */
#define FAST_BITS   9  /* larger handles more cases; smaller stomps less cache */

/* One Huffman table, as prepared by rjpeg_build_huffman().
 * "Symbol index" below means the position of a symbol in code-length
 * order, i.e. the index into code[], values[] and size[]. */
typedef struct
{
   /* largest code + 1 for each code length, preshifted to 16 bits;
    * the entry after the last length is set to 0xffffffff */
   unsigned int maxcode[18];
   int    delta[17];   /* old 'firstsymbol' - old 'firstcode' */
   /* weirdly, repacking this into AoS is a 10% speed loss, instead of a win */
   uint16_t code[256];            /* code assigned to each symbol index */
   uint8_t  fast[1 << FAST_BITS]; /* top FAST_BITS bits -> symbol index; 255 = not accelerated */
   uint8_t  values[256];          /* decoded value for each symbol index (filled outside rjpeg_build_huffman) */
   uint8_t  size[257];            /* code length for each symbol index, 0-terminated */
} rjpeg_huffman;
180 
/* Complete decoder state: input stream, selected kernels, per-component
 * data, entropy-decoder bit buffer and the Huffman/quantization tables. */
typedef struct
{
   rjpeg_context *s;             /* input byte stream */
   /* kernels */
   void (*idct_block_kernel)(uint8_t *out, int out_stride, short data[64]);
   void (*YCbCr_to_RGB_kernel)(uint8_t *out, const uint8_t *y, const uint8_t *pcb,
         const uint8_t *pcr, int count, int step);
   uint8_t *(*resample_row_hv_2_kernel)(uint8_t *out, uint8_t *in_near,
         uint8_t *in_far, int w, int hs);

   /* definition of jpeg image component */
   struct
   {
      uint8_t *data;
      void *raw_data, *raw_coeff;
      uint8_t *linebuf;
      short   *coeff;            /* progressive only */
      int id;
      int h,v;
      int tq;
      int hd,ha;
      int dc_pred;               /* running DC predictor; updated after each decoded DC value */

      int x,y,w2,h2;
      int      coeff_w;          /* number of 8x8 coefficient blocks */
      int      coeff_h;          /* number of 8x8 coefficient blocks */
   } img_comp[4];

   /* sizes for components, interleaved MCUs */
   int img_h_max, img_v_max;
   int img_mcu_x, img_mcu_y;
   int img_mcu_w, img_mcu_h;

   int            code_bits;     /* number of valid bits */
   int            nomore;        /* flag if we saw a marker so must stop */
   int            progressive;
   int            spec_start;    /* first zigzag index handled by the current progressive scan */
   int            spec_end;      /* last zigzag index handled by the current progressive scan */
   int            succ_high;     /* 0 selects a first scan, non-zero a refinement scan */
   int            succ_low;      /* bit position used to scale values in progressive scans */
   int            eob_run;       /* remaining blocks covered by an end-of-band run */
   int scan_n, order[4];
   int restart_interval, todo;
   uint32_t       code_buffer;   /* jpeg entropy-coded buffer */
   rjpeg_huffman huff_dc[4];     /* unsigned int alignment */
   rjpeg_huffman huff_ac[4];     /* unsigned int alignment */
   int16_t fast_ac[4][1 << FAST_BITS];
   unsigned char  marker;        /* marker seen while filling entropy buffer */
   uint8_t dequant[4][64];
} rjpeg_jpeg;
231 
/* Fixed-point helpers using 12 fractional bits.
 *
 * RJPEG_F2F(x): convert a floating-point constant to fixed point.
 * RJPEG_FSH(x): scale an integer up to fixed point. Written as a
 * multiplication rather than (x) << 12 because the IDCT passes
 * negative values, and left-shifting a negative value is undefined
 * behaviour in C. */
#define RJPEG_F2F(x)  ((int) (((x) * 4096 + 0.5)))
#define RJPEG_FSH(x)  ((x) * 4096)
234 
/* Value meaning "no marker": 0xff is never a valid marker value. */
#define RJPEG_MARKER_NONE  0xff
/* NOTE(review): the comment below describes a marker-fetching routine
 * that is not defined in this part of the file.
 *
 * if there's a pending marker from the entropy stream, return that
 * otherwise, fetch from the stream and get a marker. if there's no
 * marker, return 0xff, which is never a valid marker value
 */

/* in each scan, we'll have scan_n components, and the order
 * of the components is specified by order[]
 */

/* True for the restart markers 0xd0..0xd7. */
#define RJPEG_RESTART(x)     ((x) >= 0xd0 && (x) <= 0xd7)

/* Marker prefix byte and the marker codes referenced by name. */
#define JPEG_MARKER           0xFF
#define JPEG_MARKER_SOI       0xD8
#define JPEG_MARKER_SOS       0xDA
#define JPEG_MARKER_EOI       0xD9
#define JPEG_MARKER_APP1      0xE1
#define JPEG_MARKER_APP2      0xE2

/* use comparisons since in some cases we handle more than one case (e.g. SOF) */
#define RJPEG_SOF(x)               ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)

#define RJPEG_SOF_PROGRESSIVE(x)   ((x) == 0xc2)
/* Divide by 4 / by 16 via right shift, truncating the result to uint8_t. */
#define RJPEG_DIV4(x)              ((uint8_t) ((x) >> 2))
#define RJPEG_DIV16(x)             ((uint8_t) ((x) >> 4))
259 
/* Builds the decoding tables of a Huffman table from its code-length
 * histogram.
 *
 * h     - table to fill (size, code, delta, maxcode and fast are written)
 * count - 16 entries; count[i] is the number of codes of length i+1
 *
 * Returns 1 on success, 0 when the counts describe an impossible
 * table (more than 256 symbols, or more codes of one length than
 * fit in that many bits). */
static int rjpeg_build_huffman(rjpeg_huffman *h, int *count)
{
   int i,j,k = 0,code;

   /* build size list for each symbol (from JPEG spec) */
   for (i = 0; i < 16; ++i)
   {
      for (j = 0; j < count[i]; ++j)
      {
         /* size[] has room for 256 symbols plus the terminator;
          * anything beyond that is a corrupt table */
         if (k >= 256)
            return 0;
         h->size[k++] = (uint8_t) (i+1);
      }
   }

   h->size[k] = 0;
   /* compute actual symbols (from jpeg spec) */
   code       = 0;
   k          = 0;

   for (j = 1; j <= 16; ++j)
   {
      /* compute delta to add to code to compute symbol id */
      h->delta[j] = k - code;
      if (h->size[k] == j)
      {
         while (h->size[k] == j)
            h->code[k++] = (uint16_t) (code++);

         /* Bad code lengths, corrupt JPEG? */
         if (code-1 >= (1 << j))
            return 0;
      }
      /* compute largest code + 1 for this size, preshifted as needed later */
      h->maxcode[j] = code << (16-j);
      code <<= 1;
   }
   /* sentinel: guarantees the length search in the decoder terminates */
   h->maxcode[j] = 0xffffffff;

   /* build non-spec acceleration table; 255 is flag for not-accelerated */
   memset(h->fast, 255, 1 << FAST_BITS);
   for (i = 0; i < k; ++i)
   {
      int s = h->size[i];
      if (s <= FAST_BITS)
      {
         /* every FAST_BITS-wide prefix starting with this code maps to symbol i */
         int c = h->code[i] << (FAST_BITS-s);
         int m = 1 << (FAST_BITS-s);
         for (j = 0; j < m; ++j)
            h->fast[c+j] = (uint8_t) i;
      }
   }
   return 1;
}
308 
309 /* build a table that decodes both magnitude and value of small ACs in
310  * one go. */
rjpeg_build_fast_ac(int16_t * fast_ac,rjpeg_huffman * h)311 static void rjpeg_build_fast_ac(int16_t *fast_ac, rjpeg_huffman *h)
312 {
313    int i;
314 
315    for (i = 0; i < (1 << FAST_BITS); ++i)
316    {
317       uint8_t fast = h->fast[i];
318 
319       fast_ac[i] = 0;
320 
321       if (fast < 255)
322       {
323          int rs      = h->values[fast];
324          int run     = (rs >> 4) & 15;
325          int magbits = rs & 15;
326          int len     = h->size[fast];
327 
328          if (magbits && len + magbits <= FAST_BITS)
329          {
330             /* magnitude code followed by receive_extend code */
331             int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
332             int m = 1 << (magbits - 1);
333             if (k < m)
334                k += (-1 << magbits) + 1;
335 
336             /* if the result is small enough, we can fit it in fast_ac table */
337             if (k >= -128 && k <= 127)
338                fast_ac[i] = (int16_t) ((k << 8) + (run << 4) + (len + magbits));
339          }
340       }
341    }
342 }
343 
/* Refills the entropy bit buffer one byte at a time until it holds
 * more than 24 valid bits.
 *
 * A 0xff byte followed by a non-zero byte is a marker: it is stored
 * in j->marker, j->nomore is set and refilling stops. After that
 * (nomore set) the buffer is padded with zero bytes instead of
 * reading further input. A 0xff followed by 0 is a stuffed data
 * byte 0xff. */
static void rjpeg_grow_buffer_unsafe(rjpeg_jpeg *j)
{
   do
   {
      /* unsigned so that placing the byte in the top bits of the
       * 32-bit buffer cannot overflow a signed int */
      unsigned int b = j->nomore ? 0 : rjpeg_get8(j->s);
      if (b == 0xff)
      {
         int c = rjpeg_get8(j->s);

         if (c != 0)
         {
            j->marker = (unsigned char) c;
            j->nomore = 1;
            return;
         }
      }
      j->code_buffer |= b << (24 - j->code_bits);
      j->code_bits   += 8;
   } while (j->code_bits <= 24);
}
364 
/* rjpeg_bmask[n] = (1 << n) - 1, i.e. a mask of the n lowest bits,
 * for n = 0..16. Read-only lookup table, hence const. */
static const uint32_t rjpeg_bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
367 
368 /* decode a JPEG huffman value from the bitstream */
rjpeg_jpeg_huff_decode(rjpeg_jpeg * j,rjpeg_huffman * h)369 static INLINE int rjpeg_jpeg_huff_decode(rjpeg_jpeg *j, rjpeg_huffman *h)
370 {
371    unsigned int temp;
372    int c,k;
373 
374    if (j->code_bits < 16)
375       rjpeg_grow_buffer_unsafe(j);
376 
377    /* look at the top FAST_BITS and determine what symbol ID it is,
378     * if the code is <= FAST_BITS */
379    c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
380    k = h->fast[c];
381 
382    if (k < 255)
383    {
384       int s = h->size[k];
385       if (s > j->code_bits)
386          return -1;
387       j->code_buffer <<= s;
388       j->code_bits -= s;
389       return h->values[k];
390    }
391 
392    /* naive test is to shift the code_buffer down so k bits are
393     * valid, then test against maxcode. To speed this up, we've
394     * preshifted maxcode left so that it has (16-k) 0s at the
395     * end; in other words, regardless of the number of bits, it
396     * wants to be compared against something shifted to have 16;
397     * that way we don't need to shift inside the loop. */
398    temp = j->code_buffer >> 16;
399    for (k=FAST_BITS+1 ; ; ++k)
400       if (temp < h->maxcode[k])
401          break;
402 
403    if (k == 17)
404    {
405       /* error! code not found */
406       j->code_bits -= 16;
407       return -1;
408    }
409 
410    if (k > j->code_bits)
411       return -1;
412 
413    /* convert the huffman code to the symbol id */
414    c = ((j->code_buffer >> (32 - k)) & rjpeg_bmask[k]) + h->delta[k];
415    retro_assert((((j->code_buffer) >> (32 - h->size[c])) & rjpeg_bmask[h->size[c]]) == h->code[c]);
416 
417    /* convert the id to a symbol */
418    j->code_bits -= k;
419    j->code_buffer <<= k;
420    return h->values[c];
421 }
422 
/* rjpeg_jbias[n] = 1 - 2^n: the offset applied to an n-bit value
 * whose leading bit is clear. */
static const int rjpeg_jbias[16] =
{
       0,     -1,     -3,     -7,
     -15,    -31,    -63,   -127,
    -255,   -511,  -1023,  -2047,
   -4095,  -8191, -16383, -32767
};
425 
426 /* combined JPEG 'receive' and JPEG 'extend', since baseline
427  * always extends everything it receives. */
rjpeg_extend_receive(rjpeg_jpeg * j,int n)428 static INLINE int rjpeg_extend_receive(rjpeg_jpeg *j, int n)
429 {
430    unsigned int k;
431    int sgn;
432    if (j->code_bits < n)
433       rjpeg_grow_buffer_unsafe(j);
434 
435    sgn = (int32_t)j->code_buffer >> 31; /* sign bit is always in MSB */
436    k = RJPEG_LROT(j->code_buffer, n);
437    retro_assert(n >= 0 && n < (int) (sizeof(rjpeg_bmask)/sizeof(*rjpeg_bmask)));
438    j->code_buffer  = k & ~rjpeg_bmask[n];
439    k              &= rjpeg_bmask[n];
440    j->code_bits   -= n;
441    return k + (rjpeg_jbias[n] & ~sgn);
442 }
443 
444 /* get some unsigned bits */
rjpeg_jpeg_get_bits(rjpeg_jpeg * j,int n)445 static INLINE int rjpeg_jpeg_get_bits(rjpeg_jpeg *j, int n)
446 {
447    unsigned int k;
448    if (j->code_bits < n)
449       rjpeg_grow_buffer_unsafe(j);
450    k              = RJPEG_LROT(j->code_buffer, n);
451    j->code_buffer = k & ~rjpeg_bmask[n];
452    k             &= rjpeg_bmask[n];
453    j->code_bits  -= n;
454    return k;
455 }
456 
rjpeg_jpeg_get_bit(rjpeg_jpeg * j)457 static INLINE int rjpeg_jpeg_get_bit(rjpeg_jpeg *j)
458 {
459    unsigned int k;
460    if (j->code_bits < 1)
461       rjpeg_grow_buffer_unsafe(j);
462 
463    k                = j->code_buffer;
464    j->code_buffer <<= 1;
465    --j->code_bits;
466    return k & 0x80000000;
467 }
468 
/* given a value that's at position X in the zigzag stream,
 * where does it appear in the 8x8 matrix coded as row-major?
 *
 * The table is read-only, hence const. It carries 15 extra entries
 * (all 63) because a run length of up to 15 is added to the position
 * before it is used as an index. */
static const uint8_t rjpeg_jpeg_dezigzag[64+15] =
{
    0,  1,  8, 16,  9,  2,  3, 10,
   17, 24, 32, 25, 18, 11,  4,  5,
   12, 19, 26, 33, 40, 48, 41, 34,
   27, 20, 13,  6,  7, 14, 21, 28,
   35, 42, 49, 56, 57, 50, 43, 36,
   29, 22, 15, 23, 30, 37, 44, 51,
   58, 59, 52, 45, 38, 31, 39, 46,
   53, 60, 61, 54, 47, 55, 62, 63,
   /* let corrupt input sample past end */
   63, 63, 63, 63, 63, 63, 63, 63,
   63, 63, 63, 63, 63, 63, 63
};
485 
486 /* decode one 64-entry block-- */
rjpeg_jpeg_decode_block(rjpeg_jpeg * j,short data[64],rjpeg_huffman * hdc,rjpeg_huffman * hac,int16_t * fac,int b,uint8_t * dequant)487 static int rjpeg_jpeg_decode_block(
488       rjpeg_jpeg *j, short data[64],
489       rjpeg_huffman *hdc,
490       rjpeg_huffman *hac,
491       int16_t *fac,
492       int b,
493       uint8_t *dequant)
494 {
495    int dc,k;
496    int t;
497    int diff      = 0;
498 
499    if (j->code_bits < 16)
500       rjpeg_grow_buffer_unsafe(j);
501    t = rjpeg_jpeg_huff_decode(j, hdc);
502 
503    /* Bad huffman code. Corrupt JPEG? */
504    if (t < 0)
505       return 0;
506 
507    /* 0 all the ac values now so we can do it 32-bits at a time */
508    memset(data,0,64*sizeof(data[0]));
509 
510    if (t)
511       diff                = rjpeg_extend_receive(j, t);
512    dc                     = j->img_comp[b].dc_pred + diff;
513    j->img_comp[b].dc_pred = dc;
514    data[0]                = (short) (dc * dequant[0]);
515 
516    /* decode AC components, see JPEG spec */
517    k                      = 1;
518    do
519    {
520       unsigned int zig;
521       int c,r,s;
522       if (j->code_bits < 16)
523          rjpeg_grow_buffer_unsafe(j);
524       c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
525       r = fac[c];
526       if (r)
527       {
528          /* fast-AC path */
529          k               += (r >> 4) & 15; /* run */
530          s                = r & 15; /* combined length */
531          j->code_buffer <<= s;
532          j->code_bits    -= s;
533          /* decode into unzigzag'd location */
534          zig              = rjpeg_jpeg_dezigzag[k++];
535          data[zig]        = (short) ((r >> 8) * dequant[zig]);
536       }
537       else
538       {
539          int rs = rjpeg_jpeg_huff_decode(j, hac);
540 
541          /* Bad huffman code. Corrupt JPEG? */
542          if (rs < 0)
543             return 0;
544 
545          s = rs & 15;
546          r = rs >> 4;
547          if (s == 0)
548          {
549             if (rs != 0xf0)
550                break; /* end block */
551             k += 16;
552          }
553          else
554          {
555             k += r;
556             /* decode into unzigzag'd location */
557             zig = rjpeg_jpeg_dezigzag[k++];
558             data[zig] = (short) (rjpeg_extend_receive(j,s) * dequant[zig]);
559          }
560       }
561    } while (k < 64);
562    return 1;
563 }
564 
/* Decodes the DC coefficient of one block in a progressive scan.
 *
 * j    - decoder state
 * data - the block's 64 coefficients; only data[0] is produced here
 *        (a first scan also clears the other 63)
 * hdc  - Huffman table for the DC coefficient
 * b    - index into j->img_comp[], used for the DC predictor
 *
 * Returns 1 on success, 0 if the scan also covers AC coefficients
 * or a bad Huffman code is found (corrupt JPEG). */
static int rjpeg_jpeg_decode_block_prog_dc(
      rjpeg_jpeg *j,
      short data[64],
      rjpeg_huffman *hdc,
      int b)
{
   /* Can't merge DC and AC. Corrupt JPEG? */
   if (j->spec_end != 0)
      return 0;

   if (j->code_bits < 16)
      rjpeg_grow_buffer_unsafe(j);

   if (j->succ_high == 0)
   {
      int t;
      int dc;
      int diff = 0;

      /* first scan for DC coefficient, must be first */
      memset(data,0,64*sizeof(data[0])); /* 0 all the ac values now */
      t       = rjpeg_jpeg_huff_decode(j, hdc);

      /* Bad huffman code. Corrupt JPEG?
       * A negative t is a decode failure; above 15 it would index
       * past the end of the bit-mask/bias tables. */
      if (t < 0 || t > 15)
         return 0;

      if (t)
         diff = rjpeg_extend_receive(j, t);

      dc      = j->img_comp[b].dc_pred + diff;
      j->img_comp[b].dc_pred = dc;
      /* multiply rather than shift: dc may be negative, and
       * left-shifting a negative value is undefined behaviour */
      data[0] = (short) (dc * (1 << j->succ_low));
   }
   else
   {
      /* refinement scan for DC coefficient */
      if (rjpeg_jpeg_get_bit(j))
         data[0] += (short) (1 << j->succ_low);
   }
   return 1;
}
602 
/* Decodes the AC coefficients spec_start..spec_end of one block in
 * a progressive scan.
 *
 * j    - decoder state (spec_start, spec_end, succ_high, succ_low
 *        and eob_run drive the decoding)
 * data - the block's 64 coefficients, updated in place
 * hac  - Huffman table for the AC coefficients
 * fac  - fast-AC lookup table built from hac
 *
 * Returns 1 on success, 0 if the scan includes the DC coefficient
 * or a bad Huffman code is found (corrupt JPEG). */
static int rjpeg_jpeg_decode_block_prog_ac(
      rjpeg_jpeg *j,
      short data[64],
      rjpeg_huffman *hac,
      int16_t *fac)
{
   int k;

   /* Can't merge DC and AC. Corrupt JPEG? */
   if (j->spec_start == 0)
      return 0;

   if (j->succ_high == 0)
   {
      /* first scan for these AC coefficients */
      int shift = j->succ_low;

      /* inside an end-of-band run: this block has nothing to decode */
      if (j->eob_run)
      {
         --j->eob_run;
         return 1;
      }

      k = j->spec_start;
      do
      {
         unsigned int zig;
         int c,r,s;
         if (j->code_bits < 16)
            rjpeg_grow_buffer_unsafe(j);
         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
         r = fac[c];
         if (r)
         {
            /* fast-AC path */
            k               += (r >> 4) & 15; /* run */
            s                = r & 15; /* combined length */
            j->code_buffer <<= s;
            j->code_bits    -= s;
            zig              = rjpeg_jpeg_dezigzag[k++];
            /* scale by multiplying: the value may be negative, and
             * left-shifting a negative value is undefined behaviour */
            data[zig]        = (short) ((r >> 8) * (1 << shift));
         }
         else
         {
            int rs = rjpeg_jpeg_huff_decode(j, hac);

            /* Bad huffman code. Corrupt JPEG? */
            if (rs < 0)
               return 0;

            s = rs & 15;
            r = rs >> 4;
            if (s == 0)
            {
               if (r < 15)
               {
                  /* end-of-band run of (1 << r) + extra bits blocks,
                   * counting this one */
                  j->eob_run = (1 << r);
                  if (r)
                     j->eob_run += rjpeg_jpeg_get_bits(j, r);
                  --j->eob_run;
                  break;
               }
               k += 16;
            }
            else
            {
               k         += r;
               zig        = rjpeg_jpeg_dezigzag[k++];
               /* multiply instead of shifting, see fast-AC path above */
               data[zig]  = (short) (rjpeg_extend_receive(j,s) * (1 << shift));
            }
         }
      } while (k <= j->spec_end);
   }
   else
   {
      /* refinement scan for these AC coefficients */

      short bit = (short) (1 << j->succ_low);

      if (j->eob_run)
      {
         /* inside an end-of-band run: only already non-zero
          * coefficients receive a correction bit */
         --j->eob_run;
         for (k = j->spec_start; k <= j->spec_end; ++k)
         {
            short *p = &data[rjpeg_jpeg_dezigzag[k]];
            if (*p != 0)
               if (rjpeg_jpeg_get_bit(j))
                  if ((*p & bit) == 0)
                  {
                     if (*p > 0)
                        *p += bit;
                     else
                        *p -= bit;
                  }
         }
      }
      else
      {
         k = j->spec_start;
         do
         {
            int r,s;
            int rs = rjpeg_jpeg_huff_decode(j, hac);

            /* Bad huffman code. Corrupt JPEG? */
            if (rs < 0)
               return 0;

            s = rs & 15;
            r = rs >> 4;
            if (s == 0)
            {
               if (r < 15)
               {
                  j->eob_run = (1 << r) - 1;
                  if (r)
                     j->eob_run += rjpeg_jpeg_get_bits(j, r);
                  r = 64; /* force end of block */
               }
               else
               {
                  /* r=15 s=0 should write 16 0s, so we just do
                   * a run of 15 0s and then write s (which is 0),
                   * so we don't have to do anything special here */
               }
            }
            else
            {
               /* Bad huffman code. Corrupt JPEG? */
               if (s != 1)
                  return 0;

               /* sign bit */
               if (rjpeg_jpeg_get_bit(j))
                  s = bit;
               else
                  s = -bit;
            }

            /* advance by r: skip r zero-valued coefficients, refining
             * every non-zero coefficient passed on the way, then
             * store the new value s in the next zero slot */
            while (k <= j->spec_end)
            {
               short *p = &data[rjpeg_jpeg_dezigzag[k++]];
               if (*p != 0)
               {
                  if (rjpeg_jpeg_get_bit(j))
                     if ((*p & bit) == 0)
                     {
                        if (*p > 0)
                           *p += bit;
                        else
                           *p -= bit;
                     }
               }
               else
               {
                  if (r == 0)
                  {
                     *p = (short) s;
                     break;
                  }
                  --r;
               }
            }
         } while (k <= j->spec_end);
      }
   }
   return 1;
}
771 
/* Clamps x to the range 0..255: values below 0 become 0, values
 * above 255 become 255, everything else is returned unchanged. */
static INLINE uint8_t rjpeg_clamp(int x)
{
   /* trick to use a single test to catch both cases: a negative x
    * becomes a huge unsigned value, so the common in-range case
    * costs only one comparison */
   if ((unsigned int) x > 255)
      return (x < 0) ? 0 : 255;
   return (uint8_t) x;
}
780 
/* derived from jidctint -- DCT_ISLOW
 *
 * One 1-D inverse DCT over the eight inputs s0..s7.
 *
 * The macro DECLARES the locals t0..t3, p1..p5 and x0..x3 in the
 * enclosing scope, so it can be expanded at most once per block and
 * must come first among that block's statements in C89.
 *
 * On exit x0..x3 hold the even part and t0..t3 the odd part, both
 * scaled up by 1 << 12; the caller combines them as x +/- t, adds
 * its rounding bias and shifts back down. */
#define RJPEG_IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
   int t0,t1,p4,p5,x0,x1,x2,x3; \
   int p2 = s2;                                \
   int p3 = s6;                                \
   int p1 = (p2+p3) * RJPEG_F2F(0.5411961f);   \
   int t2 = p1 + p3 * RJPEG_F2F(-1.847759065f);\
   int t3 = p1 + p2 * RJPEG_F2F( 0.765366865f);\
   p2 = s0;                                    \
   p3 = s4;                                    \
   t0 = RJPEG_FSH(p2+p3);                      \
   t1 = RJPEG_FSH(p2-p3);                      \
   x0 = t0+t3;                                 \
   x3 = t0-t3;                                 \
   x1 = t1+t2;                                 \
   x2 = t1-t2;                                 \
   t0 = s7;                                    \
   t1 = s5;                                    \
   t2 = s3;                                    \
   t3 = s1;                                    \
   p3 = t0+t2;                                 \
   p4 = t1+t3;                                 \
   p1 = t0+t3;                                 \
   p2 = t1+t2;                                 \
   p5 = (p3+p4) * RJPEG_F2F( 1.175875602f);    \
   t0 = t0      * RJPEG_F2F( 0.298631336f);    \
   t1 = t1      * RJPEG_F2F( 2.053119869f);    \
   t2 = t2      * RJPEG_F2F( 3.072711026f);    \
   t3 = t3      * RJPEG_F2F( 1.501321110f);    \
   p1 = p5 + p1 * RJPEG_F2F(-0.899976223f);    \
   p2 = p5 + p2 * RJPEG_F2F(-2.562915447f);    \
   p3 = p3      * RJPEG_F2F(-1.961570560f);    \
   p4 = p4      * RJPEG_F2F(-0.390180644f);    \
   t3 += p1+p4;                                \
   t2 += p2+p3;                                \
   t1 += p2+p4;                                \
   t0 += p1+p3
818 
rjpeg_idct_block(uint8_t * out,int out_stride,short data[64])819 static void rjpeg_idct_block(uint8_t *out, int out_stride, short data[64])
820 {
821    int i,val[64],*v=val;
822    uint8_t   *o = NULL;
823    int16_t   *d = data;
824 
825    /* columns */
826    for (i = 0; i < 8; ++i,++d, ++v)
827    {
828       /* if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing */
829       if (     d[ 8] == 0
830             && d[16] == 0
831             && d[24] == 0
832             && d[32] == 0
833             && d[40] == 0
834             && d[48] == 0
835             && d[56] == 0)
836       {
837          /*    no shortcut                 0     seconds
838           *    (1|2|3|4|5|6|7)==0          0     seconds
839           *    all separate               -0.047 seconds
840           *    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds */
841          int dcterm = d[0] << 2;
842          v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
843       }
844       else
845       {
846          RJPEG_IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56]);
847 
848          /* constants scaled things up by 1<<12; let's bring them back
849           * down, but keep 2 extra bits of precision */
850          x0 += 512;
851          x1 += 512;
852          x2 += 512;
853          x3 += 512;
854 
855          v[ 0] = (x0+t3) >> 10;
856          v[56] = (x0-t3) >> 10;
857          v[ 8] = (x1+t2) >> 10;
858          v[48] = (x1-t2) >> 10;
859          v[16] = (x2+t1) >> 10;
860          v[40] = (x2-t1) >> 10;
861          v[24] = (x3+t0) >> 10;
862          v[32] = (x3-t0) >> 10;
863       }
864    }
865 
866    for (i = 0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride)
867    {
868       /* no fast case since the first 1D IDCT spread components out */
869       RJPEG_IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7]);
870 
871       /* constants scaled things up by 1<<12, plus we had 1<<2 from first
872        * loop, plus horizontal and vertical each scale by sqrt(8) so together
873        * we've got an extra 1<<3, so 1<<17 total we need to remove.
874        * so we want to round that, which means adding 0.5 * 1<<17,
875        * aka 65536. Also, we'll end up with -128 to 127 that we want
876        * to encode as 0..255 by adding 128, so we'll add that before the shift
877        */
878       x0 += 65536 + (128<<17);
879       x1 += 65536 + (128<<17);
880       x2 += 65536 + (128<<17);
881       x3 += 65536 + (128<<17);
882 
883       /* Tried computing the shifts into temps, or'ing the temps to see
884        * if any were out of range, but that was slower */
885       o[0] = rjpeg_clamp((x0+t3) >> 17);
886       o[7] = rjpeg_clamp((x0-t3) >> 17);
887       o[1] = rjpeg_clamp((x1+t2) >> 17);
888       o[6] = rjpeg_clamp((x1-t2) >> 17);
889       o[2] = rjpeg_clamp((x2+t1) >> 17);
890       o[5] = rjpeg_clamp((x2-t1) >> 17);
891       o[3] = rjpeg_clamp((x3+t0) >> 17);
892       o[4] = rjpeg_clamp((x3-t0) >> 17);
893    }
894 }
895 
896 #if defined(__SSE2__)
/* SSE2 integer IDCT. Not the fastest possible implementation, but it
 * produces bit-identical results to the generic C version, so it is
 * fully "transparent".
 *
 * out        : destination; eight rows of 8 bytes each are written.
 * out_stride : distance, in bytes, between the start of consecutive
 *              output rows.
 * data       : the 64 16-bit coefficients of one 8x8 block. They are
 *              fetched with _mm_load_si128, so the pointer has to be
 *              16-byte aligned.
 */
static void rjpeg_idct_simd(uint8_t *out, int out_stride, short data[64])
{
   /* This is constructed to match our regular (generic) integer IDCT exactly. */
   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
   /* scratch register used by the dct_interleave8/16 macros */
   __m128i tmp;

   /* dot product constant: even elems=x, odd elems=y */
   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))

   /* out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
    * out(1) = c1[even]*x + c1[odd]*y
    *
    * Declares six new __m128i variables: <c0>lo, <c0>hi, <out0>_l, <out0>_h,
    * <out1>_l and <out1>_h. The names are pasted from the macro arguments,
    * so every use inside one scope needs distinct c0/out0/out1 arguments.
    */
   #define dct_rot(out0,out1, x,y,c0,c1) \
      __m128i c0##lo   = _mm_unpacklo_epi16((x),(y)); \
      __m128i c0##hi   = _mm_unpackhi_epi16((x),(y)); \
      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)

   /* out = in << 12  (in 16-bit, out 32-bit)
    * Unpacking with zero in the low half places "in" in the upper 16 bits
    * of each 32-bit lane (in << 16); the arithmetic shift right by 4 then
    * yields the sign-correct in << 12. */
   #define dct_widen(out, in) \
      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)

   /* wide add: declares out_l/out_h as the lane-wise 32-bit sum of a and b */
   #define dct_wadd(out, a, b) \
      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)

   /* wide sub: declares out_l/out_h as the lane-wise 32-bit difference a - b */
   #define dct_wsub(out, a, b) \
      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)

   /* butterfly a/b, add bias, then shift by "s" and pack.
    * out0 receives (a + bias + b) >> s, out1 receives (a + bias - b) >> s;
    * _mm_packs_epi32 narrows both to 16 bits with signed saturation. */
   #define dct_bfly32o(out0, out1, a,b,bias,s) \
      { \
         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
         dct_wadd(sum, abiased, b); \
         dct_wsub(dif, abiased, b); \
         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
      }

   /* 8-bit interleave step (for transposes); clobbers tmp */
   #define dct_interleave8(a, b) \
      tmp = a; \
      a = _mm_unpacklo_epi8(a, b); \
      b = _mm_unpackhi_epi8(tmp, b)

   /* 16-bit interleave step (for transposes); clobbers tmp */
   #define dct_interleave16(a, b) \
      tmp = a; \
      a = _mm_unpacklo_epi16(a, b); \
      b = _mm_unpackhi_epi16(tmp, b)

   /* One 1-D IDCT pass over row0..row7, processing all eight lanes at once.
    * The results are written back into row0..row7. The body is wrapped in
    * braces so the temporaries it declares do not clash between the two
    * invocations below. */
   #define dct_pass(bias,shift) \
      { \
         /* even part */ \
         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
         __m128i sum04 = _mm_add_epi16(row0, row4); \
         __m128i dif04 = _mm_sub_epi16(row0, row4); \
         dct_widen(t0e, sum04); \
         dct_widen(t1e, dif04); \
         dct_wadd(x0, t0e, t3e); \
         dct_wsub(x3, t0e, t3e); \
         dct_wadd(x1, t1e, t2e); \
         dct_wsub(x2, t1e, t2e); \
         /* odd part */ \
         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
         __m128i sum17 = _mm_add_epi16(row1, row7); \
         __m128i sum35 = _mm_add_epi16(row3, row5); \
         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
         dct_wadd(x4, y0o, y4o); \
         dct_wadd(x5, y1o, y5o); \
         dct_wadd(x6, y2o, y5o); \
         dct_wadd(x7, y3o, y4o); \
         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
      }

   /* Coefficient pairs consumed by dct_rot: each register alternates the
    * multiplier for the first operand (even lanes) and the second operand
    * (odd lanes). */
   __m128i rot0_0 = dct_const(RJPEG_F2F(0.5411961f), RJPEG_F2F(0.5411961f) + RJPEG_F2F(-1.847759065f));
   __m128i rot0_1 = dct_const(RJPEG_F2F(0.5411961f) + RJPEG_F2F( 0.765366865f), RJPEG_F2F(0.5411961f));
   __m128i rot1_0 = dct_const(RJPEG_F2F(1.175875602f) + RJPEG_F2F(-0.899976223f), RJPEG_F2F(1.175875602f));
   __m128i rot1_1 = dct_const(RJPEG_F2F(1.175875602f), RJPEG_F2F(1.175875602f) + RJPEG_F2F(-2.562915447f));
   __m128i rot2_0 = dct_const(RJPEG_F2F(-1.961570560f) + RJPEG_F2F( 0.298631336f), RJPEG_F2F(-1.961570560f));
   __m128i rot2_1 = dct_const(RJPEG_F2F(-1.961570560f), RJPEG_F2F(-1.961570560f) + RJPEG_F2F( 3.072711026f));
   __m128i rot3_0 = dct_const(RJPEG_F2F(-0.390180644f) + RJPEG_F2F( 2.053119869f), RJPEG_F2F(-0.390180644f));
   __m128i rot3_1 = dct_const(RJPEG_F2F(-0.390180644f), RJPEG_F2F(-0.390180644f) + RJPEG_F2F( 1.501321110f));

   /* rounding biases in column/row passes, see rjpeg_idct_block for explanation.
    * 512 is half of 1 << 10 (the column-pass shift); 65536 is half of 1 << 17
    * (the row-pass shift) and 128 << 17 becomes +128 after that shift. */
   __m128i bias_0 = _mm_set1_epi32(512);
   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));

   /* load (aligned loads: one register per row of eight coefficients) */
   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
   row7 = _mm_load_si128((const __m128i *) (data + 7*8));

   /* column pass */
   dct_pass(bias_0, 10);

   {
      /* 16bit 8x8 transpose pass 1 */
      dct_interleave16(row0, row4);
      dct_interleave16(row1, row5);
      dct_interleave16(row2, row6);
      dct_interleave16(row3, row7);

      /* transpose pass 2 */
      dct_interleave16(row0, row2);
      dct_interleave16(row1, row3);
      dct_interleave16(row4, row6);
      dct_interleave16(row5, row7);

      /* transpose pass 3 */
      dct_interleave16(row0, row1);
      dct_interleave16(row2, row3);
      dct_interleave16(row4, row5);
      dct_interleave16(row6, row7);
   }

   /* row pass */
   dct_pass(bias_1, 17);

   {
      /* pack: _mm_packus_epi16 narrows to 8 bits with unsigned saturation,
       * which performs the clamp to 0..255 */
      __m128i p0 = _mm_packus_epi16(row0, row1); /* a0a1a2a3...a7b0b1b2b3...b7 */
      __m128i p1 = _mm_packus_epi16(row2, row3);
      __m128i p2 = _mm_packus_epi16(row4, row5);
      __m128i p3 = _mm_packus_epi16(row6, row7);

      /* 8bit 8x8 transpose pass 1 */
      dct_interleave8(p0, p2); /* a0e0a1e1... */
      dct_interleave8(p1, p3); /* c0g0c1g1... */

      /* transpose pass 2 */
      dct_interleave8(p0, p1); /* a0c0e0g0... */
      dct_interleave8(p2, p3); /* b0d0f0h0... */

      /* transpose pass 3 */
      dct_interleave8(p0, p2); /* a0b0c0d0... */
      dct_interleave8(p1, p3); /* a4b4c4d4... */

      /* store: every register now carries two 8-byte output rows.
       * _mm_storel_epi64 writes the low 8 bytes; the 0x4e shuffle swaps the
       * two 64-bit halves so the upper row can be written the same way. */
      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
   }

/* the helper macros are local to this function */
#undef dct_const
#undef dct_rot
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_interleave8
#undef dct_interleave16
#undef dct_pass
}
1076 
1077 #endif
1078 
1079 #ifdef RJPEG_NEON
1080 
/* NEON integer IDCT. Should produce bit-identical
 * results to the generic C version.
 *
 * out        : destination; eight rows of 8 bytes each are written.
 * out_stride : distance, in bytes, between the start of consecutive
 *              output rows.
 * data       : the 64 16-bit coefficients of one 8x8 block.
 */
static void rjpeg_idct_simd(uint8_t *out, int out_stride, short data[64])
{
   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;

   /* fixed-point rotation constants, broadcast to all four lanes */
   int16x4_t rot0_0 = vdup_n_s16(RJPEG_F2F(0.5411961f));
   int16x4_t rot0_1 = vdup_n_s16(RJPEG_F2F(-1.847759065f));
   int16x4_t rot0_2 = vdup_n_s16(RJPEG_F2F( 0.765366865f));
   int16x4_t rot1_0 = vdup_n_s16(RJPEG_F2F( 1.175875602f));
   int16x4_t rot1_1 = vdup_n_s16(RJPEG_F2F(-0.899976223f));
   int16x4_t rot1_2 = vdup_n_s16(RJPEG_F2F(-2.562915447f));
   int16x4_t rot2_0 = vdup_n_s16(RJPEG_F2F(-1.961570560f));
   int16x4_t rot2_1 = vdup_n_s16(RJPEG_F2F(-0.390180644f));
   int16x4_t rot3_0 = vdup_n_s16(RJPEG_F2F( 0.298631336f));
   int16x4_t rot3_1 = vdup_n_s16(RJPEG_F2F( 2.053119869f));
   int16x4_t rot3_2 = vdup_n_s16(RJPEG_F2F( 3.072711026f));
   int16x4_t rot3_3 = vdup_n_s16(RJPEG_F2F( 1.501321110f));

/* out = inq * coeff  (inq/coeff 16-bit, out 32-bit); declares out_l/out_h */
#define dct_long_mul(out, inq, coeff) \
   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)

/* out = acc + inq * coeff  (32-bit accumulate); declares out_l/out_h */
#define dct_long_mac(out, acc, inq, coeff) \
   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)

/* out = inq << 12  (inq 16-bit, out 32-bit); declares out_l/out_h */
#define dct_widen(out, inq) \
   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)

/* wide add */
#define dct_wadd(out, a, b) \
   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)

/* wide sub */
#define dct_wsub(out, a, b) \
   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)

/* butterfly a/b, then shift using "shiftop" by "s" and pack.
 * out0 receives (a + b), out1 receives (a - b), each narrowed to 16 bits
 * by the supplied shift-and-narrow intrinsic. */
#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
   { \
      dct_wadd(sum, a, b); \
      dct_wsub(dif, a, b); \
      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
   }

/* One 1-D IDCT pass over row0..row7, processing all eight lanes at once.
 * The results are written back into row0..row7. The body is wrapped in
 * braces so the temporaries it declares do not clash between the two
 * invocations below. */
#define dct_pass(shiftop, shift) \
   { \
      /* even part */ \
      int16x8_t sum26 = vaddq_s16(row2, row6); \
      dct_long_mul(p1e, sum26, rot0_0); \
      dct_long_mac(t2e, p1e, row6, rot0_1); \
      dct_long_mac(t3e, p1e, row2, rot0_2); \
      int16x8_t sum04 = vaddq_s16(row0, row4); \
      int16x8_t dif04 = vsubq_s16(row0, row4); \
      dct_widen(t0e, sum04); \
      dct_widen(t1e, dif04); \
      dct_wadd(x0, t0e, t3e); \
      dct_wsub(x3, t0e, t3e); \
      dct_wadd(x1, t1e, t2e); \
      dct_wsub(x2, t1e, t2e); \
      /* odd part */ \
      int16x8_t sum15 = vaddq_s16(row1, row5); \
      int16x8_t sum17 = vaddq_s16(row1, row7); \
      int16x8_t sum35 = vaddq_s16(row3, row5); \
      int16x8_t sum37 = vaddq_s16(row3, row7); \
      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
      dct_long_mul(p5o, sumodd, rot1_0); \
      dct_long_mac(p1o, p5o, sum17, rot1_1); \
      dct_long_mac(p2o, p5o, sum35, rot1_2); \
      dct_long_mul(p3o, sum37, rot2_0); \
      dct_long_mul(p4o, sum15, rot2_1); \
      dct_wadd(sump13o, p1o, p3o); \
      dct_wadd(sump24o, p2o, p4o); \
      dct_wadd(sump23o, p2o, p3o); \
      dct_wadd(sump14o, p1o, p4o); \
      dct_long_mac(x4, sump13o, row7, rot3_0); \
      dct_long_mac(x5, sump24o, row5, rot3_1); \
      dct_long_mac(x6, sump23o, row3, rot3_2); \
      dct_long_mac(x7, sump14o, row1, rot3_3); \
      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
   }

   /* load (one register per row of eight coefficients) */
   row0 = vld1q_s16(data + 0*8);
   row1 = vld1q_s16(data + 1*8);
   row2 = vld1q_s16(data + 2*8);
   row3 = vld1q_s16(data + 3*8);
   row4 = vld1q_s16(data + 4*8);
   row5 = vld1q_s16(data + 5*8);
   row6 = vld1q_s16(data + 6*8);
   row7 = vld1q_s16(data + 7*8);

   /* add DC bias: only lane 0 of row0 (the DC coefficient) is touched.
    * The column pass leaves (1024 << 12) >> 10 = 4096 in every element of
    * column 0, and the row pass turns that into (4096 << 12) >> 17 = 128
    * in every output sample. */
   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));

   /* column pass (rounding shift right by 10, narrowing to 16 bits) */
   dct_pass(vrshrn_n_s32, 10);

   /* 16bit 8x8 transpose */
   {
/* these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
 * whether compilers actually get this is another story, sadly. */
#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }

      /* pass 1 */
      dct_trn16(row0, row1); /* a0b0a2b2a4b4a6b6 */
      dct_trn16(row2, row3);
      dct_trn16(row4, row5);
      dct_trn16(row6, row7);

      /* pass 2 */
      dct_trn32(row0, row2); /* a0b0c0d0a4b4c4d4 */
      dct_trn32(row1, row3);
      dct_trn32(row4, row6);
      dct_trn32(row5, row7);

      /* pass 3 */
      dct_trn64(row0, row4); /* a0b0c0d0e0f0g0h0 */
      dct_trn64(row1, row5);
      dct_trn64(row2, row6);
      dct_trn64(row3, row7);

#undef dct_trn16
#undef dct_trn32
#undef dct_trn64
   }

   /* row pass
    * vrshrn_n_s32 only supports shifts up to 16, we need
    * 17. so do a non-rounding shift of 16 first then follow
    * up with a rounding shift by 1. */
   dct_pass(vshrn_n_s32, 16);

   {
      /* pack and round: the remaining rounding shift by 1, narrowed to
       * 8 bits with unsigned saturation (clamps to 0..255) */
      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);

      /* again, these can translate into one instruction, but often don't. */
#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }

      /* sadly can't use interleaved stores here since we only write
       * 8 bytes to each scan line! */

      /* 8x8 8-bit transpose pass 1 */
      dct_trn8_8(p0, p1);
      dct_trn8_8(p2, p3);
      dct_trn8_8(p4, p5);
      dct_trn8_8(p6, p7);

      /* pass 2 */
      dct_trn8_16(p0, p2);
      dct_trn8_16(p1, p3);
      dct_trn8_16(p4, p6);
      dct_trn8_16(p5, p7);

      /* pass 3 */
      dct_trn8_32(p0, p4);
      dct_trn8_32(p1, p5);
      dct_trn8_32(p2, p6);
      dct_trn8_32(p3, p7);

      /* store: one 8-byte row per register, out_stride bytes apart */
      vst1_u8(out, p0);
      out += out_stride;
      vst1_u8(out, p1);
      out += out_stride;
      vst1_u8(out, p2);
      out += out_stride;
      vst1_u8(out, p3);
      out += out_stride;
      vst1_u8(out, p4);
      out += out_stride;
      vst1_u8(out, p5);
      out += out_stride;
      vst1_u8(out, p6);
      out += out_stride;
      vst1_u8(out, p7);

#undef dct_trn8_8
#undef dct_trn8_16
#undef dct_trn8_32
   }

/* the helper macros are local to this function */
#undef dct_long_mul
#undef dct_long_mac
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_pass
}
1291 
1292 #endif /* RJPEG_NEON */
1293 
/* Fetches the next marker code for decoder `j`.
 *
 * A marker stashed in j->marker is handed out first (and the stash is
 * cleared). Otherwise one byte is read from the stream: anything other
 * than 0xff yields RJPEG_MARKER_NONE; after a 0xff, any further 0xff
 * bytes are skipped and the first non-0xff byte is returned. */
static uint8_t rjpeg_get_marker(rjpeg_jpeg *j)
{
   uint8_t code;

   if (j->marker != RJPEG_MARKER_NONE)
   {
      /* consume the marker that was found earlier */
      code      = j->marker;
      j->marker = RJPEG_MARKER_NONE;
      return code;
   }

   code = rjpeg_get8(j->s);
   if (code != 0xff)
      return RJPEG_MARKER_NONE;

   /* code is 0xff here, so at least one more byte is always read */
   do
   {
      code = rjpeg_get8(j->s);
   } while (code == 0xff);

   return code;
}
1312 
1313 /* after a restart interval, rjpeg_jpeg_reset the entropy decoder and
1314  * the dc prediction
1315  */
rjpeg_jpeg_reset(rjpeg_jpeg * j)1316 static void rjpeg_jpeg_reset(rjpeg_jpeg *j)
1317 {
1318    j->code_bits           = 0;
1319    j->code_buffer         = 0;
1320    j->nomore              = 0;
1321    j->img_comp[0].dc_pred = 0;
1322    j->img_comp[1].dc_pred = 0;
1323    j->img_comp[2].dc_pred = 0;
1324    j->marker              = RJPEG_MARKER_NONE;
1325    j->todo                = j->restart_interval ? j->restart_interval : 0x7fffffff;
1326    j->eob_run             = 0;
1327 
1328    /* no more than 1<<31 MCUs if no restart_interal? that's plenty safe,
1329     * since we don't even allow 1<<30 pixels */
1330 }
1331 
/* Counts one finished MCU against the restart interval.
 *
 * Returns 1 when decoding of the scan should go on: either the interval
 * has not run out yet, or it has and a restart marker was pending, in
 * which case the decoder state has been reset.
 * Returns 0 when the interval ran out and the pending marker is NOT a
 * restart marker; the caller then stops reading the scan, so we get
 * corrupt data rather than no data. */
static int rjpeg_scan_mcu_done(rjpeg_jpeg *z)
{
   if (--z->todo > 0)
      return 1;

   if (z->code_bits < 24)
      rjpeg_grow_buffer_unsafe(z);

   if (!RJPEG_RESTART(z->marker))
      return 0;

   rjpeg_jpeg_reset(z);
   return 1;
}

/* Decodes the entropy-coded data of the current scan.
 *
 * Baseline scans are decoded block by block and passed straight through
 * z->idct_block_kernel into the component's pixel buffer; progressive
 * scans only accumulate into the component's coefficient buffer.
 *
 * Returns 0 as soon as a block decoder reports failure, 1 otherwise
 * (including the case where the scan is abandoned early because an
 * expected restart marker is missing). */
static int rjpeg_parse_entropy_coded_data(rjpeg_jpeg *z)
{
   int mcu_x, mcu_y, k, sub_x, sub_y;

   rjpeg_jpeg_reset(z);

   if (z->scan_n == 1)
   {
      /* Non-interleaved data: process one block at a time in trivial
       * scanline order. The number of blocks depends only on how many
       * actual "pixels" this component has, independent of interleaved
       * MCU blocking. Every block is an MCU of its own here. */
      int bx, by;
      int comp     = z->order[0];
      int blocks_w = (z->img_comp[comp].x + 7) >> 3;
      int blocks_h = (z->img_comp[comp].y + 7) >> 3;

      if (z->progressive)
      {
         for (by = 0; by < blocks_h; ++by)
         {
            for (bx = 0; bx < blocks_w; ++bx)
            {
               int ok;
               short *coeffs = z->img_comp[comp].coeff
                  + 64 * (bx + by * z->img_comp[comp].coeff_w);

               if (z->spec_start == 0)
                  ok = rjpeg_jpeg_decode_block_prog_dc(z, coeffs,
                        &z->huff_dc[z->img_comp[comp].hd], comp);
               else
               {
                  int ac_tab = z->img_comp[comp].ha;
                  ok = rjpeg_jpeg_decode_block_prog_ac(z, coeffs,
                        &z->huff_ac[ac_tab], z->fast_ac[ac_tab]);
               }

               if (!ok)
                  return 0;

               if (!rjpeg_scan_mcu_done(z))
                  return 1;
            }
         }
      }
      else
      {
         RJPEG_SIMD_ALIGN(short, block[64]);

         for (by = 0; by < blocks_h; ++by)
         {
            for (bx = 0; bx < blocks_w; ++bx)
            {
               int ac_tab = z->img_comp[comp].ha;

               if (!rjpeg_jpeg_decode_block(z, block,
                        &z->huff_dc[z->img_comp[comp].hd],
                        &z->huff_ac[ac_tab], z->fast_ac[ac_tab],
                        comp, z->dequant[z->img_comp[comp].tq]))
                  return 0;

               z->idct_block_kernel(
                     z->img_comp[comp].data
                     + z->img_comp[comp].w2 * by * 8 + bx * 8,
                     z->img_comp[comp].w2, block);

               if (!rjpeg_scan_mcu_done(z))
                  return 1;
            }
         }
      }

      return 1;
   }

   /* Interleaved scan: every MCU carries h*v blocks of each component
    * listed in z->order[], in that order. The restart interval is counted
    * once per complete MCU. */
   if (z->progressive)
   {
      for (mcu_y = 0; mcu_y < z->img_mcu_y; ++mcu_y)
      {
         for (mcu_x = 0; mcu_x < z->img_mcu_x; ++mcu_x)
         {
            for (k = 0; k < z->scan_n; ++k)
            {
               int comp = z->order[k];

               for (sub_y = 0; sub_y < z->img_comp[comp].v; ++sub_y)
               {
                  for (sub_x = 0; sub_x < z->img_comp[comp].h; ++sub_x)
                  {
                     /* block coordinates inside the coefficient plane */
                     int col       = mcu_x * z->img_comp[comp].h + sub_x;
                     int row       = mcu_y * z->img_comp[comp].v + sub_y;
                     short *coeffs = z->img_comp[comp].coeff
                        + 64 * (col + row * z->img_comp[comp].coeff_w);

                     if (!rjpeg_jpeg_decode_block_prog_dc(z, coeffs,
                              &z->huff_dc[z->img_comp[comp].hd], comp))
                        return 0;
                  }
               }
            }

            if (!rjpeg_scan_mcu_done(z))
               return 1;
         }
      }
   }
   else
   {
      RJPEG_SIMD_ALIGN(short, block[64]);

      for (mcu_y = 0; mcu_y < z->img_mcu_y; ++mcu_y)
      {
         for (mcu_x = 0; mcu_x < z->img_mcu_x; ++mcu_x)
         {
            for (k = 0; k < z->scan_n; ++k)
            {
               int comp = z->order[k];

               for (sub_y = 0; sub_y < z->img_comp[comp].v; ++sub_y)
               {
                  for (sub_x = 0; sub_x < z->img_comp[comp].h; ++sub_x)
                  {
                     /* pixel coordinates of the block's top-left corner */
                     int px     = (mcu_x * z->img_comp[comp].h + sub_x) * 8;
                     int py     = (mcu_y * z->img_comp[comp].v + sub_y) * 8;
                     int ac_tab = z->img_comp[comp].ha;

                     if (!rjpeg_jpeg_decode_block(z, block,
                              &z->huff_dc[z->img_comp[comp].hd],
                              &z->huff_ac[ac_tab], z->fast_ac[ac_tab],
                              comp, z->dequant[z->img_comp[comp].tq]))
                        return 0;

                     z->idct_block_kernel(
                           z->img_comp[comp].data
                           + z->img_comp[comp].w2 * py + px,
                           z->img_comp[comp].w2, block);
                  }
               }
            }

            if (!rjpeg_scan_mcu_done(z))
               return 1;
         }
      }
   }

   return 1;
}
1507 
/* Scales each of the 64 coefficients in `data`, in place, by the
 * matching entry of the quantization table `dequant`. */
static void rjpeg_jpeg_dequantize(short *data, uint8_t *dequant)
{
   short         *coef  = data;
   const uint8_t *scale = dequant;
   const short   *stop  = data + 64;

   while (coef < stop)
   {
      *coef = (short)(*coef * *scale);
      ++coef;
      ++scale;
   }
}
1514 
/* Final step for progressive images: dequantizes every coefficient
 * block accumulated during the scans and runs it through
 * z->idct_block_kernel into the component's pixel buffer.
 * Does nothing for non-progressive images. */
static void rjpeg_jpeg_finish(rjpeg_jpeg *z)
{
   int comp;

   if (z->progressive)
   {
      for (comp = 0; comp < z->s->img_n; ++comp)
      {
         int bx, by;
         /* number of 8x8 blocks covering the component's real size */
         int blocks_w = (z->img_comp[comp].x + 7) >> 3;
         int blocks_h = (z->img_comp[comp].y + 7) >> 3;

         for (by = 0; by < blocks_h; ++by)
         {
            for (bx = 0; bx < blocks_w; ++bx)
            {
               short *coeffs = z->img_comp[comp].coeff
                  + 64 * (bx + by * z->img_comp[comp].coeff_w);

               rjpeg_jpeg_dequantize(coeffs,
                     z->dequant[z->img_comp[comp].tq]);

               z->idct_block_kernel(
                     z->img_comp[comp].data
                     + z->img_comp[comp].w2 * by * 8 + bx * 8,
                     z->img_comp[comp].w2, coeffs);
            }
         }
      }
   }
}
1539 
/* Handles the table/miscellaneous segment introduced by marker `m`.
 *
 * Supported markers: DRI (0xDD), DQT (0xDB), DHT (0xC4), and the
 * APPn (0xE0..0xEF) / COM (0xFE) segments, which are skipped.
 *
 * z : decoder state; the segment payload is read from z->s.
 * m : marker code as returned by rjpeg_get_marker().
 *
 * Returns 1 when the segment was consumed successfully, 0 when the
 * marker is missing/unknown or the segment is malformed. */
static int rjpeg_process_marker(rjpeg_jpeg *z, int m)
{
   int L;
   switch (m)
   {
      case RJPEG_MARKER_NONE: /* no marker found */
         /* Expected marker. Corrupt JPEG? */
         return 0;

      case 0xDD: /* DRI - specify restart interval */

         /* Bad DRI length. Corrupt JPEG? */
         if (RJPEG_GET16BE(z->s) != 4)
            return 0;

         z->restart_interval = RJPEG_GET16BE(z->s);
         return 1;

      case 0xDB: /* DQT - define quantization table */
         L = RJPEG_GET16BE(z->s)-2;
         while (L > 0)
         {
            int i;
            int q = rjpeg_get8(z->s);
            int p = q >> 4;   /* precision: only 8-bit tables are handled */
            int t = q & 15;   /* destination table index */

            /* Bad DQT type. Corrupt JPEG? */
            if (p != 0)
               return 0;

            /* Bad DQT table. Corrupt JPEG? */
            if (t > 3)
               return 0;

            /* entries arrive in zigzag order; store them de-zigzagged */
            for (i = 0; i < 64; ++i)
               z->dequant[t][rjpeg_jpeg_dezigzag[i]] = rjpeg_get8(z->s);
            L -= 65;
         }
         /* the tables must account for the segment length exactly */
         return L == 0;

      case 0xC4: /* DHT - define huffman table */
         L = RJPEG_GET16BE(z->s)-2;
         while (L > 0)
         {
            int sizes[16],i,n = 0;
            uint8_t *v = NULL;
            int q      = rjpeg_get8(z->s);
            int tc     = q >> 4;   /* table class: 0 = DC, 1 = AC */
            int th     = q & 15;   /* destination table index */

            /* Bad DHT header. Corrupt JPEG? */
            if (tc > 1 || th > 3)
               return 0;

            for (i = 0; i < 16; ++i)
            {
               sizes[i] = rjpeg_get8(z->s);
               n += sizes[i];
            }

            /* Bad DHT size list. Corrupt JPEG?
             * Huffman symbols are 8-bit, so a valid table never lists
             * more than 256 codes, while 16 length bytes can add up to
             * 4080. n is used below as the number of bytes written into
             * the table's values[], so it has to be bounded before the
             * table is built or filled.
             * NOTE(review): assumes the table storage is sized for 256
             * symbols - confirm against the rjpeg_huffman definition. */
            if (n > 256)
               return 0;

            L -= 17;

            if (tc == 0)
            {
               if (!rjpeg_build_huffman(z->huff_dc+th, sizes))
                  return 0;
               v = z->huff_dc[th].values;
            }
            else
            {
               if (!rjpeg_build_huffman(z->huff_ac+th, sizes))
                  return 0;
               v = z->huff_ac[th].values;
            }
            for (i = 0; i < n; ++i)
               v[i] = rjpeg_get8(z->s);
            if (tc != 0)
               rjpeg_build_fast_ac(z->fast_ac[th], z->huff_ac + th);
            L -= n;
         }
         /* the tables must account for the segment length exactly */
         return L == 0;

      default:
         break;
   }

   /* check for comment block or APP blocks */
   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE)
   {
      int n = RJPEG_GET16BE(z->s)-2;

      /* Skip the payload. A bogus length (negative, or larger than what
       * is left in the buffer) parks the read position at the end of the
       * data instead of moving the pointer outside the buffer. */
      if (n < 0 || n > z->s->img_buffer_end - z->s->img_buffer)
         z->s->img_buffer = z->s->img_buffer_end;
      else
         z->s->img_buffer += n;

      return 1;
   }
   return 0;
}
1636 
1637 /* after we see SOS */
rjpeg_process_scan_header(rjpeg_jpeg * z)1638 static int rjpeg_process_scan_header(rjpeg_jpeg *z)
1639 {
1640    int i;
1641    int aa;
1642    int Ls    = RJPEG_GET16BE(z->s);
1643 
1644    z->scan_n = rjpeg_get8(z->s);
1645 
1646    /* Bad SOS component count. Corrupt JPEG? */
1647    if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n)
1648       return 0;
1649 
1650    /* Bad SOS length. Corrupt JPEG? */
1651    if (Ls != 6+2*z->scan_n)
1652       return 0;
1653 
1654    for (i = 0; i < z->scan_n; ++i)
1655    {
1656       int which;
1657       int id = rjpeg_get8(z->s);
1658       int q  = rjpeg_get8(z->s);
1659 
1660       for (which = 0; which < z->s->img_n; ++which)
1661          if (z->img_comp[which].id == id)
1662             break;
1663       if (which == z->s->img_n)
1664          return 0; /* no match */
1665 
1666       /* Bad DC huff. Corrupt JPEG? */
1667       z->img_comp[which].hd = q >> 4;   if (z->img_comp[which].hd > 3)
1668          return 0;
1669 
1670       /* Bad AC huff. Corrupt JPEG? */
1671       z->img_comp[which].ha = q & 15;   if (z->img_comp[which].ha > 3)
1672          return 0;
1673 
1674       z->order[i] = which;
1675    }
1676 
1677    z->spec_start = rjpeg_get8(z->s);
1678    z->spec_end   = rjpeg_get8(z->s); /* should be 63, but might be 0 */
1679    aa            = rjpeg_get8(z->s);
1680    z->succ_high  = (aa >> 4);
1681    z->succ_low   = (aa & 15);
1682 
1683    if (z->progressive)
1684    {
1685       /* Bad SOS. Corrupt JPEG? */
1686       if (  z->spec_start > 63 ||
1687             z->spec_end > 63   ||
1688             z->spec_start > z->spec_end ||
1689             z->succ_high > 13           ||
1690             z->succ_low > 13)
1691          return 0;
1692    }
1693    else
1694    {
1695       /* Bad SOS. Corrupt JPEG? */
1696       if (z->spec_start != 0)
1697          return 0;
1698       if (z->succ_high != 0 || z->succ_low != 0)
1699          return 0;
1700 
1701       z->spec_end = 63;
1702    }
1703 
1704    return 1;
1705 }
1706 
rjpeg_process_frame_header(rjpeg_jpeg * z,int scan)1707 static int rjpeg_process_frame_header(rjpeg_jpeg *z, int scan)
1708 {
1709    rjpeg_context *s = z->s;
1710    int Lf,p,i,q, h_max=1,v_max=1,c;
1711    Lf = RJPEG_GET16BE(s);
1712 
1713    /* JPEG */
1714 
1715    /* Bad SOF len. Corrupt JPEG? */
1716    if (Lf < 11)
1717       return 0;
1718 
1719    p  = rjpeg_get8(s);
1720 
1721    /* JPEG baseline */
1722 
1723    /* Only 8-bit. JPEG format not supported? */
1724    if (p != 8)
1725       return 0;
1726 
1727    s->img_y = RJPEG_GET16BE(s);
1728 
1729    /* Legal, but we don't handle it--but neither does IJG */
1730 
1731    /* No header height, JPEG format not supported? */
1732    if (s->img_y == 0)
1733       return 0;
1734 
1735    s->img_x = RJPEG_GET16BE(s);
1736 
1737    /* No header width. Corrupt JPEG? */
1738    if (s->img_x == 0)
1739       return 0;
1740 
1741    c = rjpeg_get8(s);
1742 
1743    /* JFIF requires */
1744 
1745    /* Bad component count. Corrupt JPEG? */
1746    if (c != 3 && c != 1)
1747       return 0;
1748 
1749    s->img_n = c;
1750 
1751    for (i = 0; i < c; ++i)
1752    {
1753       z->img_comp[i].data = NULL;
1754       z->img_comp[i].linebuf = NULL;
1755    }
1756 
1757    /* Bad SOF length. Corrupt JPEG? */
1758    if (Lf != 8+3*s->img_n)
1759       return 0;
1760 
1761    for (i = 0; i < s->img_n; ++i)
1762    {
1763       z->img_comp[i].id = rjpeg_get8(s);
1764       if (z->img_comp[i].id != i+1)   /* JFIF requires */
1765          if (z->img_comp[i].id != i)  /* some version of jpegtran outputs non-JFIF-compliant files! */
1766             return 0;
1767 
1768       q                = rjpeg_get8(s);
1769       z->img_comp[i].h = (q >> 4);
1770 
1771       /* Bad H. Corrupt JPEG? */
1772       if (!z->img_comp[i].h || z->img_comp[i].h > 4)
1773          return 0;
1774 
1775       z->img_comp[i].v = q & 15;
1776 
1777       /* Bad V. Corrupt JPEG? */
1778       if (!z->img_comp[i].v || z->img_comp[i].v > 4)
1779          return 0;
1780 
1781       z->img_comp[i].tq = rjpeg_get8(s);
1782 
1783       /* Bad TQ. Corrupt JPEG? */
1784       if (z->img_comp[i].tq > 3)
1785          return 0;
1786    }
1787 
1788    if (scan != RJPEG_SCAN_LOAD)
1789       return 1;
1790 
1791    /* Image too large to decode? */
1792    if ((1 << 30) / s->img_x / s->img_n < s->img_y)
1793       return 0;
1794 
1795    for (i = 0; i < s->img_n; ++i)
1796    {
1797       if (z->img_comp[i].h > h_max)
1798          h_max = z->img_comp[i].h;
1799       if (z->img_comp[i].v > v_max)
1800          v_max = z->img_comp[i].v;
1801    }
1802 
1803    /* compute interleaved MCU info */
1804    z->img_h_max = h_max;
1805    z->img_v_max = v_max;
1806    z->img_mcu_w = h_max * 8;
1807    z->img_mcu_h = v_max * 8;
1808    z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
1809    z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
1810 
1811    if (z->progressive)
1812    {
1813       for (i = 0; i < s->img_n; ++i)
1814       {
1815          /* number of effective pixels (e.g. for non-interleaved MCU) */
1816          z->img_comp[i].x        = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
1817          z->img_comp[i].y        = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
1818 
1819          /* to simplify generation, we'll allocate enough memory to decode
1820           * the bogus oversized data from using interleaved MCUs and their
1821           * big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
1822           * discard the extra data until colorspace conversion */
1823          z->img_comp[i].w2       = z->img_mcu_x * z->img_comp[i].h * 8;
1824          z->img_comp[i].h2       = z->img_mcu_y * z->img_comp[i].v * 8;
1825          z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
1826 
1827          /* Out of memory? */
1828          if (!z->img_comp[i].raw_data)
1829          {
1830             for (--i; i >= 0; --i)
1831             {
1832                free(z->img_comp[i].raw_data);
1833                z->img_comp[i].data = NULL;
1834             }
1835 
1836             return 0;
1837          }
1838 
1839          /* align blocks for IDCT using MMX/SSE */
1840          z->img_comp[i].data      = (uint8_t*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
1841          z->img_comp[i].linebuf   = NULL;
1842          z->img_comp[i].coeff_w   = (z->img_comp[i].w2 + 7) >> 3;
1843          z->img_comp[i].coeff_h   = (z->img_comp[i].h2 + 7) >> 3;
1844          z->img_comp[i].raw_coeff = malloc(z->img_comp[i].coeff_w *
1845                                     z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);
1846          z->img_comp[i].coeff     = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
1847       }
1848    }
1849    else
1850    {
1851       for (i = 0; i < s->img_n; ++i)
1852       {
1853          /* number of effective pixels (e.g. for non-interleaved MCU) */
1854          z->img_comp[i].x        = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
1855          z->img_comp[i].y        = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
1856 
1857          /* to simplify generation, we'll allocate enough memory to decode
1858           * the bogus oversized data from using interleaved MCUs and their
1859           * big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
1860           * discard the extra data until colorspace conversion */
1861          z->img_comp[i].w2       = z->img_mcu_x * z->img_comp[i].h * 8;
1862          z->img_comp[i].h2       = z->img_mcu_y * z->img_comp[i].v * 8;
1863          z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
1864 
1865          /* Out of memory? */
1866          if (!z->img_comp[i].raw_data)
1867          {
1868             for (--i; i >= 0; --i)
1869             {
1870                free(z->img_comp[i].raw_data);
1871                z->img_comp[i].data = NULL;
1872             }
1873          }
1874 
1875          /* align blocks for IDCT using MMX/SSE */
1876          z->img_comp[i].data      = (uint8_t*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
1877          z->img_comp[i].linebuf   = NULL;
1878          z->img_comp[i].coeff     = 0;
1879          z->img_comp[i].raw_coeff = 0;
1880       }
1881    }
1882 
1883    return 1;
1884 }
1885 
/* Parse the start of a JPEG stream up to and including its frame header.
 *
 * z    : decoder state; its cached marker is reset and 'progressive' is
 *        set from the SOF marker that was found.
 * scan : RJPEG_SCAN_TYPE stops right after the SOI check; any other value
 *        is forwarded to rjpeg_process_frame_header().
 *
 * Returns 1 on success, 0 if the stream does not start with SOI, a marker
 * segment fails to process, EOF is hit before a SOF marker, or the frame
 * header is rejected. */
static int rjpeg_decode_jpeg_header(rjpeg_jpeg *z, int scan)
{
   int marker;

   /* start with an empty cached marker */
   z->marker = RJPEG_MARKER_NONE;

   /* No SOI. Corrupt JPEG? */
   if (rjpeg_get_marker(z) != JPEG_MARKER_SOI)
      return 0;

   /* caller only wanted to know whether this looks like a JPEG */
   if (scan == RJPEG_SCAN_TYPE)
      return 1;

   /* consume marker segments until a start-of-frame marker shows up */
   for (marker = rjpeg_get_marker(z); !RJPEG_SOF(marker); )
   {
      if (!rjpeg_process_marker(z, marker))
         return 0;

      /* some files have extra padding after their blocks,
       * so keep scanning until a marker (or EOF) is found */
      for (marker = rjpeg_get_marker(z);
            marker == RJPEG_MARKER_NONE;
            marker = rjpeg_get_marker(z))
      {
         /* No SOF. Corrupt JPEG? */
         if (RJPEG_AT_EOF(z->s))
            return 0;
      }
   }

   z->progressive = RJPEG_SOF_PROGRESSIVE(marker);

   return rjpeg_process_frame_header(z, scan) ? 1 : 0;
}
1921 
1922 /* decode image to YCbCr format */
rjpeg_decode_jpeg_image(rjpeg_jpeg * j)1923 static int rjpeg_decode_jpeg_image(rjpeg_jpeg *j)
1924 {
1925    int m;
1926    for (m = 0; m < 4; m++)
1927    {
1928       j->img_comp[m].raw_data = NULL;
1929       j->img_comp[m].raw_coeff = NULL;
1930    }
1931    j->restart_interval = 0;
1932    if (!rjpeg_decode_jpeg_header(j, RJPEG_SCAN_LOAD))
1933       return 0;
1934    m = rjpeg_get_marker(j);
1935 
1936    while (m != JPEG_MARKER_EOI)
1937    {
1938       if (m == JPEG_MARKER_SOS)
1939       {
1940          if (!rjpeg_process_scan_header(j))
1941             return 0;
1942          if (!rjpeg_parse_entropy_coded_data(j))
1943             return 0;
1944 
1945          if (j->marker == RJPEG_MARKER_NONE )
1946          {
1947             /* handle 0s at the end of image data from IP Kamera 9060 */
1948 
1949             while (!RJPEG_AT_EOF(j->s))
1950             {
1951                int x = rjpeg_get8(j->s);
1952                if (x == 255)
1953                {
1954                   j->marker = rjpeg_get8(j->s);
1955                   break;
1956                }
1957                else if (x != 0) /* Junk before marker. Corrupt JPEG? */
1958                   return 0;
1959             }
1960 
1961             /* if we reach eof without hitting a marker,
1962              * rjpeg_get_marker() below will fail and we'll eventually return 0 */
1963          }
1964       }
1965       else
1966       {
1967          if (!rjpeg_process_marker(j, m))
1968             return 0;
1969       }
1970       m = rjpeg_get_marker(j);
1971    }
1972 
1973    if (j->progressive)
1974       rjpeg_jpeg_finish(j);
1975    return 1;
1976 }
1977 
1978 /* static jfif-centered resampling (across block boundaries) */
1979 
/* Resampler for components that need no upsampling: the input row is
 * handed back untouched and nothing is written to 'out'. */
static uint8_t *rjpeg_resample_row_1(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   /* only in_near is used; the remaining parameters exist so the
    * signature matches the other row resamplers */
   (void)hs;
   (void)w;
   (void)in_far;
   (void)out;

   return in_near;
}
1989 
/* Vertical 2x upsampling of one row: need to generate two samples
 * vertically for every one in input. Each output sample is the 3:1
 * weighted blend of the nearer and farther input rows (with rounding).
 * Writes 'w' samples to 'out' and returns 'out'. */
static uint8_t* rjpeg_resample_row_v_2(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   int x = 0;

   (void)hs;

   while (x < w)
   {
      out[x] = RJPEG_DIV4(3*in_near[x] + in_far[x] + 2);
      x++;
   }

   return out;
}
2000 
/* Horizontal 2x upsampling of one row: need to generate two samples
 * horizontally for every one in input. Interior outputs blend a sample
 * 3:1 with its left or right neighbour; the first and last outputs copy
 * the edge samples. Only in_near is read. Returns 'out'. */
static uint8_t*  rjpeg_resample_row_h_2(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   int x;
   uint8_t *src = in_near;

   (void)in_far;
   (void)hs;

   /* if only one sample, can't do any interpolation */
   if (w == 1)
   {
      out[0] = out[1] = src[0];
      return out;
   }

   /* left edge */
   out[0] = src[0];
   out[1] = RJPEG_DIV4(src[0]*3 + src[1] + 2);

   /* interior samples */
   x = 1;
   while (x < w-1)
   {
      int weighted = 3 * src[x] + 2;

      out[x*2+0]   = RJPEG_DIV4(weighted+src[x-1]);
      out[x*2+1]   = RJPEG_DIV4(weighted+src[x+1]);
      x++;
   }

   /* right edge; x is the index left behind by the loop above */
   out[x*2+0] = RJPEG_DIV4(src[w-2]*3 + src[w-1] + 2);
   out[x*2+1] = src[w-1];

   return out;
}
2032 
/* Combined 2x horizontal + 2x vertical upsampling of one row: need to
 * generate 2x2 samples for every one in input. The rows are first blended
 * vertically (3*near + far), then neighbouring blended values are mixed
 * 3:1 horizontally. Writes 2*w samples to 'out' and returns 'out'. */
static uint8_t *rjpeg_resample_row_hv_2(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   int x;
   int prev_col;
   int cur_col;

   (void)hs;

   if (w == 1)
   {
      out[0] = out[1] = RJPEG_DIV4(3*in_near[0] + in_far[0] + 2);
      return out;
   }

   /* leftmost output has no left neighbour: vertical blend only */
   cur_col = 3*in_near[0] + in_far[0];
   out[0]  = RJPEG_DIV4(cur_col+2);

   x = 1;
   while (x < w)
   {
      prev_col   = cur_col;
      cur_col    = 3*in_near[x]+in_far[x];
      out[x*2-1] = RJPEG_DIV16(3*prev_col + cur_col + 8);
      out[x*2  ] = RJPEG_DIV16(3*cur_col + prev_col + 8);
      x++;
   }

   /* rightmost output has no right neighbour */
   out[w*2-1] = RJPEG_DIV4(cur_col+2);

   return out;
}
2060 
#if defined(__SSE2__) || defined(RJPEG_NEON)
/* SSE2/NEON flavour of the 2x2 upsampler: need to generate 2x2 samples
 * for every one in input. Groups of 8 input pixels go through the vector
 * path; the remaining pixels (always including the last one) are handled
 * by scalar code. Writes 2*w samples to 'out' and returns 'out'. */
static uint8_t *rjpeg_resample_row_hv_2_simd(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   int x = 0;
   int vec_end;
   int prev_col;
   int cur_col;

   (void)hs;

   if (w == 1)
   {
      out[0] = out[1] = RJPEG_DIV4(3*in_near[0] + in_far[0] + 2);
      return out;
   }

   cur_col = 3*in_near[0] + in_far[0];

   /* process groups of 8 pixels for as long as we can.
    * note we can't handle the last pixel in a row in this loop
    * because we need to handle the filter boundary conditions.
    */
   vec_end = (w-1) & ~7;

   while (x < vec_end)
   {
#if defined(__SSE2__)
      /* load and perform the vertical filtering pass
       * this uses 3*x + y = 4*x + (y - x) */
      __m128i zero    = _mm_setzero_si128();
      __m128i far_b   = _mm_loadl_epi64((__m128i *) (in_far + x));
      __m128i near_b  = _mm_loadl_epi64((__m128i *) (in_near + x));
      __m128i far_w   = _mm_unpacklo_epi8(far_b, zero);
      __m128i near_w  = _mm_unpacklo_epi8(near_b, zero);
      __m128i vdiff   = _mm_sub_epi16(far_w, near_w);
      __m128i near_x4 = _mm_slli_epi16(near_w, 2);
      __m128i row     = _mm_add_epi16(near_x4, vdiff); /* current row */

      /* horizontal filter works the same based on shifted vers of current
       * row. "row_prev" is current row shifted right by 1 pixel; we need to
       * insert the previous pixel value (from cur_col).
       * "row_next" is current row shifted left by 1 pixel, with first pixel
       * of next block of 8 pixels added in.
       */
      __m128i shl      = _mm_slli_si128(row, 2);
      __m128i shr      = _mm_srli_si128(row, 2);
      __m128i row_prev = _mm_insert_epi16(shl, cur_col, 0);
      __m128i row_next = _mm_insert_epi16(shr, 3*in_near[x+8] + in_far[x+8], 7);

      /* horizontal filter, polyphase implementation since it's convenient:
       * even pixels = 3*cur + prev = cur*4 + (prev - cur)
       * odd  pixels = 3*cur + next = cur*4 + (next - cur)
       * note the shared term. */
      __m128i round8   = _mm_set1_epi16(8);
      __m128i row_x4   = _mm_slli_epi16(row, 2);
      __m128i d_prev   = _mm_sub_epi16(row_prev, row);
      __m128i d_next   = _mm_sub_epi16(row_next, row);
      __m128i shared   = _mm_add_epi16(row_x4, round8);
      __m128i even_px  = _mm_add_epi16(d_prev, shared);
      __m128i odd_px   = _mm_add_epi16(d_next, shared);

      /* interleave even and odd pixels, then undo scaling. */
      __m128i mix_lo   = _mm_unpacklo_epi16(even_px, odd_px);
      __m128i mix_hi   = _mm_unpackhi_epi16(even_px, odd_px);
      __m128i res_lo   = _mm_srli_epi16(mix_lo, 4);
      __m128i res_hi   = _mm_srli_epi16(mix_hi, 4);

      /* pack and write output */
      __m128i packed   = _mm_packus_epi16(res_lo, res_hi);
      _mm_storeu_si128((__m128i *) (out + x*2), packed);
#elif defined(RJPEG_NEON)
      /* load and perform the vertical filtering pass
       * this uses 3*x + y = 4*x + (y - x) */
      uint8x8_t far_b   = vld1_u8(in_far + x);
      uint8x8_t near_b  = vld1_u8(in_near + x);
      int16x8_t vdiff   = vreinterpretq_s16_u16(vsubl_u8(far_b, near_b));
      int16x8_t near_x4 = vreinterpretq_s16_u16(vshll_n_u8(near_b, 2));
      int16x8_t row     = vaddq_s16(near_x4, vdiff); /* current row */

      /* horizontal filter works the same based on shifted vers of current
       * row. "row_prev" is current row shifted right by 1 pixel; we need to
       * insert the previous pixel value (from cur_col).
       * "row_next" is current row shifted left by 1 pixel, with first pixel
       * of next block of 8 pixels added in. */
      int16x8_t rot_r    = vextq_s16(row, row, 7);
      int16x8_t rot_l    = vextq_s16(row, row, 1);
      int16x8_t row_prev = vsetq_lane_s16(cur_col, rot_r, 0);
      int16x8_t row_next = vsetq_lane_s16(3*in_near[x+8] + in_far[x+8], rot_l, 7);

      /* horizontal filter, polyphase implementation since it's convenient:
       * even pixels = 3*cur + prev = cur*4 + (prev - cur)
       * odd  pixels = 3*cur + next = cur*4 + (next - cur)
       * note the shared term.
       */
      int16x8_t row_x4   = vshlq_n_s16(row, 2);
      int16x8_t d_prev   = vsubq_s16(row_prev, row);
      int16x8_t d_next   = vsubq_s16(row_next, row);
      int16x8_t even_px  = vaddq_s16(row_x4, d_prev);
      int16x8_t odd_px   = vaddq_s16(row_x4, d_next);

      /* undo scaling and round, then store with even/odd phases interleaved */
      uint8x8x2_t pair;
      pair.val[0] = vqrshrun_n_s16(even_px, 4);
      pair.val[1] = vqrshrun_n_s16(odd_px,  4);
      vst2_u8(out + x*2, pair);
#endif

      /* "previous" value for next iteration */
      cur_col = 3*in_near[x+7] + in_far[x+7];
      x      += 8;
   }

   /* first scalar pixel: only its even-phase output is produced here,
    * the odd-phase output of the preceding pixel is not rewritten */
   prev_col = cur_col;
   cur_col  = 3*in_near[x] + in_far[x];
   out[x*2] = RJPEG_DIV16(3*cur_col + prev_col + 8);
   x++;

   /* remaining pixels, same filter as the scalar implementation */
   while (x < w)
   {
      prev_col   = cur_col;
      cur_col    = 3*in_near[x]+in_far[x];
      out[x*2-1] = RJPEG_DIV16(3*prev_col + cur_col + 8);
      out[x*2  ] = RJPEG_DIV16(3*cur_col + prev_col + 8);
      x++;
   }

   /* rightmost output has no right neighbour */
   out[w*2-1] = RJPEG_DIV4(cur_col+2);

   return out;
}
#endif
2184 
/* Fallback resampler: nearest-neighbour horizontal upsampling.
 * Every one of the 'w' input samples in in_near is repeated 'hs' times
 * in 'out'; in_far is not used. Returns 'out'. */
static uint8_t *rjpeg_resample_row_generic(uint8_t *out,
      uint8_t *in_near, uint8_t *in_far, int w, int hs)
{
   int x;
   uint8_t *dst = out;

   (void)in_far;

   for (x = 0; x < w; x++)
   {
      int rep;

      /* emit the current sample hs times in a row */
      for (rep = 0; rep < hs; rep++)
         *dst++ = in_near[x];
   }

   return out;
}
2197 
/* this is a reduced-precision calculation of YCbCr-to-RGB introduced
 * to make sure the code produces the same results in both SIMD and scalar.
 * FLOAT2FIXED converts a coefficient to 12 fractional bits and then shifts
 * it up by 8 more, giving constants scaled by 2^20. */
#ifndef FLOAT2FIXED
#define FLOAT2FIXED(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
#endif

/* Convert one row of YCbCr samples to RGB(A) using scalar fixed-point math.
 *
 * out   : destination; each pixel starts 'step' bytes after the previous one.
 *         Four bytes (R, G, B, 255) are stored per pixel regardless of
 *         'step', so with step == 3 the alpha byte of a pixel is overwritten
 *         by the next pixel's red and the last pixel writes one byte past
 *         3*count - the caller must provide that extra byte.
 * y     : luma samples, 'count' entries.
 * pcb   : Cb samples, 'count' entries.
 * pcr   : Cr samples, 'count' entries.
 * count : number of pixels to convert.
 * step  : distance in bytes between consecutive output pixels.
 *
 * Channel values that fall outside 0..255 are saturated: negative results
 * clamp to 0 and results above 255 clamp to 255. */
static void rjpeg_YCbCr_to_RGB_row(uint8_t *out, const uint8_t *y,
      const uint8_t *pcb, const uint8_t *pcr, int count, int step)
{
   int i;
   for (i = 0; i < count; ++i)
   {
      int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
      int cr = pcr[i] - 128;
      int cb = pcb[i] - 128;
      int r = y_fixed +  cr* FLOAT2FIXED(1.40200f);
      int g = y_fixed + (cr*-FLOAT2FIXED(0.71414f)) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
      int b = y_fixed                               +   cb* FLOAT2FIXED(1.77200f);
      r >>= 20;
      g >>= 20;
      b >>= 20;

      /* The unsigned compare catches both underflow and overflow in a
       * single test; the sign then selects which bound to clamp to. */
      if ((unsigned) r > 255)
         r = (r < 0) ? 0 : 255;
      if ((unsigned) g > 255)
         g = (g < 0) ? 0 : 255;
      if ((unsigned) b > 255)
         b = (b < 0) ? 0 : 255;

      out[0] = (uint8_t)r;
      out[1] = (uint8_t)g;
      out[2] = (uint8_t)b;
      out[3] = 255;
      out += step;
   }
}
2232 
#if defined(__SSE2__) || defined(RJPEG_NEON)
/* Convert one row of YCbCr samples to RGB(A), using SSE2/NEON for groups
 * of 8 pixels when step == 4 and scalar fixed-point code for whatever is
 * left (and for every pixel when step != 4).
 *
 * out   : destination; each pixel starts 'step' bytes after the previous one.
 *         The scalar loop stores four bytes (R, G, B, 255) per pixel
 *         regardless of 'step'.
 * y     : luma samples, 'count' entries.
 * pcb   : Cb samples, 'count' entries.
 * pcr   : Cr samples, 'count' entries.
 * count : number of pixels to convert.
 * step  : distance in bytes between consecutive output pixels.
 *
 * Out-of-range channel values are saturated to 0..255 in both the vector
 * path (saturating pack/narrow) and the scalar remainder loop. */
static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
      const uint8_t *pcb, const uint8_t *pcr, int count, int step)
{
   int i = 0;

#if defined(__SSE2__)
   /* step == 3 is pretty ugly on the final interleave, and i'm not convinced
    * it's useful in practice (you wouldn't use it for textures, for example).
    * so just accelerate step == 4 case.
    */
   if (step == 4)
   {
      /* this is a fairly straightforward implementation and not super-optimized. */
      __m128i signflip  = _mm_set1_epi8(-0x80);
      __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
      __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
      __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
      __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
      __m128i y_bias    = _mm_set1_epi8((char) (unsigned char) 128);
      __m128i xw        = _mm_set1_epi16(255); /* alpha channel */

      for (; i+7 < count; i += 8)
      {
         /* load */
         __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
         __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
         __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); /* -128 */
         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); /* -128 */

         /* unpack to short (and left-shift cr, cb by 8) */
         __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);

         /* color transform */
         __m128i yws = _mm_srli_epi16(yw, 4);
         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
         __m128i rws = _mm_add_epi16(cr0, yws);
         __m128i gwt = _mm_add_epi16(cb0, yws);
         __m128i bws = _mm_add_epi16(yws, cb1);
         __m128i gws = _mm_add_epi16(gwt, cr1);

         /* descale */
         __m128i rw = _mm_srai_epi16(rws, 4);
         __m128i bw = _mm_srai_epi16(bws, 4);
         __m128i gw = _mm_srai_epi16(gws, 4);

         /* back to byte, set up for transpose
          * (the pack saturates to 0..255) */
         __m128i brb = _mm_packus_epi16(rw, bw);
         __m128i gxb = _mm_packus_epi16(gw, xw);

         /* transpose to interleave channels */
         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
         __m128i o1 = _mm_unpackhi_epi16(t0, t1);

         /* store */
         _mm_storeu_si128((__m128i *) (out + 0), o0);
         _mm_storeu_si128((__m128i *) (out + 16), o1);
         out += 32;
      }
   }
#endif

#ifdef RJPEG_NEON
   /* in this version, step=3 support would be easy to add. but is there demand? */
   if (step == 4)
   {
      /* this is a fairly straightforward implementation and not super-optimized. */
      uint8x8_t signflip = vdup_n_u8(0x80);
      int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
      int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
      int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
      int16x8_t cb_const1 = vdupq_n_s16(   (short) ( 1.77200f*4096.0f+0.5f));

      for (; i+7 < count; i += 8)
      {
         uint8x8x4_t o;

         /* load */
         uint8x8_t y_bytes  = vld1_u8(y + i);
         uint8x8_t cr_bytes = vld1_u8(pcr + i);
         uint8x8_t cb_bytes = vld1_u8(pcb + i);
         int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
         int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));

         /* expand to s16 */
         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
         int16x8_t crw = vshll_n_s8(cr_biased, 7);
         int16x8_t cbw = vshll_n_s8(cb_biased, 7);

         /* color transform */
         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
         int16x8_t rws = vaddq_s16(yws, cr0);
         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
         int16x8_t bws = vaddq_s16(yws, cb1);

         /* undo scaling, round, convert to byte */
         o.val[0] = vqrshrun_n_s16(rws, 4);
         o.val[1] = vqrshrun_n_s16(gws, 4);
         o.val[2] = vqrshrun_n_s16(bws, 4);
         o.val[3] = vdup_n_u8(255);

         /* store, interleaving r/g/b/a */
         vst4_u8(out, o);
         out += 8*4;
      }
   }
#endif

   /* scalar remainder: pixels the vector loops did not cover */
   for (; i < count; ++i)
   {
      int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
      int cr      = pcr[i] - 128;
      int cb      = pcb[i] - 128;
      int r       = y_fixed + cr* FLOAT2FIXED(1.40200f);
      int g       = y_fixed + cr*-FLOAT2FIXED(0.71414f) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
      int b       = y_fixed                             +   cb* FLOAT2FIXED(1.77200f);
      r >>= 20;
      g >>= 20;
      b >>= 20;

      /* The unsigned compare catches both underflow and overflow in a
       * single test; the sign then selects which bound to clamp to, so
       * negative values become 0 just like in the saturating vector path. */
      if ((unsigned) r > 255)
         r = (r < 0) ? 0 : 255;
      if ((unsigned) g > 255)
         g = (g < 0) ? 0 : 255;
      if ((unsigned) b > 255)
         b = (b < 0) ? 0 : 255;

      out[0] = (uint8_t)r;
      out[1] = (uint8_t)g;
      out[2] = (uint8_t)b;
      out[3] = 255;
      out += step;
   }
}
#endif
2377 
2378 /* set up the kernels */
rjpeg_setup_jpeg(rjpeg_jpeg * j)2379 static void rjpeg_setup_jpeg(rjpeg_jpeg *j)
2380 {
2381    uint64_t mask = cpu_features_get();
2382 
2383    (void)mask;
2384 
2385    j->idct_block_kernel        = rjpeg_idct_block;
2386    j->YCbCr_to_RGB_kernel      = rjpeg_YCbCr_to_RGB_row;
2387    j->resample_row_hv_2_kernel = rjpeg_resample_row_hv_2;
2388 
2389 #if defined(__SSE2__)
2390    if (mask & RETRO_SIMD_SSE2)
2391    {
2392       j->idct_block_kernel        = rjpeg_idct_simd;
2393       j->YCbCr_to_RGB_kernel      = rjpeg_YCbCr_to_RGB_simd;
2394       j->resample_row_hv_2_kernel = rjpeg_resample_row_hv_2_simd;
2395    }
2396 #endif
2397 
2398 #ifdef RJPEG_NEON
2399    j->idct_block_kernel           = rjpeg_idct_simd;
2400    j->YCbCr_to_RGB_kernel         = rjpeg_YCbCr_to_RGB_simd;
2401    j->resample_row_hv_2_kernel    = rjpeg_resample_row_hv_2_simd;
2402 #endif
2403 }
2404 
2405 /* clean up the temporary component buffers */
rjpeg_cleanup_jpeg(rjpeg_jpeg * j)2406 static void rjpeg_cleanup_jpeg(rjpeg_jpeg *j)
2407 {
2408    int i;
2409    for (i = 0; i < j->s->img_n; ++i)
2410    {
2411       if (j->img_comp[i].raw_data)
2412       {
2413          free(j->img_comp[i].raw_data);
2414          j->img_comp[i].raw_data = NULL;
2415          j->img_comp[i].data = NULL;
2416       }
2417 
2418       if (j->img_comp[i].raw_coeff)
2419       {
2420          free(j->img_comp[i].raw_coeff);
2421          j->img_comp[i].raw_coeff = 0;
2422          j->img_comp[i].coeff = 0;
2423       }
2424 
2425       if (j->img_comp[i].linebuf)
2426       {
2427          free(j->img_comp[i].linebuf);
2428          j->img_comp[i].linebuf = NULL;
2429       }
2430    }
2431 }
2432 
/* Decode a JPEG and return it as interleaved 8-bit pixels.
 *
 * z        : decoder state whose stream context (z->s) and kernels have
 *            already been set up.
 * out_x    : receives the image width on success.
 * out_y    : receives the image height on success.
 * comp     : if non-NULL, receives the component count of the source image
 *            (not of the returned buffer) on success.
 * req_comp : number of components per output pixel (1..4), or 0 to use the
 *            image's own component count. Other values are rejected.
 *
 * Returns a malloc()ed buffer of n * width * height bytes (plus one byte of
 * slack) that the caller must free(), or NULL on failure. The temporary
 * component buffers are released on every path. */
static uint8_t *rjpeg_load_jpeg_image(rjpeg_jpeg *z,
      unsigned *out_x, unsigned *out_y, int *comp, int req_comp)
{
   int n, decode_n;
   int k;
   unsigned int i,j;
   size_t row_bytes;
   const size_t size_max = ~(size_t)0;
   rjpeg_resample res_comp[4];
   uint8_t *coutput[4] = {0};
   uint8_t *output     = NULL;

   /* no component buffers exist yet, so an early 'goto error'
    * has nothing to clean up */
   z->s->img_n         = 0;

   /* a negative or oversized component count would produce a bogus
    * allocation size and out-of-bounds pixel writes below */
   if (req_comp < 0 || req_comp > 4)
      goto error;

   /* load a jpeg image from whichever source, but leave in YCbCr format */
   if (!rjpeg_decode_jpeg_image(z))
      goto error;

   /* determine actual number of components to generate */
   n = req_comp ? req_comp : z->s->img_n;

   /* grey output from a 3-component image only needs the first plane */
   if (z->s->img_n == 3 && n < 3)
      decode_n = 1;
   else
      decode_n = z->s->img_n;

   /* resample and color-convert */
   for (k = 0; k < decode_n; ++k)
   {
      rjpeg_resample *r = &res_comp[k];

      /* allocate line buffer big enough for upsampling off the edges
       * with upsample factor of 4 */
      z->img_comp[k].linebuf = (uint8_t *) malloc(z->s->img_x + 3);
      if (!z->img_comp[k].linebuf)
         goto error;

      r->hs       = z->img_h_max / z->img_comp[k].h;
      r->vs       = z->img_v_max / z->img_comp[k].v;
      r->ystep    = r->vs >> 1;
      r->w_lores  = (z->s->img_x + r->hs-1) / r->hs;
      r->ypos     = 0;
      r->line0    = r->line1 = z->img_comp[k].data;
      r->resample = rjpeg_resample_row_generic;

      if      (r->hs == 1 && r->vs == 1)
         r->resample = rjpeg_resample_row_1;
      else if (r->hs == 1 && r->vs == 2)
         r->resample = rjpeg_resample_row_v_2;
      else if (r->hs == 2 && r->vs == 1)
         r->resample = rjpeg_resample_row_h_2;
      else if (r->hs == 2 && r->vs == 2)
         r->resample = z->resample_row_hv_2_kernel;
   }

   /* Size the output in size_t and refuse anything that does not fit:
    * done in 32-bit unsigned arithmetic, n * width * height can wrap
    * (e.g. a large greyscale image expanded to 4 components), which would
    * yield a tiny allocation followed by out-of-bounds writes. */
   row_bytes = (size_t)n * z->s->img_x;
   if (z->s->img_y != 0 && row_bytes > (size_max - 1) / z->s->img_y)
      goto error;

   /* one extra byte because the converters below always store a 4th
    * (alpha) byte per pixel, even when only 3 bytes per pixel are kept */
   output = (uint8_t *) malloc(row_bytes * z->s->img_y + 1);

   if (!output)
      goto error;

   /* now go ahead and resample; nothing below can fail */
   for (j = 0; j < z->s->img_y; ++j)
   {
      uint8_t *out = output + row_bytes * j;
      for (k = 0; k < decode_n; ++k)
      {
         rjpeg_resample *r = &res_comp[k];
         /* which of the two source rows is nearer to this output row */
         int         y_bot  = r->ystep >= (r->vs >> 1);

         coutput[k]         = r->resample(z->img_comp[k].linebuf,
               y_bot ? r->line1 : r->line0,
               y_bot ? r->line0 : r->line1,
               r->w_lores, r->hs);

         /* advance to the next source row every 'vs' output rows,
          * without stepping past the component's last row */
         if (++r->ystep >= r->vs)
         {
            r->ystep = 0;
            r->line0 = r->line1;
            if (++r->ypos < z->img_comp[k].y)
               r->line1 += z->img_comp[k].w2;
         }
      }

      if (n >= 3)
      {
         uint8_t *y = coutput[0];
         if (y)
         {
            if (z->s->img_n == 3)
               z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
            else
               for (i = 0; i < z->s->img_x; ++i)
               {
                  out[0]  = out[1] = out[2] = y[i];
                  out[3]  = 255; /* not used if n==3 */
                  out    += n;
               }
         }
      }
      else
      {
         uint8_t *y = coutput[0];
         if (n == 1)
            for (i = 0; i < z->s->img_x; ++i)
               out[i] = y[i];
         else
            for (i = 0; i < z->s->img_x; ++i)
            {
               *out++ = y[i];
               *out++ = 255;
            }
      }
   }

   rjpeg_cleanup_jpeg(z);
   *out_x = z->s->img_x;
   *out_y = z->s->img_y;

   if (comp)
      *comp  = z->s->img_n; /* report original components, not output */
   return output;

error:
   rjpeg_cleanup_jpeg(z);
   return NULL;
}
2557 
/* Decode the JPEG previously attached with rjpeg_set_buf_ptr() into a
 * 32-bit-per-pixel image.
 *
 * rjpeg    : handle holding the pointer to the compressed data.
 * buf_data : receives the decoded pixel buffer on success. The buffer comes
 *            from malloc() and is not freed here, so the caller is expected
 *            to free() it.
 * size     : size in bytes of the compressed data.
 * width    : receives the image width on success.
 * height   : receives the image height on success.
 *
 * Returns IMAGE_PROCESS_END on success and IMAGE_PROCESS_ERROR if an
 * argument is missing or decoding fails. */
int rjpeg_process_image(rjpeg_t *rjpeg, void **buf_data,
      size_t size, unsigned *width, unsigned *height)
{
   rjpeg_jpeg j;
   rjpeg_context s;
   uint32_t *pixels      = NULL;
   unsigned size_tex     = 0;

   /* every pointer below is dereferenced unconditionally later on */
   if (!rjpeg || !rjpeg->buff_data || !buf_data || !width || !height)
      return IMAGE_PROCESS_ERROR;

   s.img_buffer          = (uint8_t*)rjpeg->buff_data;
   s.img_buffer_original = (uint8_t*)rjpeg->buff_data;
   /* keep 'size' as size_t: narrowing it to int could truncate it or
    * make it negative and put the end pointer before the start */
   s.img_buffer_end      = (uint8_t*)rjpeg->buff_data + size;

   j.s                   = &s;

   rjpeg_setup_jpeg(&j);

   /* always request 4 components per pixel; the source component
    * count is not needed here */
   pixels                = (uint32_t*)rjpeg_load_jpeg_image(&j, width, height, NULL, 4);

   if (!pixels)
      return IMAGE_PROCESS_ERROR;

   size_tex  = (*width) * (*height);
   *buf_data = pixels;

   /* Convert RGBA to ARGB in place: swap bits 0-7 with bits 16-23 of
    * every texel and keep the other two bytes where they are. Doing it
    * in place avoids a second full-size allocation and copy. */
   while (size_tex--)
   {
      unsigned int texel = pixels[size_tex];
      unsigned int A     = texel & 0xFF000000;
      unsigned int B     = texel & 0x00FF0000;
      unsigned int G     = texel & 0x0000FF00;
      unsigned int R     = texel & 0x000000FF;
      pixels[size_tex]   = A | (R << 16) | G | (B >> 16);
   }

   return IMAGE_PROCESS_END;
}
2610 
/* Attach the compressed input buffer 'data' to the handle.
 * Only the pointer is stored; nothing is copied.
 * Returns false if 'rjpeg' is NULL, true otherwise. */
bool rjpeg_set_buf_ptr(rjpeg_t *rjpeg, void *data)
{
   if (rjpeg)
   {
      rjpeg->buff_data = (uint8_t*)data;
      return true;
   }

   return false;
}
2620 
/* Release a handle obtained from rjpeg_alloc(). Passing NULL is a no-op. */
void rjpeg_free(rjpeg_t *rjpeg)
{
   /* free() ignores NULL, so no explicit check is required */
   free(rjpeg);
}
2628 
rjpeg_alloc(void)2629 rjpeg_t *rjpeg_alloc(void)
2630 {
2631    rjpeg_t *rjpeg = (rjpeg_t*)calloc(1, sizeof(*rjpeg));
2632    if (!rjpeg)
2633       return NULL;
2634    return rjpeg;
2635 }
2636