1 /* Copyright  (C) 2010-2016 The RetroArch team
2  *
3  * ---------------------------------------------------------------------------------------
4  * The following license statement only applies to this file (rjpeg.c).
5  * ---------------------------------------------------------------------------------------
6  *
7  * Permission is hereby granted, free of charge,
8  * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation the rights to
10  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11  * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16  * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 /* Modified version of stb_image's JPEG sources. */
24 
25 #include <stdint.h>
26 #include <stdarg.h>
27 #include <stddef.h> /* ptrdiff_t on osx */
28 #include <stdlib.h>
29 #include <string.h>
30 
31 #include <retro_assert.h>
32 #include <retro_inline.h>
33 #include <boolean.h>
34 #include <formats/image.h>
35 #include <formats/rjpeg.h>
36 #include <features/features_cpu.h>
37 
38 enum
39 {
40    RJPEG_DEFAULT = 0, /* only used for req_comp */
41    RJPEG_GREY,
42    RJPEG_GREY_ALPHA,
43    RJPEG_RGB,
44    RJPEG_RGB_ALPHA
45 };
46 
47 typedef struct
48 {
49    int      (*read)  (void *user,char *data,int size);   /* fill 'data' with 'size' bytes.  return number of bytes actually read */
50    void     (*skip)  (void *user,int n);                 /* skip the next 'n' bytes, or 'unget' the last -n bytes if negative */
51    int      (*eof)   (void *user);                       /* returns nonzero if we are at end of file/data */
52 } rjpeg_io_callbacks;
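/* Illustrative only: a minimal memory-backed set of these callbacks might look
 * like the sketch below (the 'mem_stream' type and helpers are hypothetical,
 * not part of this file):
 *
 *    typedef struct { const char *data; int size, pos; } mem_stream;
 *    static int  mem_read(void *u, char *out, int n)
 *    {
 *       mem_stream *m = (mem_stream*)u;
 *       if (n > m->size - m->pos) n = m->size - m->pos;
 *       memcpy(out, m->data + m->pos, n); m->pos += n;
 *       return n;
 *    }
 *    static void mem_skip(void *u, int n) { ((mem_stream*)u)->pos += n; }
 *    static int  mem_eof (void *u) { mem_stream *m = (mem_stream*)u; return m->pos >= m->size; }
 *
 * Note that rjpeg_load_from_memory() below never installs callbacks; it sets
 * io.read to NULL and walks the caller's buffer directly. */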
53 
54 typedef uint8_t *(*rjpeg_resample_row_func)(uint8_t *out, uint8_t *in0, uint8_t *in1,
55                                     int w, int hs);
56 
57 typedef struct
58 {
59    rjpeg_resample_row_func resample;
60    uint8_t *line0,*line1;
61    int hs,vs;   /* expansion factor in each axis */
62    int w_lores; /* horizontal pixels pre-expansion */
63    int ystep;   /* how far through vertical expansion we are */
64    int ypos;    /* which pre-expansion row we're on */
65 } rjpeg__resample;
66 
67 struct rjpeg
68 {
69    uint8_t *buff_data;
70    void *empty;
71 };
72 
73 #ifdef _MSC_VER
74 #define RJPEG_HAS_LROTL
75 #endif
76 
77 #ifdef RJPEG_HAS_LROTL
78    #define rjpeg_lrot(x,y)  _lrotl(x,y)
79 #else
80    #define rjpeg_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
81 #endif
82 
83 /* x86/x64 detection */
84 #if defined(__x86_64__) || defined(_M_X64)
85 #define RJPEG__X64_TARGET
86 #elif defined(__i386) || defined(_M_IX86)
87 #define RJPEG__X86_TARGET
88 #endif
89 
90 #if defined(__GNUC__) && (defined(RJPEG__X86_TARGET) || defined(RJPEG__X64_TARGET)) && !defined(__SSE2__) && !defined(RJPEG_NO_SIMD)
91 /* NOTE: not clear whether we actually need this for the 64-bit path?
92  * gcc doesn't support sse2 intrinsics unless you compile with -msse2,
93  * (but compiling with -msse2 allows the compiler to use SSE2 everywhere;
94  * this is just broken and gcc are jerks for not fixing it properly
95  * http://www.virtualdub.org/blog/pivot/entry.php?id=363 )
96  */
97 #define RJPEG_NO_SIMD
98 #endif
99 
100 #if defined(__MINGW32__) && defined(RJPEG__X86_TARGET) && !defined(RJPEG_MINGW_ENABLE_SSE2) && !defined(RJPEG_NO_SIMD)
101 /* Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid RJPEG__X64_TARGET
102  *
103  * 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
104  * Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
105  * As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
106  * simultaneously enabling "-mstackrealign".
107  *
108  * See https://github.com/nothings/stb/issues/81 for more information.
109  *
110  * So default to no SSE2 on 32-bit MinGW. If you've read this far and added
111  * -mstackrealign to your build settings, feel free to #define RJPEG_MINGW_ENABLE_SSE2.
112  */
113 #define RJPEG_NO_SIMD
114 #endif
115 
116 #if defined(__SSE2__)
117 #include <emmintrin.h>
118 
119 #ifdef _MSC_VER
120 #define RJPEG_SIMD_ALIGN(type, name) __declspec(align(16)) type name
121 #else
122 #define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
123 #endif
124 
125 #endif
126 
127 /* ARM NEON */
128 #if defined(RJPEG_NO_SIMD) && defined(RJPEG_NEON)
129 #undef RJPEG_NEON
130 #endif
131 
132 #ifdef RJPEG_NEON
133 #include <arm_neon.h>
134 /* assume GCC or Clang on ARM targets */
135 #define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
136 #endif
137 
138 #ifndef RJPEG_SIMD_ALIGN
139 #define RJPEG_SIMD_ALIGN(type, name) type name
140 #endif
141 
142 typedef struct
143 {
144    uint32_t img_x, img_y;
145    int img_n, img_out_n;
146 
147    rjpeg_io_callbacks io;
148    void *io_user_data;
149 
150    int read_from_callbacks;
151    int buflen;
152    uint8_t buffer_start[128];
153 
154    uint8_t *img_buffer, *img_buffer_end;
155    uint8_t *img_buffer_original;
156 } rjpeg__context;
157 
158 static uint8_t *rjpeg__jpeg_load(rjpeg__context *s, unsigned *x, unsigned *y, int *comp, int req_comp);
159 
160 #define rjpeg__err(x,y)  0
161 
162 #define rjpeg__errpf(x,y)   ((float *) (rjpeg__err(x,y)?NULL:NULL))
163 #define rjpeg__errpuc(x,y)  ((unsigned char *) (rjpeg__err(x,y)?NULL:NULL))
164 
165 static int rjpeg__vertically_flip_on_load = 0;
166 
167 static unsigned char *rjpeg__load_flip(rjpeg__context *s, unsigned *x, unsigned *y, int *comp, int req_comp)
168 {
169    unsigned char *result = rjpeg__jpeg_load(s,x,y,comp,req_comp);
170 
171    if (rjpeg__vertically_flip_on_load && result != NULL)
172    {
173       int row,col,z;
174       int w     = *x, h = *y;
175       int depth = req_comp ? req_comp : *comp;
176 
177       for (row = 0; row < (h>>1); row++)
178       {
179          for (col = 0; col < w; col++)
180          {
181             for (z = 0; z < depth; z++)
182             {
183                uint8_t temp = result[(row * w + col) * depth + z];
184                result[(row * w + col) * depth + z] = result[((h - row - 1) * w + col) * depth + z];
185                result[((h - row - 1) * w + col) * depth + z] = temp;
186             }
187          }
188       }
189    }
190 
191    return result;
192 }
193 
194 static uint8_t *rjpeg_load_from_memory(const uint8_t *buffer, int len, unsigned *x, unsigned *y, int *comp, int req_comp)
195 {
196    rjpeg__context s;
197    s.io.read             = NULL;
198    s.read_from_callbacks = 0;
199    s.img_buffer          = s.img_buffer_original = (uint8_t *) buffer;
200    s.img_buffer_end      = (uint8_t *) buffer+len;
201    return rjpeg__load_flip(&s,x,y,comp,req_comp);
202 }
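/* Usage sketch (hypothetical caller, not part of this file): decode a JPEG
 * already held in memory into a tightly packed RGBA buffer, assuming 'buf'
 * and 'buf_len' hold the file contents:
 *
 *    unsigned w, h; int comp;
 *    uint8_t *pixels = rjpeg_load_from_memory(buf, (int)buf_len,
 *                                             &w, &h, &comp, RJPEG_RGB_ALPHA);
 *    if (pixels) { ...w*h*4 bytes of RGBA... }
 *
 * 'comp' receives the number of components actually stored in the file, while
 * req_comp forces the layout of the returned buffer; the buffer is heap
 * allocated by the loader (malloc-style, as in stb_image). */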
203 
204 enum
205 {
206    RJPEG_SCAN_LOAD = 0,
207    RJPEG_SCAN_TYPE,
208    RJPEG_SCAN_HEADER
209 };
210 
211 static void rjpeg__refill_buffer(rjpeg__context *s)
212 {
213    int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen);
214 
215    if (n == 0)
216    {
217       /* at end of file, treat same as if from memory, but need to handle case
218        * where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file */
219       s->read_from_callbacks = 0;
220       s->img_buffer = s->buffer_start;
221       s->img_buffer_end = s->buffer_start+1;
222       *s->img_buffer = 0;
223    }
224    else
225    {
226       s->img_buffer = s->buffer_start;
227       s->img_buffer_end = s->buffer_start + n;
228    }
229 }
230 
231 static INLINE uint8_t rjpeg__get8(rjpeg__context *s)
232 {
233    if (s->img_buffer < s->img_buffer_end)
234       return *s->img_buffer++;
235 
236    if (s->read_from_callbacks)
237    {
238       rjpeg__refill_buffer(s);
239       return *s->img_buffer++;
240    }
241 
242    return 0;
243 }
244 
245 static INLINE int rjpeg__at_eof(rjpeg__context *s)
246 {
247    if (s->io.read)
248    {
249       if (!(s->io.eof)(s->io_user_data))
250          return 0;
251 
252       /* if feof() is true, check if buffer = end
253        * special case: we've only got the special
254        * 0 character at the end */
255 
256       if (s->read_from_callbacks == 0)
257          return 1;
258    }
259 
260    return s->img_buffer >= s->img_buffer_end;
261 }
262 
263 static void rjpeg__skip(rjpeg__context *s, int n)
264 {
265    if (n < 0)
266    {
267       s->img_buffer = s->img_buffer_end;
268       return;
269    }
270 
271    if (s->io.read)
272    {
273       int blen = (int) (s->img_buffer_end - s->img_buffer);
274 
275       if (blen < n)
276       {
277          s->img_buffer = s->img_buffer_end;
278          (s->io.skip)(s->io_user_data, n - blen);
279          return;
280       }
281    }
282    s->img_buffer += n;
283 }
284 
285 static int rjpeg__get16be(rjpeg__context *s)
286 {
287    int z = rjpeg__get8(s);
288    return (z << 8) + rjpeg__get8(s);
289 }
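/* e.g. the two bytes 0x01 0x02 decode to 0x0102; JPEG stores all of its 16-bit
 * fields (segment lengths, image dimensions, restart intervals) in big-endian
 * order, which is what this helper reads. */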
290 
291 #define RJPEG__BYTECAST(x)  ((uint8_t) ((x) & 255))  /* truncate int to byte without warnings */
292 
293 /* huffman decoding acceleration */
294 #define FAST_BITS   9  /* larger handles more cases; smaller stomps less cache */
295 
296 typedef struct
297 {
298    uint8_t  fast[1 << FAST_BITS];
299    /* weirdly, repacking this into AoS is a 10% speed loss, instead of a win */
300    uint16_t code[256];
301    uint8_t  values[256];
302    uint8_t  size[257];
303    unsigned int maxcode[18];
304    int    delta[17];   /* old 'firstsymbol' - old 'firstcode' */
305 } rjpeg__huffman;
306 
307 typedef struct
308 {
309    rjpeg__context *s;
310    rjpeg__huffman huff_dc[4];
311    rjpeg__huffman huff_ac[4];
312    uint8_t dequant[4][64];
313    int16_t fast_ac[4][1 << FAST_BITS];
314 
315    /* sizes for components, interleaved MCUs */
316    int img_h_max, img_v_max;
317    int img_mcu_x, img_mcu_y;
318    int img_mcu_w, img_mcu_h;
319 
320    /* definition of jpeg image component */
321    struct
322    {
323       int id;
324       int h,v;
325       int tq;
326       int hd,ha;
327       int dc_pred;
328 
329       int x,y,w2,h2;
330       uint8_t *data;
331       void *raw_data, *raw_coeff;
332       uint8_t *linebuf;
333       short   *coeff;            /* progressive only */
334       int      coeff_w, coeff_h; /* number of 8x8 coefficient blocks */
335    } img_comp[4];
336 
337    uint32_t       code_buffer; /* jpeg entropy-coded buffer */
338    int            code_bits;   /* number of valid bits */
339    unsigned char  marker;      /* marker seen while filling entropy buffer */
340    int            nomore;      /* flag if we saw a marker so must stop */
341 
342    int            progressive;
343    int            spec_start;
344    int            spec_end;
345    int            succ_high;
346    int            succ_low;
347    int            eob_run;
348 
349    int scan_n, order[4];
350    int restart_interval, todo;
351 
352    /* kernels */
353    void (*idct_block_kernel)(uint8_t *out, int out_stride, short data[64]);
354    void (*YCbCr_to_RGB_kernel)(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step);
355    uint8_t *(*resample_row_hv_2_kernel)(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs);
356 } rjpeg__jpeg;
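/* The three function pointers above let the decoder swap in SIMD versions of
 * the hot loops (IDCT, YCbCr->RGB conversion, 2x2 chroma upsampling). They are
 * presumably filled in during decoder setup, pointing either at the generic C
 * implementations below or at the SSE2/NEON variants when the corresponding
 * CPU features are detected at runtime (features_cpu.h is included for that). */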
357 
358 #define rjpeg__f2f(x)  ((int) (((x) * 4096 + 0.5)))
359 #define rjpeg__fsh(x)  ((x) << 12)
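/* Both macros produce values scaled by 4096 (12 fractional bits of fixed
 * point): rjpeg__f2f converts a float constant, e.g.
 *    rjpeg__f2f(0.5411961f) == (int)(0.5411961*4096 + 0.5) == 2217,
 * and rjpeg__fsh(x) == x*4096 lifts an integer to the same scale. The IDCT
 * below does its arithmetic at this scale and shifts back down at the end. */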
360 
361 #define RJPEG__MARKER_none  0xff
362 /* if there's a pending marker from the entropy stream, return that
363  * otherwise, fetch from the stream and get a marker. if there's no
364  * marker, return 0xff, which is never a valid marker value
365  */
366 
367 /* in each scan, we'll have scan_n components, and the order
368  * of the components is specified by order[]
369  */
370 #define RJPEG__RESTART(x)     ((x) >= 0xd0 && (x) <= 0xd7)
371 
372 /* use comparisons since in some cases we handle more than one case (e.g. SOF) */
373 #define rjpeg__SOI(x)         ((x) == 0xd8)
374 #define rjpeg__EOI(x)         ((x) == 0xd9)
375 #define rjpeg__SOF(x)         ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
376 #define rjpeg__SOS(x)         ((x) == 0xda)
377 
378 #define rjpeg__SOF_progressive(x)   ((x) == 0xc2)
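/* In the byte stream every marker is a 0xff prefix followed by the code byte
 * tested above, e.g. ff d8 = SOI (start of image), ff c0/c1 = baseline or
 * extended sequential SOF, ff c2 = progressive SOF, ff da = SOS (start of
 * scan), ff d9 = EOI, and ff d0..ff d7 are the RSTn restart markers matched
 * by RJPEG__RESTART(). */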
379 #define rjpeg__div4(x)              ((uint8_t) ((x) >> 2))
380 #define rjpeg__div16(x)             ((uint8_t) ((x) >> 4))
381 
382 static int rjpeg__build_huffman(rjpeg__huffman *h, int *count)
383 {
384    int i,j,k=0,code;
385 
386    /* build size list for each symbol (from JPEG spec) */
387    for (i=0; i < 16; ++i)
388       for (j=0; j < count[i]; ++j)
389          h->size[k++] = (uint8_t) (i+1);
390 
391    h->size[k] = 0;
392    /* compute actual symbols (from jpeg spec) */
393    code       = 0;
394    k          = 0;
395 
396    for(j=1; j <= 16; ++j)
397    {
398       /* compute delta to add to code to compute symbol id */
399       h->delta[j] = k - code;
400       if (h->size[k] == j)
401       {
402          while (h->size[k] == j)
403             h->code[k++] = (uint16_t) (code++);
404          if (code-1 >= (1 << j))
405             return rjpeg__err("bad code lengths","Corrupt JPEG");
406       }
407       /* compute largest code + 1 for this size, preshifted as needed later */
408       h->maxcode[j] = code << (16-j);
409       code <<= 1;
410    }
411    h->maxcode[j] = 0xffffffff;
412 
413    /* build non-spec acceleration table; 255 is flag for not-accelerated */
414    memset(h->fast, 255, 1 << FAST_BITS);
415    for (i=0; i < k; ++i)
416    {
417       int s = h->size[i];
418       if (s <= FAST_BITS)
419       {
420          int c = h->code[i] << (FAST_BITS-s);
421          int m = 1 << (FAST_BITS-s);
422          for (j=0; j < m; ++j)
423             h->fast[c+j] = (uint8_t) i;
424       }
425    }
426    return 1;
427 }
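/* Worked example (illustrative only): with two symbols of length 2 and one of
 * length 3, the loop above assigns the canonical codes
 *    symbol 0: size 2, code 00
 *    symbol 1: size 2, code 01
 *    symbol 2: size 3, code 100
 * maxcode[2] then holds (2 << 14) and maxcode[3] holds (5 << 13), i.e. the
 * first code value that is too large for each length, pre-shifted so the
 * decoder can compare it against the top 16 bits of the bit buffer directly. */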
428 
429 /* build a table that decodes both magnitude and value of small ACs in
430  * one go. */
431 static void rjpeg__build_fast_ac(int16_t *fast_ac, rjpeg__huffman *h)
432 {
433    int i;
434 
435    for (i=0; i < (1 << FAST_BITS); ++i)
436    {
437       uint8_t fast = h->fast[i];
438 
439       fast_ac[i] = 0;
440 
441       if (fast < 255)
442       {
443          int rs      = h->values[fast];
444          int run     = (rs >> 4) & 15;
445          int magbits = rs & 15;
446          int len     = h->size[fast];
447 
448          if (magbits && len + magbits <= FAST_BITS)
449          {
450             /* magnitude code followed by receive_extend code */
451             int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
452             int m = 1 << (magbits - 1);
453             if (k < m)
454                k += (-1 << magbits) + 1;
455 
456             /* if the result is small enough, we can fit it in fast_ac table */
457             if (k >= -128 && k <= 127)
458                fast_ac[i] = (int16_t) ((k << 8) + (run << 4) + (len + magbits));
459          }
460       }
461    }
462 }
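/* Each non-zero fast_ac entry packs a fully decoded AC coefficient into 16 bits:
 *    bits  0..3   total bits consumed (huffman length + magnitude bits)
 *    bits  4..7   run of zero coefficients preceding this value
 *    bits  8..15  the signed coefficient value itself (-128..127)
 * e.g. 0x1F25 means "skip 2 zeros, coefficient 31, consume 5 bits". The
 * fast-AC path in rjpeg__jpeg_decode_block below unpacks it with exactly
 * these shifts. */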
463 
464 static void rjpeg__grow_buffer_unsafe(rjpeg__jpeg *j)
465 {
466    do
467    {
468       int b = j->nomore ? 0 : rjpeg__get8(j->s);
469       if (b == 0xff)
470       {
471          int c = rjpeg__get8(j->s);
472 
473          if (c != 0)
474          {
475             j->marker = (unsigned char) c;
476             j->nomore = 1;
477             return;
478          }
479       }
480       j->code_buffer |= b << (24 - j->code_bits);
481       j->code_bits += 8;
482    } while (j->code_bits <= 24);
483 }
484 
485 /* (1 << n) - 1 */
486 static uint32_t rjpeg__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
487 
488 /* decode a JPEG huffman value from the bitstream */
489 static INLINE int rjpeg__jpeg_huff_decode(rjpeg__jpeg *j, rjpeg__huffman *h)
490 {
491    unsigned int temp;
492    int c,k;
493 
494    if (j->code_bits < 16)
495       rjpeg__grow_buffer_unsafe(j);
496 
497    /* look at the top FAST_BITS and determine what symbol ID it is,
498     * if the code is <= FAST_BITS */
499    c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
500    k = h->fast[c];
501 
502    if (k < 255)
503    {
504       int s = h->size[k];
505       if (s > j->code_bits)
506          return -1;
507       j->code_buffer <<= s;
508       j->code_bits -= s;
509       return h->values[k];
510    }
511 
512    /* naive test is to shift the code_buffer down so k bits are
513     * valid, then test against maxcode. To speed this up, we've
514     * preshifted maxcode left so that it has (16-k) 0s at the
515     * end; in other words, regardless of the number of bits, it
516     * wants to be compared against something shifted to have 16;
517     * that way we don't need to shift inside the loop. */
518    temp = j->code_buffer >> 16;
519    for (k=FAST_BITS+1 ; ; ++k)
520       if (temp < h->maxcode[k])
521          break;
522 
523    if (k == 17)
524    {
525       /* error! code not found */
526       j->code_bits -= 16;
527       return -1;
528    }
529 
530    if (k > j->code_bits)
531       return -1;
532 
533    /* convert the huffman code to the symbol id */
534    c = ((j->code_buffer >> (32 - k)) & rjpeg__bmask[k]) + h->delta[k];
535    retro_assert((((j->code_buffer) >> (32 - h->size[c])) & rjpeg__bmask[h->size[c]]) == h->code[c]);
536 
537    /* convert the id to a symbol */
538    j->code_bits -= k;
539    j->code_buffer <<= k;
540    return h->values[c];
541 }
542 
543 /* bias[n] = (-1<<n) + 1 */
544 static int const rjpeg__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
545 
546 /* combined JPEG 'receive' and JPEG 'extend', since baseline
547  * always extends everything it receives. */
548 static INLINE int rjpeg__extend_receive(rjpeg__jpeg *j, int n)
549 {
550    unsigned int k;
551    int sgn;
552    if (j->code_bits < n)
553       rjpeg__grow_buffer_unsafe(j);
554 
555    sgn = (int32_t)j->code_buffer >> 31; /* sign bit is always in MSB */
556    k = rjpeg_lrot(j->code_buffer, n);
557    retro_assert(n >= 0 && n < (int) (sizeof(rjpeg__bmask)/sizeof(*rjpeg__bmask)));
558    j->code_buffer = k & ~rjpeg__bmask[n];
559    k &= rjpeg__bmask[n];
560    j->code_bits -= n;
561    return k + (rjpeg__jbias[n] & ~sgn);
562 }
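/* This implements the JPEG "EXTEND" rule: an n-bit magnitude value whose top
 * bit is 0 encodes a negative number. For n = 3, raw bits 110 decode to +6,
 * while raw bits 010 (top bit clear) decode to 2 + rjpeg__jbias[3] = 2 - 7 = -5.
 * The branch-free version above derives the bias mask from the sign bit of the
 * rotated code buffer instead of testing the received value explicitly. */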
563 
564 /* get some unsigned bits */
565 static INLINE int rjpeg__jpeg_get_bits(rjpeg__jpeg *j, int n)
566 {
567    unsigned int k;
568    if (j->code_bits < n) rjpeg__grow_buffer_unsafe(j);
569    k = rjpeg_lrot(j->code_buffer, n);
570    j->code_buffer = k & ~rjpeg__bmask[n];
571    k &= rjpeg__bmask[n];
572    j->code_bits -= n;
573    return k;
574 }
575 
576 static INLINE int rjpeg__jpeg_get_bit(rjpeg__jpeg *j)
577 {
578    unsigned int k;
579    if (j->code_bits < 1) rjpeg__grow_buffer_unsafe(j);
580    k = j->code_buffer;
581    j->code_buffer <<= 1;
582    --j->code_bits;
583    return k & 0x80000000;
584 }
585 
586 /* given a value that's at position X in the zigzag stream,
587  * where does it appear in the 8x8 matrix coded as row-major? */
588 static uint8_t rjpeg__jpeg_dezigzag[64+15] =
589 {
590     0,  1,  8, 16,  9,  2,  3, 10,
591    17, 24, 32, 25, 18, 11,  4,  5,
592    12, 19, 26, 33, 40, 48, 41, 34,
593    27, 20, 13,  6,  7, 14, 21, 28,
594    35, 42, 49, 56, 57, 50, 43, 36,
595    29, 22, 15, 23, 30, 37, 44, 51,
596    58, 59, 52, 45, 38, 31, 39, 46,
597    53, 60, 61, 54, 47, 55, 62, 63,
598    /* let corrupt input sample past end */
599    63, 63, 63, 63, 63, 63, 63, 63,
600    63, 63, 63, 63, 63, 63, 63
601 };
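/* Example: the third coefficient in zigzag order (k == 2) lands at row-major
 * index 8, i.e. row 1, column 0 of the 8x8 block. The 15 extra trailing 63s
 * exist because a corrupt stream can advance k past 63 before the loop bound
 * is checked, and clamping those reads to the last entry is cheaper than an
 * explicit test. */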
602 
603 /* decode one 64-entry block-- */
604 static int rjpeg__jpeg_decode_block(
605       rjpeg__jpeg *j, short data[64],
606       rjpeg__huffman *hdc,
607       rjpeg__huffman *hac,
608       int16_t *fac,
609       int b,
610       uint8_t *dequant)
611 {
612    int diff,dc,k;
613    int t;
614 
615    if (j->code_bits < 16)
616       rjpeg__grow_buffer_unsafe(j);
617    t = rjpeg__jpeg_huff_decode(j, hdc);
618    if (t < 0)
619       return rjpeg__err("bad huffman code","Corrupt JPEG");
620 
621    /* 0 all the ac values now so we can do it 32-bits at a time */
622    memset(data,0,64*sizeof(data[0]));
623 
624    diff = t ? rjpeg__extend_receive(j, t) : 0;
625    dc = j->img_comp[b].dc_pred + diff;
626    j->img_comp[b].dc_pred = dc;
627    data[0] = (short) (dc * dequant[0]);
628 
629    /* decode AC components, see JPEG spec */
630    k = 1;
631    do
632    {
633       unsigned int zig;
634       int c,r,s;
635       if (j->code_bits < 16)
636          rjpeg__grow_buffer_unsafe(j);
637       c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
638       r = fac[c];
639       if (r)
640       {
641          /* fast-AC path */
642          k += (r >> 4) & 15; /* run */
643          s = r & 15;         /* combined length */
644          j->code_buffer <<= s;
645          j->code_bits -= s;
646          /* decode into unzigzag'd location */
647          zig = rjpeg__jpeg_dezigzag[k++];
648          data[zig] = (short) ((r >> 8) * dequant[zig]);
649       }
650       else
651       {
652          int rs = rjpeg__jpeg_huff_decode(j, hac);
653          if (rs < 0)
654             return rjpeg__err("bad huffman code","Corrupt JPEG");
655          s = rs & 15;
656          r = rs >> 4;
657          if (s == 0)
658          {
659             if (rs != 0xf0)
660                break; /* end block */
661             k += 16;
662          }
663          else
664          {
665             k += r;
666             /* decode into unzigzag'd location */
667             zig = rjpeg__jpeg_dezigzag[k++];
668             data[zig] = (short) (rjpeg__extend_receive(j,s) * dequant[zig]);
669          }
670       }
671    } while (k < 64);
672    return 1;
673 }
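/* Reading the slow path above: each huffman symbol 'rs' is a (run, size) pair,
 * e.g. rs == 0x23 means "skip 2 zero coefficients, then read a 3-bit magnitude
 * via rjpeg__extend_receive". rs == 0x00 is EOB (the rest of the block is zero)
 * and rs == 0xf0 (ZRL) is a run of 16 zeros with no coefficient attached. */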
674 
675 static int rjpeg__jpeg_decode_block_prog_dc(
676       rjpeg__jpeg *j,
677       short data[64],
678       rjpeg__huffman *hdc,
679       int b)
680 {
681    if (j->spec_end != 0)
682       return rjpeg__err("can't merge dc and ac", "Corrupt JPEG");
683 
684    if (j->code_bits < 16)
685       rjpeg__grow_buffer_unsafe(j);
686 
687    if (j->succ_high == 0)
688    {
689       int t;
690       int diff,dc;
691 
692       /* first scan for DC coefficient, must be first */
693       memset(data,0,64*sizeof(data[0])); /* 0 all the ac values now */
694       t = rjpeg__jpeg_huff_decode(j, hdc);
695       diff = t ? rjpeg__extend_receive(j, t) : 0;
696 
697       dc = j->img_comp[b].dc_pred + diff;
698       j->img_comp[b].dc_pred = dc;
699       data[0] = (short) (dc << j->succ_low);
700    }
701    else
702    {
703       /* refinement scan for DC coefficient */
704       if (rjpeg__jpeg_get_bit(j))
705          data[0] += (short) (1 << j->succ_low);
706    }
707    return 1;
708 }
709 
710 static int rjpeg__jpeg_decode_block_prog_ac(
711       rjpeg__jpeg *j,
712       short data[64],
713       rjpeg__huffman *hac,
714       int16_t *fac)
715 {
716    int k;
717    if (j->spec_start == 0)
718       return rjpeg__err("can't merge dc and ac", "Corrupt JPEG");
719 
720    if (j->succ_high == 0)
721    {
722       int shift = j->succ_low;
723 
724       if (j->eob_run)
725       {
726          --j->eob_run;
727          return 1;
728       }
729 
730       k = j->spec_start;
731       do {
732          unsigned int zig;
733          int c,r,s;
734          if (j->code_bits < 16) rjpeg__grow_buffer_unsafe(j);
735          c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
736          r = fac[c];
737          if (r)
738          { /* fast-AC path */
739             k += (r >> 4) & 15; /* run */
740             s = r & 15;         /* combined length */
741             j->code_buffer <<= s;
742             j->code_bits -= s;
743             zig = rjpeg__jpeg_dezigzag[k++];
744             data[zig] = (short) ((r >> 8) << shift);
745          }
746          else
747          {
748             int rs = rjpeg__jpeg_huff_decode(j, hac);
749             if (rs < 0) return rjpeg__err("bad huffman code","Corrupt JPEG");
750             s = rs & 15;
751             r = rs >> 4;
752             if (s == 0)
753             {
754                if (r < 15)
755                {
756                   j->eob_run = (1 << r);
757                   if (r)
758                      j->eob_run += rjpeg__jpeg_get_bits(j, r);
759                   --j->eob_run;
760                   break;
761                }
762                k += 16;
763             } else {
764                k += r;
765                zig = rjpeg__jpeg_dezigzag[k++];
766                data[zig] = (short) (rjpeg__extend_receive(j,s) << shift);
767             }
768          }
769       } while (k <= j->spec_end);
770    } else {
771       /* refinement scan for these AC coefficients */
772 
773       short bit = (short) (1 << j->succ_low);
774 
775       if (j->eob_run)
776       {
777          --j->eob_run;
778          for (k = j->spec_start; k <= j->spec_end; ++k)
779          {
780             short *p = &data[rjpeg__jpeg_dezigzag[k]];
781             if (*p != 0)
782                if (rjpeg__jpeg_get_bit(j))
783                   if ((*p & bit)==0)
784                   {
785                      if (*p > 0)
786                         *p += bit;
787                      else
788                         *p -= bit;
789                   }
790          }
791       } else {
792          k = j->spec_start;
793          do {
794             int r,s;
795             int rs = rjpeg__jpeg_huff_decode(j, hac);
796             if (rs < 0) return rjpeg__err("bad huffman code","Corrupt JPEG");
797             s = rs & 15;
798             r = rs >> 4;
799             if (s == 0)
800             {
801                if (r < 15)
802                {
803                   j->eob_run = (1 << r) - 1;
804                   if (r)
805                      j->eob_run += rjpeg__jpeg_get_bits(j, r);
806                   r = 64; /* force end of block */
807                } else {
808                   /* r=15 s=0 should write 16 0s, so we just do
809                    * a run of 15 0s and then write s (which is 0),
810                    * so we don't have to do anything special here */
811                }
812             } else {
813                if (s != 1) return rjpeg__err("bad huffman code", "Corrupt JPEG");
814                /* sign bit */
815                if (rjpeg__jpeg_get_bit(j))
816                   s = bit;
817                else
818                   s = -bit;
819             }
820 
821             /* advance by r */
822             while (k <= j->spec_end)
823             {
824                short *p = &data[rjpeg__jpeg_dezigzag[k++]];
825                if (*p != 0)
826                {
827                   if (rjpeg__jpeg_get_bit(j))
828                      if ((*p & bit)==0)
829                      {
830                         if (*p > 0)
831                            *p += bit;
832                         else
833                            *p -= bit;
834                      }
835                }
836                else
837                {
838                   if (r == 0)
839                   {
840                      *p = (short) s;
841                      break;
842                   }
843                   --r;
844                }
845             }
846          } while (k <= j->spec_end);
847       }
848    }
849    return 1;
850 }
851 
852 /* take a -128..127 value, clamp it, and convert to 0..255 */
853 static INLINE uint8_t rjpeg__clamp(int x)
854 {
855    /* trick to use a single test to catch both cases */
856    if ((unsigned int) x > 255)
857       return (x < 0) ? 0 : 255;
858    return (uint8_t) x;
859 }
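/* The single unsigned compare works because a negative int converts to a huge
 * unsigned value, e.g. x == -3 becomes 0xfffffffd, which is > 255 and is
 * therefore routed into the clamping branch along with genuine overshoots. */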
860 
861 
862 /* derived from jidctint -- DCT_ISLOW */
863 #define RJPEG__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
864    int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
865    p2 = s2;                                    \
866    p3 = s6;                                    \
867    p1 = (p2+p3) * rjpeg__f2f(0.5411961f);       \
868    t2 = p1 + p3*rjpeg__f2f(-1.847759065f);      \
869    t3 = p1 + p2*rjpeg__f2f( 0.765366865f);      \
870    p2 = s0;                                    \
871    p3 = s4;                                    \
872    t0 = rjpeg__fsh(p2+p3);                      \
873    t1 = rjpeg__fsh(p2-p3);                      \
874    x0 = t0+t3;                                 \
875    x3 = t0-t3;                                 \
876    x1 = t1+t2;                                 \
877    x2 = t1-t2;                                 \
878    t0 = s7;                                    \
879    t1 = s5;                                    \
880    t2 = s3;                                    \
881    t3 = s1;                                    \
882    p3 = t0+t2;                                 \
883    p4 = t1+t3;                                 \
884    p1 = t0+t3;                                 \
885    p2 = t1+t2;                                 \
886    p5 = (p3+p4)*rjpeg__f2f( 1.175875602f);      \
887    t0 = t0*rjpeg__f2f( 0.298631336f);           \
888    t1 = t1*rjpeg__f2f( 2.053119869f);           \
889    t2 = t2*rjpeg__f2f( 3.072711026f);           \
890    t3 = t3*rjpeg__f2f( 1.501321110f);           \
891    p1 = p5 + p1*rjpeg__f2f(-0.899976223f);      \
892    p2 = p5 + p2*rjpeg__f2f(-2.562915447f);      \
893    p3 = p3*rjpeg__f2f(-1.961570560f);           \
894    p4 = p4*rjpeg__f2f(-0.390180644f);           \
895    t3 += p1+p4;                                \
896    t2 += p2+p3;                                \
897    t1 += p2+p4;                                \
898    t0 += p1+p3;
899 
900 static void rjpeg__idct_block(uint8_t *out, int out_stride, short data[64])
901 {
902    int i,val[64],*v=val;
903    uint8_t   *o = NULL;
904    int16_t   *d = data;
905 
906    /* columns */
907    for (i=0; i < 8; ++i,++d, ++v)
908    {
909       /* if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing */
910       if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
911            && d[40]==0 && d[48]==0 && d[56]==0)
912       {
913          /*    no shortcut                 0     seconds
914           *    (1|2|3|4|5|6|7)==0          0     seconds
915           *    all separate               -0.047 seconds
916           *    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds */
917          int dcterm = d[0] << 2;
918          v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
919       }
920       else
921       {
922          RJPEG__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
923          /* constants scaled things up by 1<<12; let's bring them back
924           * down, but keep 2 extra bits of precision */
925          x0 += 512; x1 += 512; x2 += 512; x3 += 512;
926          v[ 0] = (x0+t3) >> 10;
927          v[56] = (x0-t3) >> 10;
928          v[ 8] = (x1+t2) >> 10;
929          v[48] = (x1-t2) >> 10;
930          v[16] = (x2+t1) >> 10;
931          v[40] = (x2-t1) >> 10;
932          v[24] = (x3+t0) >> 10;
933          v[32] = (x3-t0) >> 10;
934       }
935    }
936 
937    for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride)
938    {
939       /* no fast case since the first 1D IDCT spread components out */
940       RJPEG__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
941          /* constants scaled things up by 1<<12, plus we had 1<<2 from first
942           * loop, plus horizontal and vertical each scale by sqrt(8) so together
943           * we've got an extra 1<<3, so 1<<17 total we need to remove.
944           * so we want to round that, which means adding 0.5 * 1<<17,
945           * aka 65536. Also, we'll end up with -128 to 127 that we want
946           * to encode as 0..255 by adding 128, so we'll add that before the shift */
947          x0 += 65536 + (128<<17);
948       x1 += 65536 + (128<<17);
949       x2 += 65536 + (128<<17);
950       x3 += 65536 + (128<<17);
951       /* tried computing the shifts into temps, or'ing the temps to see
952        * if any were out of range, but that was slower */
953       o[0] = rjpeg__clamp((x0+t3) >> 17);
954       o[7] = rjpeg__clamp((x0-t3) >> 17);
955       o[1] = rjpeg__clamp((x1+t2) >> 17);
956       o[6] = rjpeg__clamp((x1-t2) >> 17);
957       o[2] = rjpeg__clamp((x2+t1) >> 17);
958       o[5] = rjpeg__clamp((x2-t1) >> 17);
959       o[3] = rjpeg__clamp((x3+t0) >> 17);
960       o[4] = rjpeg__clamp((x3-t0) >> 17);
961    }
962 }
963 
964 #if defined(__SSE2__)
965 /* sse2 integer IDCT. not the fastest possible implementation but it
966  * produces bit-identical results to the generic C version so it's
967  * fully "transparent".
968  */
969 static void rjpeg__idct_simd(uint8_t *out, int out_stride, short data[64])
970 {
971    /* This is constructed to match our regular (generic) integer IDCT exactly. */
972    __m128i row0, row1, row2, row3, row4, row5, row6, row7;
973    __m128i tmp;
974 
975    /* dot product constant: even elems=x, odd elems=y */
976    #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
977 
978    /* out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
979     * out(1) = c1[even]*x + c1[odd]*y
980     */
981    #define dct_rot(out0,out1, x,y,c0,c1) \
982       __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
983       __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
984       __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
985       __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
986       __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
987       __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
988 
989    /* out = in << 12  (in 16-bit, out 32-bit) */
990    #define dct_widen(out, in) \
991       __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
992       __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
993 
994    /* wide add */
995    #define dct_wadd(out, a, b) \
996       __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
997       __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
998 
999    /* wide sub */
1000    #define dct_wsub(out, a, b) \
1001       __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
1002       __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
1003 
1004    /* butterfly a/b, add bias, then shift by "s" and pack */
1005    #define dct_bfly32o(out0, out1, a,b,bias,s) \
1006       { \
1007          __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
1008          __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
1009          dct_wadd(sum, abiased, b); \
1010          dct_wsub(dif, abiased, b); \
1011          out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
1012          out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
1013       }
1014 
1015    /* 8-bit interleave step (for transposes) */
1016    #define dct_interleave8(a, b) \
1017       tmp = a; \
1018       a = _mm_unpacklo_epi8(a, b); \
1019       b = _mm_unpackhi_epi8(tmp, b)
1020 
1021    /* 16-bit interleave step (for transposes) */
1022    #define dct_interleave16(a, b) \
1023       tmp = a; \
1024       a = _mm_unpacklo_epi16(a, b); \
1025       b = _mm_unpackhi_epi16(tmp, b)
1026 
1027    #define dct_pass(bias,shift) \
1028       { \
1029          /* even part */ \
1030          dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
1031          __m128i sum04 = _mm_add_epi16(row0, row4); \
1032          __m128i dif04 = _mm_sub_epi16(row0, row4); \
1033          dct_widen(t0e, sum04); \
1034          dct_widen(t1e, dif04); \
1035          dct_wadd(x0, t0e, t3e); \
1036          dct_wsub(x3, t0e, t3e); \
1037          dct_wadd(x1, t1e, t2e); \
1038          dct_wsub(x2, t1e, t2e); \
1039          /* odd part */ \
1040          dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
1041          dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
1042          __m128i sum17 = _mm_add_epi16(row1, row7); \
1043          __m128i sum35 = _mm_add_epi16(row3, row5); \
1044          dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
1045          dct_wadd(x4, y0o, y4o); \
1046          dct_wadd(x5, y1o, y5o); \
1047          dct_wadd(x6, y2o, y5o); \
1048          dct_wadd(x7, y3o, y4o); \
1049          dct_bfly32o(row0,row7, x0,x7,bias,shift); \
1050          dct_bfly32o(row1,row6, x1,x6,bias,shift); \
1051          dct_bfly32o(row2,row5, x2,x5,bias,shift); \
1052          dct_bfly32o(row3,row4, x3,x4,bias,shift); \
1053       }
1054 
1055    __m128i rot0_0 = dct_const(rjpeg__f2f(0.5411961f), rjpeg__f2f(0.5411961f) + rjpeg__f2f(-1.847759065f));
1056    __m128i rot0_1 = dct_const(rjpeg__f2f(0.5411961f) + rjpeg__f2f( 0.765366865f), rjpeg__f2f(0.5411961f));
1057    __m128i rot1_0 = dct_const(rjpeg__f2f(1.175875602f) + rjpeg__f2f(-0.899976223f), rjpeg__f2f(1.175875602f));
1058    __m128i rot1_1 = dct_const(rjpeg__f2f(1.175875602f), rjpeg__f2f(1.175875602f) + rjpeg__f2f(-2.562915447f));
1059    __m128i rot2_0 = dct_const(rjpeg__f2f(-1.961570560f) + rjpeg__f2f( 0.298631336f), rjpeg__f2f(-1.961570560f));
1060    __m128i rot2_1 = dct_const(rjpeg__f2f(-1.961570560f), rjpeg__f2f(-1.961570560f) + rjpeg__f2f( 3.072711026f));
1061    __m128i rot3_0 = dct_const(rjpeg__f2f(-0.390180644f) + rjpeg__f2f( 2.053119869f), rjpeg__f2f(-0.390180644f));
1062    __m128i rot3_1 = dct_const(rjpeg__f2f(-0.390180644f), rjpeg__f2f(-0.390180644f) + rjpeg__f2f( 1.501321110f));
1063 
1064    /* rounding biases in column/row passes, see rjpeg__idct_block for explanation. */
1065    __m128i bias_0 = _mm_set1_epi32(512);
1066    __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
1067 
1068    /* load */
1069    row0 = _mm_load_si128((const __m128i *) (data + 0*8));
1070    row1 = _mm_load_si128((const __m128i *) (data + 1*8));
1071    row2 = _mm_load_si128((const __m128i *) (data + 2*8));
1072    row3 = _mm_load_si128((const __m128i *) (data + 3*8));
1073    row4 = _mm_load_si128((const __m128i *) (data + 4*8));
1074    row5 = _mm_load_si128((const __m128i *) (data + 5*8));
1075    row6 = _mm_load_si128((const __m128i *) (data + 6*8));
1076    row7 = _mm_load_si128((const __m128i *) (data + 7*8));
1077 
1078    /* column pass */
1079    dct_pass(bias_0, 10);
1080 
1081    {
1082       /* 16bit 8x8 transpose pass 1 */
1083       dct_interleave16(row0, row4);
1084       dct_interleave16(row1, row5);
1085       dct_interleave16(row2, row6);
1086       dct_interleave16(row3, row7);
1087 
1088       /* transpose pass 2 */
1089       dct_interleave16(row0, row2);
1090       dct_interleave16(row1, row3);
1091       dct_interleave16(row4, row6);
1092       dct_interleave16(row5, row7);
1093 
1094       /* transpose pass 3 */
1095       dct_interleave16(row0, row1);
1096       dct_interleave16(row2, row3);
1097       dct_interleave16(row4, row5);
1098       dct_interleave16(row6, row7);
1099    }
1100 
1101    /* row pass */
1102    dct_pass(bias_1, 17);
1103 
1104    {
1105       /* pack */
1106       __m128i p0 = _mm_packus_epi16(row0, row1); /* a0a1a2a3...a7b0b1b2b3...b7 */
1107       __m128i p1 = _mm_packus_epi16(row2, row3);
1108       __m128i p2 = _mm_packus_epi16(row4, row5);
1109       __m128i p3 = _mm_packus_epi16(row6, row7);
1110 
1111       /* 8bit 8x8 transpose pass 1 */
1112       dct_interleave8(p0, p2); /* a0e0a1e1... */
1113       dct_interleave8(p1, p3); /* c0g0c1g1... */
1114 
1115       /* transpose pass 2 */
1116       dct_interleave8(p0, p1); /* a0c0e0g0... */
1117       dct_interleave8(p2, p3); /* b0d0f0h0... */
1118 
1119       /* transpose pass 3 */
1120       dct_interleave8(p0, p2); /* a0b0c0d0... */
1121       dct_interleave8(p1, p3); /* a4b4c4d4... */
1122 
1123       /* store */
1124       _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
1125       _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
1126       _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
1127       _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
1128       _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
1129       _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
1130       _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
1131       _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
1132    }
1133 
1134 #undef dct_const
1135 #undef dct_rot
1136 #undef dct_widen
1137 #undef dct_wadd
1138 #undef dct_wsub
1139 #undef dct_bfly32o
1140 #undef dct_interleave8
1141 #undef dct_interleave16
1142 #undef dct_pass
1143 }
1144 
1145 #endif
1146 
1147 #ifdef RJPEG_NEON
1148 
1149 /* NEON integer IDCT. should produce bit-identical
1150  * results to the generic C version. */
1151 static void rjpeg__idct_simd(uint8_t *out, int out_stride, short data[64])
1152 {
1153    int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
1154 
1155    int16x4_t rot0_0 = vdup_n_s16(rjpeg__f2f(0.5411961f));
1156    int16x4_t rot0_1 = vdup_n_s16(rjpeg__f2f(-1.847759065f));
1157    int16x4_t rot0_2 = vdup_n_s16(rjpeg__f2f( 0.765366865f));
1158    int16x4_t rot1_0 = vdup_n_s16(rjpeg__f2f( 1.175875602f));
1159    int16x4_t rot1_1 = vdup_n_s16(rjpeg__f2f(-0.899976223f));
1160    int16x4_t rot1_2 = vdup_n_s16(rjpeg__f2f(-2.562915447f));
1161    int16x4_t rot2_0 = vdup_n_s16(rjpeg__f2f(-1.961570560f));
1162    int16x4_t rot2_1 = vdup_n_s16(rjpeg__f2f(-0.390180644f));
1163    int16x4_t rot3_0 = vdup_n_s16(rjpeg__f2f( 0.298631336f));
1164    int16x4_t rot3_1 = vdup_n_s16(rjpeg__f2f( 2.053119869f));
1165    int16x4_t rot3_2 = vdup_n_s16(rjpeg__f2f( 3.072711026f));
1166    int16x4_t rot3_3 = vdup_n_s16(rjpeg__f2f( 1.501321110f));
1167 
1168 #define dct_long_mul(out, inq, coeff) \
1169    int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
1170    int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
1171 
1172 #define dct_long_mac(out, acc, inq, coeff) \
1173    int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
1174    int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
1175 
1176 #define dct_widen(out, inq) \
1177    int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
1178    int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
1179 
1180 /* wide add */
1181 #define dct_wadd(out, a, b) \
1182    int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
1183    int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
1184 
1185 /* wide sub */
1186 #define dct_wsub(out, a, b) \
1187    int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
1188    int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
1189 
1190 /* butterfly a/b, then shift using "shiftop" by "s" and pack */
1191 #define dct_bfly32o(out0,out1, a,b,shiftop,s) \
1192    { \
1193       dct_wadd(sum, a, b); \
1194       dct_wsub(dif, a, b); \
1195       out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
1196       out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
1197    }
1198 
1199 #define dct_pass(shiftop, shift) \
1200    { \
1201       /* even part */ \
1202       int16x8_t sum26 = vaddq_s16(row2, row6); \
1203       dct_long_mul(p1e, sum26, rot0_0); \
1204       dct_long_mac(t2e, p1e, row6, rot0_1); \
1205       dct_long_mac(t3e, p1e, row2, rot0_2); \
1206       int16x8_t sum04 = vaddq_s16(row0, row4); \
1207       int16x8_t dif04 = vsubq_s16(row0, row4); \
1208       dct_widen(t0e, sum04); \
1209       dct_widen(t1e, dif04); \
1210       dct_wadd(x0, t0e, t3e); \
1211       dct_wsub(x3, t0e, t3e); \
1212       dct_wadd(x1, t1e, t2e); \
1213       dct_wsub(x2, t1e, t2e); \
1214       /* odd part */ \
1215       int16x8_t sum15 = vaddq_s16(row1, row5); \
1216       int16x8_t sum17 = vaddq_s16(row1, row7); \
1217       int16x8_t sum35 = vaddq_s16(row3, row5); \
1218       int16x8_t sum37 = vaddq_s16(row3, row7); \
1219       int16x8_t sumodd = vaddq_s16(sum17, sum35); \
1220       dct_long_mul(p5o, sumodd, rot1_0); \
1221       dct_long_mac(p1o, p5o, sum17, rot1_1); \
1222       dct_long_mac(p2o, p5o, sum35, rot1_2); \
1223       dct_long_mul(p3o, sum37, rot2_0); \
1224       dct_long_mul(p4o, sum15, rot2_1); \
1225       dct_wadd(sump13o, p1o, p3o); \
1226       dct_wadd(sump24o, p2o, p4o); \
1227       dct_wadd(sump23o, p2o, p3o); \
1228       dct_wadd(sump14o, p1o, p4o); \
1229       dct_long_mac(x4, sump13o, row7, rot3_0); \
1230       dct_long_mac(x5, sump24o, row5, rot3_1); \
1231       dct_long_mac(x6, sump23o, row3, rot3_2); \
1232       dct_long_mac(x7, sump14o, row1, rot3_3); \
1233       dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
1234       dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
1235       dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
1236       dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
1237    }
1238 
1239    /* load */
1240    row0 = vld1q_s16(data + 0*8);
1241    row1 = vld1q_s16(data + 1*8);
1242    row2 = vld1q_s16(data + 2*8);
1243    row3 = vld1q_s16(data + 3*8);
1244    row4 = vld1q_s16(data + 4*8);
1245    row5 = vld1q_s16(data + 5*8);
1246    row6 = vld1q_s16(data + 6*8);
1247    row7 = vld1q_s16(data + 7*8);
1248 
1249    /* add DC bias */
1250    row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
1251 
1252    /* column pass */
1253    dct_pass(vrshrn_n_s32, 10);
1254 
1255    /* 16bit 8x8 transpose */
1256    {
1257 /* these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
1258  * whether compilers actually get this is another story, sadly. */
1259 #define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
1260 #define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
1261 #define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
1262 
1263       /* pass 1 */
1264       dct_trn16(row0, row1); /* a0b0a2b2a4b4a6b6 */
1265       dct_trn16(row2, row3);
1266       dct_trn16(row4, row5);
1267       dct_trn16(row6, row7);
1268 
1269       /* pass 2 */
1270       dct_trn32(row0, row2); /* a0b0c0d0a4b4c4d4 */
1271       dct_trn32(row1, row3);
1272       dct_trn32(row4, row6);
1273       dct_trn32(row5, row7);
1274 
1275       /* pass 3 */
1276       dct_trn64(row0, row4); /* a0b0c0d0e0f0g0h0 */
1277       dct_trn64(row1, row5);
1278       dct_trn64(row2, row6);
1279       dct_trn64(row3, row7);
1280 
1281 #undef dct_trn16
1282 #undef dct_trn32
1283 #undef dct_trn64
1284    }
1285 
1286    /* row pass
1287     * vrshrn_n_s32 only supports shifts up to 16, we need
1288     * 17. so do a non-rounding shift of 16 first then follow
1289     * up with a rounding shift by 1. */
1290    dct_pass(vshrn_n_s32, 16);
1291 
1292    {
1293       /* pack and round */
1294       uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
1295       uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
1296       uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
1297       uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
1298       uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
1299       uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
1300       uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
1301       uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
1302 
1303       /* again, these can translate into one instruction, but often don't. */
1304 #define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
1305 #define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
1306 #define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
1307 
1308       /* sadly can't use interleaved stores here since we only write
1309        * 8 bytes to each scan line! */
1310 
1311       /* 8x8 8-bit transpose pass 1 */
1312       dct_trn8_8(p0, p1);
1313       dct_trn8_8(p2, p3);
1314       dct_trn8_8(p4, p5);
1315       dct_trn8_8(p6, p7);
1316 
1317       /* pass 2 */
1318       dct_trn8_16(p0, p2);
1319       dct_trn8_16(p1, p3);
1320       dct_trn8_16(p4, p6);
1321       dct_trn8_16(p5, p7);
1322 
1323       /* pass 3 */
1324       dct_trn8_32(p0, p4);
1325       dct_trn8_32(p1, p5);
1326       dct_trn8_32(p2, p6);
1327       dct_trn8_32(p3, p7);
1328 
1329       /* store */
1330       vst1_u8(out, p0); out += out_stride;
1331       vst1_u8(out, p1); out += out_stride;
1332       vst1_u8(out, p2); out += out_stride;
1333       vst1_u8(out, p3); out += out_stride;
1334       vst1_u8(out, p4); out += out_stride;
1335       vst1_u8(out, p5); out += out_stride;
1336       vst1_u8(out, p6); out += out_stride;
1337       vst1_u8(out, p7);
1338 
1339 #undef dct_trn8_8
1340 #undef dct_trn8_16
1341 #undef dct_trn8_32
1342    }
1343 
1344 #undef dct_long_mul
1345 #undef dct_long_mac
1346 #undef dct_widen
1347 #undef dct_wadd
1348 #undef dct_wsub
1349 #undef dct_bfly32o
1350 #undef dct_pass
1351 }
1352 
1353 #endif /* RJPEG_NEON */
1354 
1355 static uint8_t rjpeg__get_marker(rjpeg__jpeg *j)
1356 {
1357    uint8_t x;
1358    if (j->marker != RJPEG__MARKER_none)
1359    {
1360       x = j->marker;
1361       j->marker = RJPEG__MARKER_none;
1362       return x;
1363    }
1364 
1365    x = rjpeg__get8(j->s);
1366    if (x != 0xff)
1367       return RJPEG__MARKER_none;
1368    while (x == 0xff)
1369       x = rjpeg__get8(j->s);
1370    return x;
1371 }
1372 
1373 
1374 /* after a restart interval, rjpeg__jpeg_reset the entropy decoder and
1375  * the dc prediction
1376  */
1377 static void rjpeg__jpeg_reset(rjpeg__jpeg *j)
1378 {
1379    j->code_bits = 0;
1380    j->code_buffer = 0;
1381    j->nomore = 0;
1382    j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = 0;
1383    j->marker = RJPEG__MARKER_none;
1384    j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
1385    j->eob_run = 0;
1386    /* no more than 1<<31 MCUs if no restart_interval? that's plenty safe,
1387     * since we don't even allow 1<<30 pixels */
1388 }
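/* Restart markers: when a DRI segment sets restart_interval, the encoder emits
 * one of the RSTn markers (0xd0..0xd7) after every restart_interval MCUs, and
 * the decoder must discard any buffered bits and reset the DC predictors, which
 * is exactly what this function does; 'todo' counts MCUs down to the next
 * expected restart point. */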
1389 
1390 static int rjpeg__parse_entropy_coded_data(rjpeg__jpeg *z)
1391 {
1392    rjpeg__jpeg_reset(z);
1393    if (!z->progressive)
1394    {
1395       if (z->scan_n == 1)
1396       {
1397          int i,j;
1398          RJPEG_SIMD_ALIGN(short, data[64]);
1399          int n = z->order[0];
1400          /* non-interleaved data, we just need to process one block at a time,
1401           * in trivial scanline order
1402           * number of blocks to do just depends on how many actual "pixels" this
1403           * component has, independent of interleaved MCU blocking and such */
1404          int w = (z->img_comp[n].x+7) >> 3;
1405          int h = (z->img_comp[n].y+7) >> 3;
1406 
1407          for (j=0; j < h; ++j)
1408          {
1409             for (i=0; i < w; ++i)
1410             {
1411                int ha = z->img_comp[n].ha;
1412                if (!rjpeg__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
1413                z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
1414                /* every data block is an MCU, so countdown the restart interval */
1415                if (--z->todo <= 0)
1416                {
1417                   if (z->code_bits < 24) rjpeg__grow_buffer_unsafe(z);
1418                   /* if it's NOT a restart, then just bail,
1419                    * so we get corrupt data rather than no data */
1420                   if (!RJPEG__RESTART(z->marker)) return 1;
1421                   rjpeg__jpeg_reset(z);
1422                }
1423             }
1424          }
1425       }
1426       else
1427       {
1428          /* interleaved */
1429          int i,j,k,x,y;
1430          RJPEG_SIMD_ALIGN(short, data[64]);
1431          for (j=0; j < z->img_mcu_y; ++j)
1432          {
1433             for (i=0; i < z->img_mcu_x; ++i)
1434             {
1435                /* scan an interleaved mcu...
1436                 * process scan_n components in order */
1437                for (k=0; k < z->scan_n; ++k)
1438                {
1439                   int n = z->order[k];
1440                   /* scan out an mcu's worth of this component;
1441                    * that's just determined by the basic H
1442                    * and V specified for the component */
1443                   for (y=0; y < z->img_comp[n].v; ++y)
1444                   {
1445                      for (x=0; x < z->img_comp[n].h; ++x)
1446                      {
1447                         int x2 = (i*z->img_comp[n].h + x)*8;
1448                         int y2 = (j*z->img_comp[n].v + y)*8;
1449                         int ha = z->img_comp[n].ha;
1450                         if (!rjpeg__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq]))
1451                            return 0;
1452                         z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data);
1453                      }
1454                   }
1455                }
1456                /* after all interleaved components, that's an interleaved MCU,
1457                 * so now count down the restart interval */
1458                if (--z->todo <= 0)
1459                {
1460                   if (z->code_bits < 24) rjpeg__grow_buffer_unsafe(z);
1461                   if (!RJPEG__RESTART(z->marker)) return 1;
1462                   rjpeg__jpeg_reset(z);
1463                }
1464             }
1465          }
1466       }
1467       return 1;
1468    }
1469    else
1470    {
1471       if (z->scan_n == 1)
1472       {
1473          int i,j;
1474          int n = z->order[0];
1475          int w = (z->img_comp[n].x+7) >> 3;
1476          int h = (z->img_comp[n].y+7) >> 3;
1477 
1478          /* non-interleaved data, we just need to process one block at a time,
1479           * in trivial scanline order
1480           * number of blocks to do just depends on how many actual "pixels" this
1481           * component has, independent of interleaved MCU blocking and such */
1482 
1483          for (j=0; j < h; ++j)
1484          {
1485             for (i=0; i < w; ++i)
1486             {
1487                short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
1488                if (z->spec_start == 0)
1489                {
1490                   if (!rjpeg__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
1491                      return 0;
1492                } else {
1493                   int ha = z->img_comp[n].ha;
1494                   if (!rjpeg__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
1495                      return 0;
1496                }
1497 
1498                /* every data block is an MCU, so countdown the restart interval */
1499                if (--z->todo <= 0)
1500                {
1501                   if (z->code_bits < 24) rjpeg__grow_buffer_unsafe(z);
1502                   if (!RJPEG__RESTART(z->marker)) return 1;
1503                   rjpeg__jpeg_reset(z);
1504                }
1505             }
1506          }
1507       }
1508       else
1509       {
1510          /* interleaved */
1511          int i,j,k,x,y;
1512 
1513          for (j=0; j < z->img_mcu_y; ++j)
1514          {
1515             for (i=0; i < z->img_mcu_x; ++i)
1516             {
1517                /* scan an interleaved MCU... process scan_n components in order */
1518                for (k=0; k < z->scan_n; ++k)
1519                {
1520                   int n = z->order[k];
1521                   /* scan out an MCU's worth of this component; that's just determined
1522                    * by the basic H and V specified for the component */
1523                   for (y=0; y < z->img_comp[n].v; ++y)
1524                   {
1525                      for (x=0; x < z->img_comp[n].h; ++x)
1526                      {
1527                         int x2 = (i*z->img_comp[n].h + x);
1528                         int y2 = (j*z->img_comp[n].v + y);
1529                         short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
1530                         if (!rjpeg__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
1531                            return 0;
1532                      }
1533                   }
1534                }
1535                /* after all interleaved components, that's an interleaved MCU,
1536                 * so now count down the restart interval */
1537                if (--z->todo <= 0)
1538                {
1539                   if (z->code_bits < 24) rjpeg__grow_buffer_unsafe(z);
1540                   if (!RJPEG__RESTART(z->marker)) return 1;
1541                   rjpeg__jpeg_reset(z);
1542                }
1543             }
1544          }
1545       }
1546       return 1;
1547    }
1548 }
1549 
static void rjpeg__jpeg_dequantize(short *data, uint8_t *dequant)
1551 {
1552    int i;
1553    for (i=0; i < 64; ++i)
1554       data[i] *= dequant[i];
1555 }
1556 
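/* In a progressive JPEG the scans above only accumulate DCT coefficients in
 * img_comp[n].coeff; nothing is dequantized or run through the IDCT until the
 * whole file has been parsed.  rjpeg__jpeg_finish() performs that final pass. */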
static void rjpeg__jpeg_finish(rjpeg__jpeg *z)
1558 {
1559    if (z->progressive)
1560    {
1561       /* dequantize and IDCT the data */
1562       int i,j,n;
1563       for (n=0; n < z->s->img_n; ++n)
1564       {
1565          int w = (z->img_comp[n].x+7) >> 3;
1566          int h = (z->img_comp[n].y+7) >> 3;
1567          for (j=0; j < h; ++j)
1568          {
1569             for (i=0; i < w; ++i)
1570             {
1571                short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
1572                rjpeg__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
1573                z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
1574             }
1575          }
1576       }
1577    }
1578 }
1579 
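/* Every marker segment handled below starts with a 16-bit big-endian length
 * field that counts itself, hence the recurring rjpeg__get16be(z->s) - 2 for
 * the remaining payload size. */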
static int rjpeg__process_marker(rjpeg__jpeg *z, int m)
1581 {
1582    int L;
1583    switch (m)
1584    {
1585       case RJPEG__MARKER_none: /* no marker found */
1586          return rjpeg__err("expected marker","Corrupt JPEG");
1587 
1588       case 0xDD: /* DRI - specify restart interval */
1589          if (rjpeg__get16be(z->s) != 4) return rjpeg__err("bad DRI len","Corrupt JPEG");
1590          z->restart_interval = rjpeg__get16be(z->s);
1591          return 1;
1592 
1593       case 0xDB: /* DQT - define quantization table */
1594          L = rjpeg__get16be(z->s)-2;
1595          while (L > 0)
1596          {
1597             int q = rjpeg__get8(z->s);
1598             int p = q >> 4;
1599             int t = q & 15,i;
1600             if (p != 0)
1601                return rjpeg__err("bad DQT type","Corrupt JPEG");
1602             if (t > 3)
1603                return rjpeg__err("bad DQT table","Corrupt JPEG");
1604             for (i=0; i < 64; ++i)
1605                z->dequant[t][rjpeg__jpeg_dezigzag[i]] = rjpeg__get8(z->s);
1606             L -= 65;
1607          }
1608          return L==0;
1609 
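      /* a single DHT segment can pack several tables; each one is a Tc/Th byte
       * (class 0 = DC, 1 = AC; id 0-3), 16 code-length counts, then the symbol
       * values, i.e. 17 + n bytes per table */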
1610       case 0xC4: /* DHT - define huffman table */
1611          L = rjpeg__get16be(z->s)-2;
1612          while (L > 0)
1613          {
1614             int sizes[16],i,n=0;
1615             uint8_t *v = NULL;
1616             int q      = rjpeg__get8(z->s);
1617             int tc     = q >> 4;
1618             int th     = q & 15;
1619             if (tc > 1 || th > 3)
1620                return rjpeg__err("bad DHT header","Corrupt JPEG");
1621 
1622             for (i=0; i < 16; ++i)
1623             {
1624                sizes[i] = rjpeg__get8(z->s);
1625                n += sizes[i];
1626             }
1627             L -= 17;
1628 
1629             if (tc == 0)
1630             {
1631                if (!rjpeg__build_huffman(z->huff_dc+th, sizes))
1632                   return 0;
1633                v = z->huff_dc[th].values;
1634             }
1635             else
1636             {
1637                if (!rjpeg__build_huffman(z->huff_ac+th, sizes))
1638                   return 0;
1639                v = z->huff_ac[th].values;
1640             }
1641             for (i=0; i < n; ++i)
1642                v[i] = rjpeg__get8(z->s);
1643             if (tc != 0)
1644                rjpeg__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
1645             L -= n;
1646          }
1647          return L==0;
1648    }
1649 
1650    /* check for comment block or APP blocks */
1651    if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE)
1652    {
1653       rjpeg__skip(z->s, rjpeg__get16be(z->s)-2);
1654       return 1;
1655    }
1656    return 0;
1657 }
1658 
1659 /* after we see SOS */
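/* SOS payload layout: length, component count, then per component an id plus a
 * packed DC/AC Huffman table selector, followed by spectral selection start/end
 * and the successive-approximation high/low nibbles (only meaningful for
 * progressive scans). */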
static int rjpeg__process_scan_header(rjpeg__jpeg *z)
1661 {
1662    int i;
1663    int Ls    = rjpeg__get16be(z->s);
1664 
1665    z->scan_n = rjpeg__get8(z->s);
1666 
1667    if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n)
1668       return rjpeg__err("bad SOS component count","Corrupt JPEG");
1669    if (Ls != 6+2*z->scan_n)
1670       return rjpeg__err("bad SOS len","Corrupt JPEG");
1671 
1672    for (i=0; i < z->scan_n; ++i)
1673    {
1674       int id = rjpeg__get8(z->s), which;
1675       int q  = rjpeg__get8(z->s);
1676 
1677       for (which = 0; which < z->s->img_n; ++which)
1678          if (z->img_comp[which].id == id)
1679             break;
1680       if (which == z->s->img_n)
1681          return 0; /* no match */
1682 
      z->img_comp[which].hd = q >> 4;
      if (z->img_comp[which].hd > 3)
         return rjpeg__err("bad DC huff","Corrupt JPEG");
      z->img_comp[which].ha = q & 15;
      if (z->img_comp[which].ha > 3)
         return rjpeg__err("bad AC huff","Corrupt JPEG");
1687       z->order[i] = which;
1688    }
1689 
1690    {
1691       int aa;
1692       z->spec_start = rjpeg__get8(z->s);
1693       z->spec_end   = rjpeg__get8(z->s); /* should be 63, but might be 0 */
1694       aa = rjpeg__get8(z->s);
1695       z->succ_high = (aa >> 4);
1696       z->succ_low  = (aa & 15);
1697       if (z->progressive)
1698       {
1699          if (  z->spec_start > 63 ||
1700                z->spec_end > 63   ||
1701                z->spec_start > z->spec_end ||
1702                z->succ_high > 13           ||
1703                z->succ_low > 13)
1704             return rjpeg__err("bad SOS", "Corrupt JPEG");
1705       }
1706       else
1707       {
1708          if (z->spec_start != 0)
1709             return rjpeg__err("bad SOS","Corrupt JPEG");
1710          if (z->succ_high != 0 || z->succ_low != 0)
1711             return rjpeg__err("bad SOS","Corrupt JPEG");
1712          z->spec_end = 63;
1713       }
1714    }
1715 
1716    return 1;
1717 }
1718 
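/* SOF0/SOF2 payload layout: length, sample precision, image height, image
 * width, component count, then per component an id, packed H/V sampling
 * factors and a quantization table index; the reads below follow that order. */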
static int rjpeg__process_frame_header(rjpeg__jpeg *z, int scan)
1720 {
1721    rjpeg__context *s = z->s;
1722    int Lf,p,i,q, h_max=1,v_max=1,c;
1723    Lf = rjpeg__get16be(s);
1724 
1725    /* JPEG */
1726    if (Lf < 11)
1727       return rjpeg__err("bad SOF len","Corrupt JPEG");
1728 
1729    p  = rjpeg__get8(s);
1730 
1731    /* JPEG baseline */
1732    if (p != 8)
1733       return rjpeg__err("only 8-bit","JPEG format not supported: 8-bit only");
1734 
1735    s->img_y = rjpeg__get16be(s);
1736 
1737    /* Legal, but we don't handle it--but neither does IJG */
1738    if (s->img_y == 0)
1739       return rjpeg__err("no header height", "JPEG format not supported: delayed height");
1740 
1741    s->img_x = rjpeg__get16be(s);
1742 
1743    if (s->img_x == 0)
1744       return rjpeg__err("0 width","Corrupt JPEG"); /* JPEG requires */
1745 
1746    c = rjpeg__get8(s);
1747 
1748    /* JFIF requires */
1749    if (c != 3 && c != 1)
1750       return rjpeg__err("bad component count","Corrupt JPEG");
1751 
1752    s->img_n = c;
1753 
1754    for (i=0; i < c; ++i)
1755    {
1756       z->img_comp[i].data = NULL;
1757       z->img_comp[i].linebuf = NULL;
1758    }
1759 
1760    if (Lf != 8+3*s->img_n)
1761       return rjpeg__err("bad SOF len","Corrupt JPEG");
1762 
1763    for (i=0; i < s->img_n; ++i)
1764    {
1765       z->img_comp[i].id = rjpeg__get8(s);
1766       if (z->img_comp[i].id != i+1)   /* JFIF requires */
1767          if (z->img_comp[i].id != i)  /* some version of jpegtran outputs non-JFIF-compliant files! */
1768             return rjpeg__err("bad component ID","Corrupt JPEG");
1769       q = rjpeg__get8(s);
1770       z->img_comp[i].h = (q >> 4);
1771       if (!z->img_comp[i].h || z->img_comp[i].h > 4)
1772          return rjpeg__err("bad H","Corrupt JPEG");
1773       z->img_comp[i].v = q & 15;
1774       if (!z->img_comp[i].v || z->img_comp[i].v > 4)
1775          return rjpeg__err("bad V","Corrupt JPEG");
1776       z->img_comp[i].tq = rjpeg__get8(s);
1777       if (z->img_comp[i].tq > 3)
1778          return rjpeg__err("bad TQ","Corrupt JPEG");
1779    }
1780 
1781    if (scan != RJPEG_SCAN_LOAD) return 1;
1782 
1783    if ((1 << 30) / s->img_x / s->img_n < s->img_y) return rjpeg__err("too large", "Image too large to decode");
1784 
1785    for (i=0; i < s->img_n; ++i)
1786    {
1787       if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
1788       if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
1789    }
1790 
1791    /* compute interleaved MCU info */
1792    z->img_h_max = h_max;
1793    z->img_v_max = v_max;
1794    z->img_mcu_w = h_max * 8;
1795    z->img_mcu_h = v_max * 8;
1796    z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
1797    z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
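   /* e.g. a 33x17 4:2:0 image gives h_max = v_max = 2, 16x16 MCUs and
    * img_mcu_x = 3, img_mcu_y = 2; the luma buffer allocated below is then
    * padded out to w2 = 48, h2 = 32 even though only 33x17 pixels are real */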
1798 
1799    for (i=0; i < s->img_n; ++i)
1800    {
1801       /* number of effective pixels (e.g. for non-interleaved MCU) */
1802       z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
1803       z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
1804       /* to simplify generation, we'll allocate enough memory to decode
1805        * the bogus oversized data from using interleaved MCUs and their
1806        * big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
1807        * discard the extra data until colorspace conversion */
1808       z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
1809       z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
1810       z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
1811 
1812       if (z->img_comp[i].raw_data == NULL)
1813       {
1814          for(--i; i >= 0; --i)
1815          {
1816             free(z->img_comp[i].raw_data);
1817             z->img_comp[i].data = NULL;
1818          }
1819          return rjpeg__err("outofmem", "Out of memory");
1820       }
1821 
1822       /* align blocks for IDCT using MMX/SSE */
1823       z->img_comp[i].data = (uint8_t*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
1824       z->img_comp[i].linebuf = NULL;
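      /* progressive files additionally keep every block's raw DCT coefficients
       * (64 shorts per 8x8 block) so later scans can refine them; coeff_w and
       * coeff_h are the component's dimensions measured in blocks */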
1825       if (z->progressive)
1826       {
1827          z->img_comp[i].coeff_w = (z->img_comp[i].w2 + 7) >> 3;
1828          z->img_comp[i].coeff_h = (z->img_comp[i].h2 + 7) >> 3;
1829          z->img_comp[i].raw_coeff = malloc(z->img_comp[i].coeff_w * z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);
1830          z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
1831       } else {
1832          z->img_comp[i].coeff = 0;
1833          z->img_comp[i].raw_coeff = 0;
1834       }
1835    }
1836 
1837    return 1;
1838 }
1839 
1840 
static int rjpeg__decode_jpeg_header(rjpeg__jpeg *z, int scan)
1842 {
1843    int m;
1844    z->marker = RJPEG__MARKER_none; /* initialize cached marker to empty */
1845    m = rjpeg__get_marker(z);
1846 
1847    if (!rjpeg__SOI(m))
1848       return rjpeg__err("no SOI","Corrupt JPEG");
1849 
1850    if (scan == RJPEG_SCAN_TYPE)
1851       return 1;
1852 
1853    m = rjpeg__get_marker(z);
1854    while (!rjpeg__SOF(m))
1855    {
1856       if (!rjpeg__process_marker(z,m))
1857          return 0;
1858       m = rjpeg__get_marker(z);
1859       while (m == RJPEG__MARKER_none)
1860       {
1861          /* some files have extra padding after their blocks, so ok, we'll scan */
1862          if (rjpeg__at_eof(z->s))
1863             return rjpeg__err("no SOF", "Corrupt JPEG");
1864          m = rjpeg__get_marker(z);
1865       }
1866    }
1867    z->progressive = rjpeg__SOF_progressive(m);
1868    if (!rjpeg__process_frame_header(z, scan)) return 0;
1869    return 1;
1870 }
1871 
1872 /* decode image to YCbCr format */
static int rjpeg__decode_jpeg_image(rjpeg__jpeg *j)
1874 {
1875    int m;
1876    for (m = 0; m < 4; m++)
1877    {
1878       j->img_comp[m].raw_data = NULL;
1879       j->img_comp[m].raw_coeff = NULL;
1880    }
1881    j->restart_interval = 0;
1882    if (!rjpeg__decode_jpeg_header(j, RJPEG_SCAN_LOAD))
1883       return 0;
1884    m = rjpeg__get_marker(j);
1885 
1886    while (!rjpeg__EOI(m))
1887    {
1888       if (rjpeg__SOS(m))
1889       {
1890          if (!rjpeg__process_scan_header(j))
1891             return 0;
1892          if (!rjpeg__parse_entropy_coded_data(j))
1893             return 0;
1894 
1895          if (j->marker == RJPEG__MARKER_none )
1896          {
1897             /* handle 0s at the end of image data from IP Kamera 9060 */
1898             while (!rjpeg__at_eof(j->s))
1899             {
1900                int x = rjpeg__get8(j->s);
1901                if (x == 255)
1902                {
1903                   j->marker = rjpeg__get8(j->s);
1904                   break;
1905                }
1906                else if (x != 0)
1907                   return rjpeg__err("junk before marker", "Corrupt JPEG");
1908             }
1909             /* if we reach eof without hitting a marker, rjpeg__get_marker() below will fail and we'll eventually return 0 */
1910          }
1911       }
1912       else
1913       {
1914          if (!rjpeg__process_marker(j, m))
1915             return 0;
1916       }
1917       m = rjpeg__get_marker(j);
1918    }
1919 
1920    if (j->progressive)
1921       rjpeg__jpeg_finish(j);
1922    return 1;
1923 }
1924 
1925 /* static jfif-centered resampling (across block boundaries) */
1926 
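/* The 2x upsamplers below blend each output sample 3:1 between its nearest and
 * next-nearest source samples, out = (3*near + far + 2) / 4, matching JFIF's
 * convention that chroma samples sit centered between luma samples. */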
static uint8_t *rjpeg_resample_row_1(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
1928 {
1929    (void)out;
1930    (void)in_far;
1931    (void)w;
1932    (void)hs;
1933    return in_near;
1934 }
1935 
static uint8_t *rjpeg__resample_row_v_2(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
1937 {
1938    /* need to generate two samples vertically for every one in input */
1939    int i;
1940    (void)hs;
1941    for (i=0; i < w; ++i)
1942       out[i] = rjpeg__div4(3*in_near[i] + in_far[i] + 2);
1943    return out;
1944 }
1945 
static uint8_t *rjpeg__resample_row_h_2(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
1947 {
1948    /* need to generate two samples horizontally for every one in input */
1949    int i;
1950    uint8_t *input = in_near;
1951 
1952    if (w == 1)
1953    {
1954       /* if only one sample, can't do any interpolation */
1955       out[0] = out[1] = input[0];
1956       return out;
1957    }
1958 
1959    out[0] = input[0];
1960    out[1] = rjpeg__div4(input[0]*3 + input[1] + 2);
1961 
1962    for (i=1; i < w-1; ++i)
1963    {
1964       int n = 3*input[i]+2;
1965       out[i*2+0] = rjpeg__div4(n+input[i-1]);
1966       out[i*2+1] = rjpeg__div4(n+input[i+1]);
1967    }
1968    out[i*2+0] = rjpeg__div4(input[w-2]*3 + input[w-1] + 2);
1969    out[i*2+1] = input[w-1];
1970 
1971    (void)in_far;
1972    (void)hs;
1973 
1974    return out;
1975 }
1976 
1977 
static uint8_t *rjpeg__resample_row_hv_2(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
1979 {
1980    /* need to generate 2x2 samples for every one in input */
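   /* t1 below holds the vertical 3:1 blend (3*in_near + in_far); combining two
    * of them as (3*t0 + t1 + 8) / 16 gives each output pixel bilinear weights
    * of 9/16, 3/16, 3/16 and 1/16 over its four source neighbours */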
1981    int i,t0,t1;
1982    if (w == 1)
1983    {
1984       out[0] = out[1] = rjpeg__div4(3*in_near[0] + in_far[0] + 2);
1985       return out;
1986    }
1987 
1988    t1     = 3*in_near[0] + in_far[0];
1989    out[0] = rjpeg__div4(t1+2);
1990    for (i=1; i < w; ++i)
1991    {
1992       t0 = t1;
1993       t1 = 3*in_near[i]+in_far[i];
1994       out[i*2-1] = rjpeg__div16(3*t0 + t1 + 8);
1995       out[i*2  ] = rjpeg__div16(3*t1 + t0 + 8);
1996    }
1997    out[w*2-1] = rjpeg__div4(t1+2);
1998 
1999    (void)hs;
2000 
2001    return out;
2002 }
2003 
2004 #if defined(__SSE2__) || defined(RJPEG_NEON)
static uint8_t *rjpeg__resample_row_hv_2_simd(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
2006 {
2007    /* need to generate 2x2 samples for every one in input */
2008    int i=0,t0,t1;
2009 
2010    if (w == 1)
2011    {
2012       out[0] = out[1] = rjpeg__div4(3*in_near[0] + in_far[0] + 2);
2013       return out;
2014    }
2015 
2016    t1 = 3*in_near[0] + in_far[0];
2017    /* process groups of 8 pixels for as long as we can.
2018     * note we can't handle the last pixel in a row in this loop
2019     * because we need to handle the filter boundary conditions.
2020     */
2021    for (; i < ((w-1) & ~7); i += 8)
2022    {
2023 #if defined(__SSE2__)
2024       /* load and perform the vertical filtering pass
2025        * this uses 3*x + y = 4*x + (y - x) */
2026       __m128i zero  = _mm_setzero_si128();
2027       __m128i farb  = _mm_loadl_epi64((__m128i *) (in_far + i));
2028       __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
2029       __m128i farw  = _mm_unpacklo_epi8(farb, zero);
2030       __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
2031       __m128i diff  = _mm_sub_epi16(farw, nearw);
2032       __m128i nears = _mm_slli_epi16(nearw, 2);
2033       __m128i curr  = _mm_add_epi16(nears, diff); /* current row */
2034 
      /* horizontal filter works the same based on shifted versions of the current
2036        * row. "prev" is current row shifted right by 1 pixel; we need to
2037        * insert the previous pixel value (from t1).
2038        * "next" is current row shifted left by 1 pixel, with first pixel
2039        * of next block of 8 pixels added in.
2040        */
2041       __m128i prv0 = _mm_slli_si128(curr, 2);
2042       __m128i nxt0 = _mm_srli_si128(curr, 2);
2043       __m128i prev = _mm_insert_epi16(prv0, t1, 0);
2044       __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);
2045 
2046       /* horizontal filter, polyphase implementation since it's convenient:
2047        * even pixels = 3*cur + prev = cur*4 + (prev - cur)
2048        * odd  pixels = 3*cur + next = cur*4 + (next - cur)
2049        * note the shared term. */
2050       __m128i bias = _mm_set1_epi16(8);
2051       __m128i curs = _mm_slli_epi16(curr, 2);
2052       __m128i prvd = _mm_sub_epi16(prev, curr);
2053       __m128i nxtd = _mm_sub_epi16(next, curr);
2054       __m128i curb = _mm_add_epi16(curs, bias);
2055       __m128i even = _mm_add_epi16(prvd, curb);
2056       __m128i odd  = _mm_add_epi16(nxtd, curb);
2057 
2058       /* interleave even and odd pixels, then undo scaling. */
2059       __m128i int0 = _mm_unpacklo_epi16(even, odd);
2060       __m128i int1 = _mm_unpackhi_epi16(even, odd);
2061       __m128i de0  = _mm_srli_epi16(int0, 4);
2062       __m128i de1  = _mm_srli_epi16(int1, 4);
2063 
2064       /* pack and write output */
2065       __m128i outv = _mm_packus_epi16(de0, de1);
2066       _mm_storeu_si128((__m128i *) (out + i*2), outv);
2067 #elif defined(RJPEG_NEON)
2068       /* load and perform the vertical filtering pass
2069        * this uses 3*x + y = 4*x + (y - x) */
2070       uint8x8_t farb  = vld1_u8(in_far + i);
2071       uint8x8_t nearb = vld1_u8(in_near + i);
2072       int16x8_t diff  = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
2073       int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
2074       int16x8_t curr  = vaddq_s16(nears, diff); /* current row */
2075 
      /* horizontal filter works the same based on shifted versions of the current
2077        * row. "prev" is current row shifted right by 1 pixel; we need to
2078        * insert the previous pixel value (from t1).
2079        * "next" is current row shifted left by 1 pixel, with first pixel
2080        * of next block of 8 pixels added in. */
2081       int16x8_t prv0 = vextq_s16(curr, curr, 7);
2082       int16x8_t nxt0 = vextq_s16(curr, curr, 1);
2083       int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
2084       int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);
2085 
2086       /* horizontal filter, polyphase implementation since it's convenient:
2087        * even pixels = 3*cur + prev = cur*4 + (prev - cur)
2088        * odd  pixels = 3*cur + next = cur*4 + (next - cur)
2089        * note the shared term.
2090        */
2091       int16x8_t curs = vshlq_n_s16(curr, 2);
2092       int16x8_t prvd = vsubq_s16(prev, curr);
2093       int16x8_t nxtd = vsubq_s16(next, curr);
2094       int16x8_t even = vaddq_s16(curs, prvd);
2095       int16x8_t odd  = vaddq_s16(curs, nxtd);
2096 
2097       /* undo scaling and round, then store with even/odd phases interleaved */
2098       uint8x8x2_t o;
2099       o.val[0] = vqrshrun_n_s16(even, 4);
2100       o.val[1] = vqrshrun_n_s16(odd,  4);
2101       vst2_u8(out + i*2, o);
2102 #endif
2103 
2104       /* "previous" value for next iteration */
2105       t1 = 3*in_near[i+7] + in_far[i+7];
2106    }
2107 
2108    t0       = t1;
2109    t1       = 3*in_near[i] + in_far[i];
2110    out[i*2] = rjpeg__div16(3*t1 + t0 + 8);
2111 
2112    for (++i; i < w; ++i)
2113    {
2114       t0         = t1;
2115       t1         = 3*in_near[i]+in_far[i];
2116       out[i*2-1] = rjpeg__div16(3*t0 + t1 + 8);
2117       out[i*2  ] = rjpeg__div16(3*t1 + t0 + 8);
2118    }
2119    out[w*2-1] = rjpeg__div4(t1+2);
2120 
2121    (void)hs;
2122 
2123    return out;
2124 }
2125 #endif
2126 
static uint8_t *rjpeg__resample_row_generic(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
2128 {
2129    /* resample with nearest-neighbor */
2130    int i,j;
2131    (void)in_far;
2132 
2133    for (i=0; i < w; ++i)
2134       for (j=0; j < hs; ++j)
2135          out[i*hs+j] = in_near[i];
2136    return out;
2137 }
2138 
2139 /* this is a reduced-precision calculation of YCbCr-to-RGB introduced
2140  * to make sure the code produces the same results in both SIMD and scalar */
2141 #ifndef float2fixed
2142 #define float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
2143 #endif
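/* float2fixed(1.40200f) is 1.402 rounded to 12 fractional bits and shifted up
 * by another 8, i.e. a 20-fractional-bit constant; y is promoted with y << 20
 * (plus 1 << 19 for rounding), the products are summed in that fixed-point
 * space and the final >> 20 returns to 8-bit samples.  The & 0xffff0000 mask on
 * the Cb green term discards low-order bits so the scalar path matches the
 * truncation of the 16-bit SIMD multiplies. */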
2144 
static void rjpeg__YCbCr_to_RGB_row(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step)
2146 {
2147    int i;
2148    for (i=0; i < count; ++i)
2149    {
2150       int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
2151       int cr = pcr[i] - 128;
2152       int cb = pcb[i] - 128;
2153       int r = y_fixed +  cr* float2fixed(1.40200f);
2154       int g = y_fixed + (cr*-float2fixed(0.71414f)) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
2155       int b = y_fixed                               +   cb* float2fixed(1.77200f);
2156       r >>= 20;
2157       g >>= 20;
2158       b >>= 20;
2159       if ((unsigned) r > 255)
2160          r = 255;
2161       if ((unsigned) g > 255)
2162          g = 255;
2163       if ((unsigned) b > 255)
2164          b = 255;
2165       out[0] = (uint8_t)r;
2166       out[1] = (uint8_t)g;
2167       out[2] = (uint8_t)b;
2168       out[3] = 255;
2169       out += step;
2170    }
2171 }
2172 
2173 #if defined(__SSE2__) || defined(RJPEG_NEON)
static void rjpeg__YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step)
2175 {
2176    int i = 0;
2177 
2178 #if defined(__SSE2__)
2179    /* step == 3 is pretty ugly on the final interleave, and i'm not convinced
2180     * it's useful in practice (you wouldn't use it for textures, for example).
2181     * so just accelerate step == 4 case.
2182     */
2183    if (step == 4)
2184    {
2185       /* this is a fairly straightforward implementation and not super-optimized. */
2186       __m128i signflip  = _mm_set1_epi8(-0x80);
2187       __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
2188       __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
2189       __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
2190       __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
2191       __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
2192       __m128i xw = _mm_set1_epi16(255); /* alpha channel */
2193 
2194       for (; i+7 < count; i += 8)
2195       {
2196          /* load */
2197          __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
2198          __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
2199          __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
2200          __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); /* -128 */
2201          __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); /* -128 */
2202 
2203          /* unpack to short (and left-shift cr, cb by 8) */
2204          __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
2205          __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
2206          __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
2207 
2208          /* color transform */
2209          __m128i yws = _mm_srli_epi16(yw, 4);
2210          __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
2211          __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
2212          __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
2213          __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
2214          __m128i rws = _mm_add_epi16(cr0, yws);
2215          __m128i gwt = _mm_add_epi16(cb0, yws);
2216          __m128i bws = _mm_add_epi16(yws, cb1);
2217          __m128i gws = _mm_add_epi16(gwt, cr1);
2218 
2219          /* descale */
2220          __m128i rw = _mm_srai_epi16(rws, 4);
2221          __m128i bw = _mm_srai_epi16(bws, 4);
2222          __m128i gw = _mm_srai_epi16(gws, 4);
2223 
2224          /* back to byte, set up for transpose */
2225          __m128i brb = _mm_packus_epi16(rw, bw);
2226          __m128i gxb = _mm_packus_epi16(gw, xw);
2227 
2228          /* transpose to interleave channels */
2229          __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
2230          __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
2231          __m128i o0 = _mm_unpacklo_epi16(t0, t1);
2232          __m128i o1 = _mm_unpackhi_epi16(t0, t1);
2233 
2234          /* store */
2235          _mm_storeu_si128((__m128i *) (out + 0), o0);
2236          _mm_storeu_si128((__m128i *) (out + 16), o1);
2237          out += 32;
2238       }
2239    }
2240 #endif
2241 
2242 #ifdef RJPEG_NEON
2243    /* in this version, step=3 support would be easy to add. but is there demand? */
2244    if (step == 4)
2245    {
2246       /* this is a fairly straightforward implementation and not super-optimized. */
2247       uint8x8_t signflip = vdup_n_u8(0x80);
2248       int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
2249       int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
2250       int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
2251       int16x8_t cb_const1 = vdupq_n_s16(   (short) ( 1.77200f*4096.0f+0.5f));
2252 
2253       for (; i+7 < count; i += 8)
2254       {
2255          uint8x8x4_t o;
2256 
2257          /* load */
2258          uint8x8_t y_bytes  = vld1_u8(y + i);
2259          uint8x8_t cr_bytes = vld1_u8(pcr + i);
2260          uint8x8_t cb_bytes = vld1_u8(pcb + i);
2261          int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
2262          int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
2263 
2264          /* expand to s16 */
2265          int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
2266          int16x8_t crw = vshll_n_s8(cr_biased, 7);
2267          int16x8_t cbw = vshll_n_s8(cb_biased, 7);
2268 
2269          /* color transform */
2270          int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
2271          int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
2272          int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
2273          int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
2274          int16x8_t rws = vaddq_s16(yws, cr0);
2275          int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
2276          int16x8_t bws = vaddq_s16(yws, cb1);
2277 
2278          /* undo scaling, round, convert to byte */
2279          o.val[0] = vqrshrun_n_s16(rws, 4);
2280          o.val[1] = vqrshrun_n_s16(gws, 4);
2281          o.val[2] = vqrshrun_n_s16(bws, 4);
2282          o.val[3] = vdup_n_u8(255);
2283 
2284          /* store, interleaving r/g/b/a */
2285          vst4_u8(out, o);
2286          out += 8*4;
2287       }
2288    }
2289 #endif
2290 
2291    for (; i < count; ++i)
2292    {
2293       int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
2294       int cr      = pcr[i] - 128;
2295       int cb      = pcb[i] - 128;
2296       int r       = y_fixed + cr* float2fixed(1.40200f);
2297       int g       = y_fixed + cr*-float2fixed(0.71414f) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
2298       int b       = y_fixed                             +   cb* float2fixed(1.77200f);
2299       r >>= 20;
2300       g >>= 20;
2301       b >>= 20;
2302       if ((unsigned) r > 255)
2303          r = 255;
2304       if ((unsigned) g > 255)
2305          g = 255;
2306       if ((unsigned) b > 255)
2307          b = 255;
2308       out[0] = (uint8_t)r;
2309       out[1] = (uint8_t)g;
2310       out[2] = (uint8_t)b;
2311       out[3] = 255;
2312       out += step;
2313    }
2314 }
2315 #endif
2316 
2317 /* set up the kernels */
static void rjpeg__setup_jpeg(rjpeg__jpeg *j)
2319 {
2320    uint64_t mask = cpu_features_get();
2321 
2322    (void)mask;
2323 
2324    j->idct_block_kernel        = rjpeg__idct_block;
2325    j->YCbCr_to_RGB_kernel      = rjpeg__YCbCr_to_RGB_row;
2326    j->resample_row_hv_2_kernel = rjpeg__resample_row_hv_2;
2327 
2328 
2329 #if defined(__SSE2__)
2330    if (mask & RETRO_SIMD_SSE2)
2331    {
2332       j->idct_block_kernel        = rjpeg__idct_simd;
2333       j->YCbCr_to_RGB_kernel      = rjpeg__YCbCr_to_RGB_simd;
2334       j->resample_row_hv_2_kernel = rjpeg__resample_row_hv_2_simd;
2335    }
2336 #endif
2337 
2338 #ifdef RJPEG_NEON
2339    j->idct_block_kernel           = rjpeg__idct_simd;
2340    j->YCbCr_to_RGB_kernel         = rjpeg__YCbCr_to_RGB_simd;
2341    j->resample_row_hv_2_kernel    = rjpeg__resample_row_hv_2_simd;
2342 #endif
2343 }
2344 
2345 /* clean up the temporary component buffers */
static void rjpeg__cleanup_jpeg(rjpeg__jpeg *j)
2347 {
2348    int i;
2349    for (i=0; i < j->s->img_n; ++i)
2350    {
2351       if (j->img_comp[i].raw_data)
2352       {
2353          free(j->img_comp[i].raw_data);
2354          j->img_comp[i].raw_data = NULL;
2355          j->img_comp[i].data = NULL;
2356       }
2357 
2358       if (j->img_comp[i].raw_coeff)
2359       {
2360          free(j->img_comp[i].raw_coeff);
2361          j->img_comp[i].raw_coeff = 0;
2362          j->img_comp[i].coeff = 0;
2363       }
2364 
2365       if (j->img_comp[i].linebuf)
2366       {
2367          free(j->img_comp[i].linebuf);
2368          j->img_comp[i].linebuf = NULL;
2369       }
2370    }
2371 }
2372 
static uint8_t *rjpeg_load_jpeg_image(rjpeg__jpeg *z, unsigned *out_x, unsigned *out_y, int *comp, int req_comp)
2374 {
2375    int n, decode_n;
2376    int k;
2377    unsigned int i,j;
2378    rjpeg__resample res_comp[4];
2379    uint8_t *coutput[4] = {0};
2380    uint8_t *output     = NULL;
2381    z->s->img_n         = 0; /* make rjpeg__cleanup_jpeg safe */
2382 
2383    /* validate req_comp */
2384    if (req_comp < 0 || req_comp > 4)
2385       return rjpeg__errpuc("bad req_comp", "Internal error");
2386 
2387    /* load a jpeg image from whichever source, but leave in YCbCr format */
2388    if (!rjpeg__decode_jpeg_image(z))
2389       goto error;
2390 
2391    /* determine actual number of components to generate */
2392    n = req_comp ? req_comp : z->s->img_n;
2393 
2394    if (z->s->img_n == 3 && n < 3)
2395       decode_n = 1;
2396    else
2397       decode_n = z->s->img_n;
2398 
2399    /* resample and color-convert */
2400    for (k=0; k < decode_n; ++k)
2401    {
2402       rjpeg__resample *r = &res_comp[k];
2403 
2404       /* allocate line buffer big enough for upsampling off the edges
2405        * with upsample factor of 4 */
2406       z->img_comp[k].linebuf = (uint8_t *) malloc(z->s->img_x + 3);
2407       if (!z->img_comp[k].linebuf)
2408          goto error;
2409 
2410       r->hs       = z->img_h_max / z->img_comp[k].h;
2411       r->vs       = z->img_v_max / z->img_comp[k].v;
2412       r->ystep    = r->vs >> 1;
2413       r->w_lores  = (z->s->img_x + r->hs-1) / r->hs;
2414       r->ypos     = 0;
2415       r->line0    = r->line1 = z->img_comp[k].data;
2416       r->resample = rjpeg__resample_row_generic;
2417 
2418       if      (r->hs == 1 && r->vs == 1)
2419          r->resample = rjpeg_resample_row_1;
2420       else if (r->hs == 1 && r->vs == 2)
2421          r->resample = rjpeg__resample_row_v_2;
2422       else if (r->hs == 2 && r->vs == 1)
2423          r->resample = rjpeg__resample_row_h_2;
2424       else if (r->hs == 2 && r->vs == 2)
2425          r->resample = z->resample_row_hv_2_kernel;
2426    }
2427 
   /* can't error after this, so this is safe */
2429    output = (uint8_t *) malloc(n * z->s->img_x * z->s->img_y + 1);
2430 
2431    if (!output)
2432       goto error;
2433 
2434    /* now go ahead and resample */
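   /* for each output row, line0/line1 are the two source rows that straddle it:
    * ystep is the phase within the vs-times vertical expansion, y_bot picks
    * which of the two is the "near" row, and once a full expansion step has
    * been emitted the pair slides down one source row */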
2435    for (j=0; j < z->s->img_y; ++j)
2436    {
2437       uint8_t *out = output + n * z->s->img_x * j;
2438       for (k=0; k < decode_n; ++k)
2439       {
2440          rjpeg__resample *r = &res_comp[k];
2441          int         y_bot  = r->ystep >= (r->vs >> 1);
2442 
2443          coutput[k]         = r->resample(z->img_comp[k].linebuf,
2444                y_bot ? r->line1 : r->line0,
2445                y_bot ? r->line0 : r->line1,
2446                r->w_lores, r->hs);
2447 
2448          if (++r->ystep >= r->vs)
2449          {
2450             r->ystep = 0;
2451             r->line0 = r->line1;
2452             if (++r->ypos < z->img_comp[k].y)
2453                r->line1 += z->img_comp[k].w2;
2454          }
2455       }
2456 
2457       if (n >= 3)
2458       {
2459          uint8_t *y = coutput[0];
2460          if (z->s->img_n == 3)
2461             z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
2462          else
2463             for (i=0; i < z->s->img_x; ++i)
2464             {
2465                out[0] = out[1] = out[2] = y[i];
2466                out[3] = 255; /* not used if n==3 */
2467                out += n;
2468             }
2469       }
2470       else
2471       {
2472          uint8_t *y = coutput[0];
2473          if (n == 1)
2474             for (i=0; i < z->s->img_x; ++i)
2475                out[i] = y[i];
2476          else
2477             for (i=0; i < z->s->img_x; ++i)
2478                *out++ = y[i], *out++ = 255;
2479       }
2480    }
2481 
2482    rjpeg__cleanup_jpeg(z);
2483    *out_x = z->s->img_x;
2484    *out_y = z->s->img_y;
2485 
2486    if (comp)
2487       *comp  = z->s->img_n; /* report original components, not output */
2488    return output;
2489 
2490 error:
2491    rjpeg__cleanup_jpeg(z);
2492    return NULL;
2493 }
2494 
static unsigned char *rjpeg__jpeg_load(rjpeg__context *s, unsigned *x, unsigned *y, int *comp, int req_comp)
2496 {
2497    rjpeg__jpeg j;
2498    j.s = s;
2499    rjpeg__setup_jpeg(&j);
2500    return rjpeg_load_jpeg_image(&j, x,y,comp,req_comp);
2501 }
2502 
int rjpeg_process_image(rjpeg_t *rjpeg, void **buf_data,
2504       size_t size, unsigned *width, unsigned *height)
2505 {
2506    int comp;
2507    uint32_t *img         = NULL;
2508    uint32_t *pixels      = NULL;
2509    unsigned size_tex     = 0;
2510 
2511    if (!rjpeg)
2512       return IMAGE_PROCESS_ERROR;
2513 
2514    img   = (uint32_t*)rjpeg_load_from_memory(rjpeg->buff_data, size, width, height, &comp, 4);
2515 
2516    if (!img)
2517       return IMAGE_PROCESS_ERROR;
2518 
2519    size_tex = (*width) * (*height);
2520    pixels   = (uint32_t*)malloc(size_tex * sizeof(uint32_t));
2521 
2522    if (!pixels)
2523    {
2524       free(img);
2525       return IMAGE_PROCESS_ERROR;
2526    }
2527 
2528    *buf_data = pixels;
2529 
2530    /* Convert RGBA to ARGB */
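   /* on a little-endian target the decoder's R,G,B,A byte order reads back as
    * 0xAABBGGRR; the shuffle below writes each texel out as 0xAARRGGBB */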
2531    while (size_tex--)
2532    {
2533       unsigned int texel = img[size_tex];
2534       unsigned int A     = texel & 0xFF000000;
2535       unsigned int B     = texel & 0x00FF0000;
2536       unsigned int G     = texel & 0x0000FF00;
2537       unsigned int R     = texel & 0x000000FF;
2538       ((unsigned int*)pixels)[size_tex] = A | (R << 16) | G | (B >> 16);
   }
2540 
2541    free(img);
2542 
2543    return IMAGE_PROCESS_END;
2544 }
2545 
bool rjpeg_set_buf_ptr(rjpeg_t *rjpeg, void *data)
2547 {
2548    if (!rjpeg)
2549       return false;
2550 
2551    rjpeg->buff_data = (uint8_t*)data;
2552 
2553    return true;
2554 }
2555 
void rjpeg_free(rjpeg_t *rjpeg)
2557 {
2558    if (!rjpeg)
2559       return;
2560 
2561    free(rjpeg);
2562 }
2563 
rjpeg_t *rjpeg_alloc(void)
2565 {
2566    rjpeg_t *rjpeg = (rjpeg_t*)calloc(1, sizeof(*rjpeg));
2567    if (!rjpeg)
2568       return NULL;
2569    return rjpeg;
2570 }
2571
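
/* Usage sketch for the public entry points above (jpeg_file_bytes and
 * jpeg_file_size stand in for the caller's own buffer):
 *
 *    rjpeg_t *j   = rjpeg_alloc();
 *    void *argb   = NULL;
 *    unsigned w   = 0, h = 0;
 *
 *    rjpeg_set_buf_ptr(j, jpeg_file_bytes);
 *    if (rjpeg_process_image(j, &argb, jpeg_file_size, &w, &h) == IMAGE_PROCESS_END)
 *    {
 *       use the w x h 0xAARRGGBB pixels in argb, then free(argb)
 *    }
 *    rjpeg_free(j);
 */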