1 /* Copyright (C) 2010-2016 The RetroArch team
2 *
3 * ---------------------------------------------------------------------------------------
4 * The following license statement only applies to this file (rjpeg.c).
5 * ---------------------------------------------------------------------------------------
6 *
7 * Permission is hereby granted, free of charge,
8 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation the rights to
10 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23 /* Modified version of stb_image's JPEG sources. */
24
25 #include <stdint.h>
26 #include <stdarg.h>
27 #include <stddef.h> /* ptrdiff_t on osx */
28 #include <stdlib.h>
29 #include <string.h>
30
31 #include <retro_assert.h>
32 #include <retro_inline.h>
33 #include <boolean.h>
34 #include <formats/image.h>
35 #include <formats/rjpeg.h>
36 #include <features/features_cpu.h>
37
/* Component-count codes: requested output format (req_comp) and decode results. */
enum
{
   RJPEG_DEFAULT = 0, /* only used for req_comp: keep the image's own component count */
   RJPEG_GREY,
   RJPEG_GREY_ALPHA,
   RJPEG_RGB,
   RJPEG_RGB_ALPHA
};
46
/* User-supplied stream callbacks for callback-driven input (vs. plain memory). */
typedef struct
{
   int  (*read)(void *user, char *data, int size); /* fill 'data' with 'size' bytes. return number of bytes actually read */
   void (*skip)(void *user, int n);                /* skip the next 'n' bytes, or 'unget' the last -n bytes if negative */
   int  (*eof)(void *user);                        /* returns nonzero if we are at end of file/data */
} rjpeg_io_callbacks;
53
/* Row up-sampling kernel: writes one expanded row to 'out' from input rows
 * 'in0'/'in1' (presumably the near/far rows of rjpeg__resample — confirm with
 * the resamplers later in the file); 'w' is the pre-expansion width and
 * 'hs' the horizontal expansion factor. */
typedef uint8_t *(*rjpeg_resample_row_func)(uint8_t *out, uint8_t *in0, uint8_t *in1,
      int w, int hs);
56
/* Per-component state for up-sampling subsampled chroma back to full resolution. */
typedef struct
{
   rjpeg_resample_row_func resample; /* kernel used to expand one row */
   uint8_t *line0,*line1;            /* the two source rows fed to 'resample' */
   int hs,vs;     /* expansion factor in each axis */
   int w_lores;   /* horizontal pixels pre-expansion */
   int ystep;     /* how far through vertical expansion we are */
   int ypos;      /* which pre-expansion row we're on */
} rjpeg__resample;
66
/* Public decoder handle (declared opaque in formats/rjpeg.h). */
struct rjpeg
{
   uint8_t *buff_data; /* raw JPEG data buffer (set outside this chunk; ownership not shown here) */
   void *empty;        /* placeholder member; not used in this chunk */
};
72
73 #ifdef _MSC_VER
74 #define RJPEG_HAS_LROTL
75 #endif
76
77 #ifdef RJPEG_HAS_LROTL
78 #define rjpeg_lrot(x,y) _lrotl(x,y)
79 #else
80 #define rjpeg_lrot(x,y) (((x) << (y)) | ((x) >> (32 - (y))))
81 #endif
82
83 /* x86/x64 detection */
84 #if defined(__x86_64__) || defined(_M_X64)
85 #define RJPEG__X64_TARGET
86 #elif defined(__i386) || defined(_M_IX86)
87 #define RJPEG__X86_TARGET
88 #endif
89
90 #if defined(__GNUC__) && (defined(RJPEG__X86_TARGET) || defined(RJPEG__X64_TARGET)) && !defined(__SSE2__) && !defined(RJPEG_NO_SIMD)
91 /* NOTE: not clear do we actually need this for the 64-bit path?
92 * gcc doesn't support sse2 intrinsics unless you compile with -msse2,
93 * (but compiling with -msse2 allows the compiler to use SSE2 everywhere;
94 * this is just broken and gcc are jerks for not fixing it properly
95 * http://www.virtualdub.org/blog/pivot/entry.php?id=363 )
96 */
97 #define RJPEG_NO_SIMD
98 #endif
99
100 #if defined(__MINGW32__) && defined(RJPEG__X86_TARGET) && !defined(RJPEG_MINGW_ENABLE_SSE2) && !defined(RJPEG_NO_SIMD)
101 /* Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid RJPEG__X64_TARGET
102 *
103 * 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
104 * Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
105 * As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
106 * simultaneously enabling "-mstackrealign".
107 *
108 * See https://github.com/nothings/stb/issues/81 for more information.
109 *
110 * So default to no SSE2 on 32-bit MinGW. If you've read this far and added
111 * -mstackrealign to your build settings, feel free to #define RJPEG_MINGW_ENABLE_SSE2.
112 */
113 #define RJPEG_NO_SIMD
114 #endif
115
116 #if defined(__SSE2__)
117 #include <emmintrin.h>
118
119 #ifdef _MSC_VER
120 #define RJPEG_SIMD_ALIGN(type, name) __declspec(align(16)) type name
121 #else
122 #define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
123 #endif
124
125 #endif
126
127 /* ARM NEON */
128 #if defined(RJPEG_NO_SIMD) && defined(RJPEG_NEON)
129 #undef RJPEG_NEON
130 #endif
131
132 #ifdef RJPEG_NEON
133 #include <arm_neon.h>
134 /* assume GCC or Clang on ARM targets */
135 #define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
136 #endif
137
138 #ifndef RJPEG_SIMD_ALIGN
139 #define RJPEG_SIMD_ALIGN(type, name) type name
140 #endif
141
/* Input abstraction: reads either from a caller-provided memory range or,
 * when io.read is set, from callbacks that refill 'buffer_start'. */
typedef struct
{
   uint32_t img_x, img_y;  /* image dimensions in pixels */
   int img_n, img_out_n;   /* components present in file / components produced for output */

   rjpeg_io_callbacks io;  /* io.read == NULL means plain memory source */
   void *io_user_data;     /* opaque pointer handed back to the callbacks */

   int read_from_callbacks; /* nonzero while io.read may still yield data */
   int buflen;              /* usable size of buffer_start for refills */
   uint8_t buffer_start[128];

   uint8_t *img_buffer, *img_buffer_end; /* current read cursor and one-past-end */
   uint8_t *img_buffer_original;         /* start of the data (presumably for rewind — not used in this chunk) */
} rjpeg__context;
157
158 static uint8_t *rjpeg__jpeg_load(rjpeg__context *s, unsigned *x, unsigned *y, int *comp, int req_comp);
159
/* Error reporting is compiled out: rjpeg__err always evaluates to 0 (failure),
 * and the typed variants below always yield a NULL pointer of the right type. */
#define rjpeg__err(x,y) 0

#define rjpeg__errpf(x,y)  ((float *) (rjpeg__err(x,y)?NULL:NULL))
#define rjpeg__errpuc(x,y) ((unsigned char *) (rjpeg__err(x,y)?NULL:NULL))

/* When nonzero, decoded images are flipped top-to-bottom on load. */
static int rjpeg__vertically_flip_on_load = 0;
166
/* Decode a JPEG via rjpeg__jpeg_load(), then mirror the rows top-to-bottom
 * when rjpeg__vertically_flip_on_load is set. Returns the decoded pixel
 * buffer, or NULL on failure. */
static unsigned char *rjpeg__load_flip(rjpeg__context *s, unsigned *x, unsigned *y, int *comp, int req_comp)
{
   unsigned char *img = rjpeg__jpeg_load(s, x, y, comp, req_comp);

   if (img && rjpeg__vertically_flip_on_load)
   {
      int top, i;
      int width    = (int)*x;
      int height   = (int)*y;
      int channels = req_comp ? req_comp : *comp;
      int stride   = width * channels;

      /* swap each row with its vertical mirror, byte by byte */
      for (top = 0; top < height / 2; top++)
      {
         unsigned char *a = img + (size_t)top * stride;
         unsigned char *b = img + (size_t)(height - 1 - top) * stride;

         for (i = 0; i < stride; i++)
         {
            unsigned char tmp = a[i];
            a[i] = b[i];
            b[i] = tmp;
         }
      }
   }

   return img;
}
193
rjpeg_load_from_memory(const uint8_t * buffer,int len,unsigned * x,unsigned * y,int * comp,int req_comp)194 static uint8_t *rjpeg_load_from_memory(const uint8_t *buffer, int len, unsigned *x, unsigned *y, int *comp, int req_comp)
195 {
196 rjpeg__context s;
197 s.io.read = NULL;
198 s.read_from_callbacks = 0;
199 s.img_buffer = s.img_buffer_original = (uint8_t *) buffer;
200 s.img_buffer_end = (uint8_t *) buffer+len;
201 return rjpeg__load_flip(&s,x,y,comp,req_comp);
202 }
203
/* How much of the stream a scan pass should process (consumers of these
 * values appear later in the file). */
enum
{
   RJPEG_SCAN_LOAD = 0,
   RJPEG_SCAN_TYPE,
   RJPEG_SCAN_HEADER
};
210
/* Pull the next chunk of data from the io.read callback into buffer_start
 * and point the cursor at it. On end-of-file, switch to memory mode over a
 * single zero byte so subsequent reads stay safe. */
static void rjpeg__refill_buffer(rjpeg__context *s)
{
   int got = (s->io.read)(s->io_user_data, (char*)s->buffer_start, s->buflen);

   if (got != 0)
   {
      s->img_buffer     = s->buffer_start;
      s->img_buffer_end = s->buffer_start + got;
      return;
   }

   /* at end of file, treat same as if from memory, but need to handle case
    * where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file */
   s->read_from_callbacks = 0;
   s->img_buffer          = s->buffer_start;
   s->img_buffer_end      = s->buffer_start + 1;
   s->buffer_start[0]     = 0;
}
230
rjpeg__get8(rjpeg__context * s)231 static INLINE uint8_t rjpeg__get8(rjpeg__context *s)
232 {
233 if (s->img_buffer < s->img_buffer_end)
234 return *s->img_buffer++;
235
236 if (s->read_from_callbacks)
237 {
238 rjpeg__refill_buffer(s);
239 return *s->img_buffer++;
240 }
241
242 return 0;
243 }
244
rjpeg__at_eof(rjpeg__context * s)245 static INLINE int rjpeg__at_eof(rjpeg__context *s)
246 {
247 if (s->io.read)
248 {
249 if (!(s->io.eof)(s->io_user_data))
250 return 0;
251
252 /* if feof() is true, check if buffer = end
253 * special case: we've only got the special
254 * 0 character at the end */
255
256 if (s->read_from_callbacks == 0)
257 return 1;
258 }
259
260 return s->img_buffer >= s->img_buffer_end;
261 }
262
/* Advance the stream by n bytes. A negative n is treated as "jump to the
 * end of the buffered data". For callback sources, whatever isn't buffered
 * is delegated to io.skip. */
static void rjpeg__skip(rjpeg__context *s, int n)
{
   int buffered;

   if (n < 0)
   {
      s->img_buffer = s->img_buffer_end;
      return;
   }

   buffered = (int)(s->img_buffer_end - s->img_buffer);

   if (s->io.read && buffered < n)
   {
      /* drain the buffer, then let the callback skip the remainder */
      s->img_buffer = s->img_buffer_end;
      (s->io.skip)(s->io_user_data, n - buffered);
      return;
   }

   s->img_buffer += n;
}
284
/* Read a big-endian 16-bit value (high byte first). */
static int rjpeg__get16be(rjpeg__context *s)
{
   int hi = rjpeg__get8(s);
   int lo = rjpeg__get8(s);
   return (hi << 8) | lo;
}
290
291 #define RJPEG__BYTECAST(x) ((uint8_t) ((x) & 255)) /* truncate int to byte without warnings */
292
293 /* huffman decoding acceleration */
294 #define FAST_BITS 9 /* larger handles more cases; smaller stomps less cache */
295
/* Canonical Huffman decode table with a FAST_BITS-wide acceleration lookup. */
typedef struct
{
   /* fast[bit-pattern] = symbol index for codes of length <= FAST_BITS; 255 = not accelerated */
   uint8_t fast[1 << FAST_BITS];
   /* weirdly, repacking this into AoS is a 10% speed loss, instead of a win */
   uint16_t code[256];       /* canonical code per symbol */
   uint8_t values[256];      /* symbol values in code order */
   uint8_t size[257];        /* code length (bits) per symbol; 0-terminated list */
   unsigned int maxcode[18]; /* (largest code + 1) per length, preshifted to a 16-bit frame */
   int delta[17];            /* old 'firstsymbol' - old 'firstcode' */
} rjpeg__huffman;
306
/* Whole-image decoder state: entropy tables, per-component geometry and
 * buffers, the entropy bit-buffer, and pluggable compute kernels. */
typedef struct
{
   rjpeg__context *s;         /* input stream */
   rjpeg__huffman huff_dc[4]; /* DC Huffman tables, by table id */
   rjpeg__huffman huff_ac[4]; /* AC Huffman tables, by table id */
   uint8_t dequant[4][64];    /* dequantization tables */
   int16_t fast_ac[4][1 << FAST_BITS]; /* combined run/value fast-AC tables (see rjpeg__build_fast_ac) */

   /* sizes for components, interleaved MCUs */
   int img_h_max, img_v_max;
   int img_mcu_x, img_mcu_y;
   int img_mcu_w, img_mcu_h;

   /* definition of jpeg image component */
   struct
   {
      /* header-derived fields (id, sampling h/v, table selectors tq/hd/ha)
       * — parsed outside this chunk */
      int id;
      int h,v;
      int tq;
      int hd,ha;
      int dc_pred; /* DC predictor for differential DC coding (see decode_block) */

      int x,y,w2,h2;  /* logical size and padded/stride size */
      uint8_t *data;  /* decoded 8-bit samples */
      void *raw_data, *raw_coeff; /* raw allocations backing data/coeff */
      uint8_t *linebuf;
      short *coeff;   /* progressive only */
      int coeff_w, coeff_h; /* number of 8x8 coefficient blocks */
   } img_comp[4];

   uint32_t code_buffer; /* jpeg entropy-coded buffer, MSB-aligned */
   int code_bits;        /* number of valid bits */
   unsigned char marker; /* marker seen while filling entropy buffer */
   int nomore;           /* flag if we saw a marker so must stop */

   int progressive;      /* nonzero for progressive (SOF2) streams */
   int spec_start;       /* spectral selection range of the current scan... */
   int spec_end;         /* ...(progressive only) */
   int succ_high;        /* successive-approximation bit positions */
   int succ_low;
   int eob_run;          /* pending end-of-band run (progressive AC) */

   int scan_n, order[4];       /* component count in the scan, and their order */
   int restart_interval, todo; /* MCUs between restarts / remaining until next */

   /* kernels */
   void (*idct_block_kernel)(uint8_t *out, int out_stride, short data[64]);
   void (*YCbCr_to_RGB_kernel)(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step);
   uint8_t *(*resample_row_hv_2_kernel)(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs);
} rjpeg__jpeg;
357
#define rjpeg__f2f(x) ((int) (((x) * 4096 + 0.5))) /* float constant -> 12-bit fixed point */
#define rjpeg__fsh(x) ((x) << 12)                  /* integer -> the same 12-bit fixed point */

#define RJPEG__MARKER_none 0xff
/* if there's a pending marker from the entropy stream, return that
 * otherwise, fetch from the stream and get a marker. if there's no
 * marker, return 0xff, which is never a valid marker value
 */

/* in each scan, we'll have scan_n components, and the order
 * of the components is specified by order[]
 */
#define RJPEG__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7)

/* use comparisons since in some cases we handle more than one case (e.g. SOF) */
#define rjpeg__SOI(x) ((x) == 0xd8)
#define rjpeg__EOI(x) ((x) == 0xd9)
#define rjpeg__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
#define rjpeg__SOS(x) ((x) == 0xda)

#define rjpeg__SOF_progressive(x) ((x) == 0xc2)
#define rjpeg__div4(x) ((uint8_t) ((x) >> 2))
#define rjpeg__div16(x) ((uint8_t) ((x) >> 4))
381
/* Build canonical Huffman decode tables from the 16 per-length symbol
 * counts of a DHT segment ('count'). Returns 1 on success, 0 via
 * rjpeg__err when the code set is over-full for some length. */
static int rjpeg__build_huffman(rjpeg__huffman *h, int *count)
{
   int i,j,k=0,code;

   /* build size list for each symbol (from JPEG spec) */
   for (i=0; i < 16; ++i)
      for (j=0; j < count[i]; ++j)
         h->size[k++] = (uint8_t) (i+1);

   h->size[k] = 0; /* sentinel terminating the size list */

   /* compute actual symbols (from jpeg spec) */
   code = 0;
   k = 0;

   for(j=1; j <= 16; ++j)
   {
      /* compute delta to add to code to compute symbol id */
      h->delta[j] = k - code;
      if (h->size[k] == j)
      {
         while (h->size[k] == j)
            h->code[k++] = (uint16_t) (code++);
         if (code-1 >= (1 << j))
            return rjpeg__err("bad code lengths","Corrupt JPEG");
      }
      /* compute largest code + 1 for this size, preshifted as needed later */
      h->maxcode[j] = code << (16-j);
      code <<= 1;
   }
   h->maxcode[j] = 0xffffffff; /* j == 17 here: sentinel so decode loops always terminate */

   /* build non-spec acceleration table; 255 is flag for not-accelerated */
   memset(h->fast, 255, 1 << FAST_BITS);
   for (i=0; i < k; ++i)
   {
      int s = h->size[i];
      if (s <= FAST_BITS)
      {
         /* every FAST_BITS-wide bit pattern that starts with this code maps to symbol i */
         int c = h->code[i] << (FAST_BITS-s);
         int m = 1 << (FAST_BITS-s);
         for (j=0; j < m; ++j)
            h->fast[c+j] = (uint8_t) i;
      }
   }
   return 1;
}
428
429 /* build a table that decodes both magnitude and value of small ACs in
430 * one go. */
rjpeg__build_fast_ac(int16_t * fast_ac,rjpeg__huffman * h)431 static void rjpeg__build_fast_ac(int16_t *fast_ac, rjpeg__huffman *h)
432 {
433 int i;
434
435 for (i=0; i < (1 << FAST_BITS); ++i)
436 {
437 uint8_t fast = h->fast[i];
438
439 fast_ac[i] = 0;
440
441 if (fast < 255)
442 {
443 int rs = h->values[fast];
444 int run = (rs >> 4) & 15;
445 int magbits = rs & 15;
446 int len = h->size[fast];
447
448 if (magbits && len + magbits <= FAST_BITS)
449 {
450 /* magnitude code followed by receive_extend code */
451 int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
452 int m = 1 << (magbits - 1);
453 if (k < m)
454 k += (-1 << magbits) + 1;
455
456 /* if the result is small enough, we can fit it in fast_ac table */
457 if (k >= -128 && k <= 127)
458 fast_ac[i] = (int16_t) ((k << 8) + (run << 4) + (len + magbits));
459 }
460 }
461 }
462 }
463
/* Top up the 32-bit MSB-aligned entropy bit-buffer to at least 25 valid
 * bits. Stops (and records the marker) when a real marker byte appears;
 * after that, zero bits are fed so callers can keep draining safely. */
static void rjpeg__grow_buffer_unsafe(rjpeg__jpeg *j)
{
   do
   {
      /* once a marker has been seen, append zero bits instead of reading on */
      int b = j->nomore ? 0 : rjpeg__get8(j->s);
      if (b == 0xff)
      {
         int c = rjpeg__get8(j->s);

         /* 0xff 0x00 is a byte-stuffed literal 0xff; anything else is a marker */
         if (c != 0)
         {
            j->marker = (unsigned char) c;
            j->nomore = 1;
            return;
         }
      }
      /* place the new byte just below the currently valid bits */
      j->code_buffer |= b << (24 - j->code_bits);
      j->code_bits += 8;
   } while (j->code_bits <= 24);
}
484
/* rjpeg__bmask[n] == (1 << n) - 1: mask selecting the low n bits, n in 0..16 */
static uint32_t rjpeg__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
487
/* decode a JPEG huffman value from the bitstream */
/* Returns the decoded symbol value, or -1 when the code is invalid or the
 * buffer underflows (corrupt stream). */
static INLINE int rjpeg__jpeg_huff_decode(rjpeg__jpeg *j, rjpeg__huffman *h)
{
   unsigned int temp;
   int c,k;

   if (j->code_bits < 16)
      rjpeg__grow_buffer_unsafe(j);

   /* look at the top FAST_BITS and determine what symbol ID it is,
    * if the code is <= FAST_BITS */
   c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
   k = h->fast[c];

   if (k < 255)
   {
      /* accelerated path: table gave us the symbol index directly */
      int s = h->size[k];
      if (s > j->code_bits)
         return -1;
      j->code_buffer <<= s;
      j->code_bits -= s;
      return h->values[k];
   }

   /* naive test is to shift the code_buffer down so k bits are
    * valid, then test against maxcode. To speed this up, we've
    * preshifted maxcode left so that it has (16-k) 0s at the
    * end; in other words, regardless of the number of bits, it
    * wants to be compared against something shifted to have 16;
    * that way we don't need to shift inside the loop. */
   temp = j->code_buffer >> 16;
   for (k=FAST_BITS+1 ; ; ++k)
      if (temp < h->maxcode[k])
         break;

   if (k == 17)
   {
      /* error! code not found */
      j->code_bits -= 16;
      return -1;
   }

   if (k > j->code_bits)
      return -1;

   /* convert the huffman code to the symbol id */
   c = ((j->code_buffer >> (32 - k)) & rjpeg__bmask[k]) + h->delta[k];
   assert((((j->code_buffer) >> (32 - h->size[c])) & rjpeg__bmask[h->size[c]]) == h->code[c]);

   /* convert the id to a symbol */
   j->code_bits -= k;
   j->code_buffer <<= k;
   return h->values[c];
}
542
/* bias[n] = (-1<<n) + 1: the JPEG 'extend' bias applied to negative values of n magnitude bits */
static int const rjpeg__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
545
/* combined JPEG 'receive' and JPEG 'extend', since baseline
 * always extends everything it receives. */
/* Consumes n bits and returns the signed coefficient they encode. */
static INLINE int rjpeg__extend_receive(rjpeg__jpeg *j, int n)
{
   unsigned int k;
   int sgn;
   if (j->code_bits < n)
      rjpeg__grow_buffer_unsafe(j);

   sgn = (int32_t)j->code_buffer >> 31; /* sign bit is always in MSB; sgn is 0 (leading 1) or -1 (leading 0) */
   k = rjpeg_lrot(j->code_buffer, n);   /* rotate the n received bits down to the bottom */
   assert(n >= 0 && n < (int) (sizeof(rjpeg__bmask)/sizeof(*rjpeg__bmask)));
   j->code_buffer = k & ~rjpeg__bmask[n]; /* everything except those n bits stays buffered */
   k &= rjpeg__bmask[n];
   j->code_bits -= n;
   /* 'extend': a leading 0 bit means the value is negative, so apply the bias */
   return k + (rjpeg__jbias[n] & ~sgn);
}
563
/* get some unsigned bits */
/* Consumes n bits from the entropy buffer and returns them as an unsigned value. */
static INLINE int rjpeg__jpeg_get_bits(rjpeg__jpeg *j, int n)
{
   unsigned int k;
   if (j->code_bits < n) rjpeg__grow_buffer_unsafe(j);
   k = rjpeg_lrot(j->code_buffer, n); /* rotate the top n bits down to the bottom */
   j->code_buffer = k & ~rjpeg__bmask[n];
   k &= rjpeg__bmask[n];
   j->code_bits -= n;
   return k;
}
575
rjpeg__jpeg_get_bit(rjpeg__jpeg * j)576 static INLINE int rjpeg__jpeg_get_bit(rjpeg__jpeg *j)
577 {
578 unsigned int k;
579 if (j->code_bits < 1) rjpeg__grow_buffer_unsafe(j);
580 k = j->code_buffer;
581 j->code_buffer <<= 1;
582 --j->code_bits;
583 return k & 0x80000000;
584 }
585
/* given a value that's at position X in the zigzag stream,
 * where does it appear in the 8x8 matrix coded as row-major? */
static uint8_t rjpeg__jpeg_dezigzag[64+15] =
{
    0,  1,  8, 16,  9,  2,  3, 10,
   17, 24, 32, 25, 18, 11,  4,  5,
   12, 19, 26, 33, 40, 48, 41, 34,
   27, 20, 13,  6,  7, 14, 21, 28,
   35, 42, 49, 56, 57, 50, 43, 36,
   29, 22, 15, 23, 30, 37, 44, 51,
   58, 59, 52, 45, 38, 31, 39, 46,
   53, 60, 61, 54, 47, 55, 62, 63,
   /* let corrupt input sample past end: extra entries all point at the
    * last coefficient so an over-long run can't index out of bounds */
   63, 63, 63, 63, 63, 63, 63, 63,
   63, 63, 63, 63, 63, 63, 63
};
602
/* decode one 64-entry block-- */
/* Baseline path: decodes the DC coefficient through 'hdc' (with the
 * component-b differential predictor), then the AC run/length pairs
 * through 'hac' or the fast-AC table 'fac', dequantizing and writing each
 * value at its de-zigzagged position. Returns 1 on success, 0 on corrupt
 * data (rjpeg__err). */
static int rjpeg__jpeg_decode_block(
      rjpeg__jpeg *j, short data[64],
      rjpeg__huffman *hdc,
      rjpeg__huffman *hac,
      int16_t *fac,
      int b,
      uint8_t *dequant)
{
   int diff,dc,k;
   int t;

   if (j->code_bits < 16)
      rjpeg__grow_buffer_unsafe(j);
   t = rjpeg__jpeg_huff_decode(j, hdc);
   if (t < 0)
      return rjpeg__err("bad huffman code","Corrupt JPEG");

   /* 0 all the ac values now so we can do it 32-bits at a time */
   memset(data,0,64*sizeof(data[0]));

   /* DC is coded as a difference from the previous block of this component */
   diff = t ? rjpeg__extend_receive(j, t) : 0;
   dc = j->img_comp[b].dc_pred + diff;
   j->img_comp[b].dc_pred = dc;
   data[0] = (short) (dc * dequant[0]);

   /* decode AC components, see JPEG spec */
   k = 1;
   do
   {
      unsigned int zig;
      int c,r,s;
      if (j->code_bits < 16)
         rjpeg__grow_buffer_unsafe(j);
      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
      r = fac[c];
      if (r)
      {
         /* fast-AC path: run, value and bit length all packed in one entry */
         k += (r >> 4) & 15; /* run */
         s = r & 15; /* combined length */
         j->code_buffer <<= s;
         j->code_bits -= s;
         /* decode into unzigzag'd location */
         zig = rjpeg__jpeg_dezigzag[k++];
         data[zig] = (short) ((r >> 8) * dequant[zig]);
      }
      else
      {
         int rs = rjpeg__jpeg_huff_decode(j, hac);
         if (rs < 0)
            return rjpeg__err("bad huffman code","Corrupt JPEG");
         s = rs & 15;
         r = rs >> 4;
         if (s == 0)
         {
            if (rs != 0xf0)
               break; /* end block */
            k += 16; /* ZRL: sixteen zero coefficients */
         }
         else
         {
            k += r; /* skip r zero coefficients */
            /* decode into unzigzag'd location */
            zig = rjpeg__jpeg_dezigzag[k++];
            data[zig] = (short) (rjpeg__extend_receive(j,s) * dequant[zig]);
         }
      }
   } while (k < 64);
   return 1;
}
674
/* Progressive path, DC coefficient of component b: first scan decodes the
 * (shifted) DC value; refinement scans add one successive-approximation
 * bit. Returns 1 on success, 0 on corrupt data. */
static int rjpeg__jpeg_decode_block_prog_dc(
      rjpeg__jpeg *j,
      short data[64],
      rjpeg__huffman *hdc,
      int b)
{
   /* a DC scan must have spectral selection 0..0 */
   if (j->spec_end != 0)
      return rjpeg__err("can't merge dc and ac", "Corrupt JPEG");

   if (j->code_bits < 16)
      rjpeg__grow_buffer_unsafe(j);

   if (j->succ_high == 0)
   {
      int t;
      int diff,dc;

      /* first scan for DC coefficient, must be first */
      memset(data,0,64*sizeof(data[0])); /* 0 all the ac values now */
      t = rjpeg__jpeg_huff_decode(j, hdc);
      diff = t ? rjpeg__extend_receive(j, t) : 0;

      dc = j->img_comp[b].dc_pred + diff;
      j->img_comp[b].dc_pred = dc;
      data[0] = (short) (dc << j->succ_low); /* store at the scan's bit position */
   }
   else
   {
      /* refinement scan for DC coefficient: one more precision bit */
      if (rjpeg__jpeg_get_bit(j))
         data[0] += (short) (1 << j->succ_low);
   }
   return 1;
}
709
/* Progressive path, AC coefficients in the scan's spectral band
 * [spec_start, spec_end]: the first scan (succ_high == 0) decodes new
 * coefficients at bit position succ_low, honoring end-of-band (EOB) runs;
 * refinement scans add one bit of precision to already-nonzero
 * coefficients and place newly appearing ones. Returns 1 on success,
 * 0 on corrupt data. */
static int rjpeg__jpeg_decode_block_prog_ac(
      rjpeg__jpeg *j,
      short data[64],
      rjpeg__huffman *hac,
      int16_t *fac)
{
   int k;
   /* an AC scan must not include the DC coefficient */
   if (j->spec_start == 0)
      return rjpeg__err("can't merge dc and ac", "Corrupt JPEG");

   if (j->succ_high == 0)
   {
      int shift = j->succ_low;

      /* inside an EOB run: this whole block stays zero */
      if (j->eob_run)
      {
         --j->eob_run;
         return 1;
      }

      k = j->spec_start;
      do {
         unsigned int zig;
         int c,r,s;
         if (j->code_bits < 16) rjpeg__grow_buffer_unsafe(j);
         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
         r = fac[c];
         if (r)
         { /* fast-AC path */
            k += (r >> 4) & 15; /* run */
            s = r & 15; /* combined length */
            j->code_buffer <<= s;
            j->code_bits -= s;
            zig = rjpeg__jpeg_dezigzag[k++];
            data[zig] = (short) ((r >> 8) << shift);
         }
         else
         {
            int rs = rjpeg__jpeg_huff_decode(j, hac);
            if (rs < 0) return rjpeg__err("bad huffman code","Corrupt JPEG");
            s = rs & 15;
            r = rs >> 4;
            if (s == 0)
            {
               if (r < 15)
               {
                  /* EOB run of 2^r (+ extra bits) blocks, this one included */
                  j->eob_run = (1 << r);
                  if (r)
                     j->eob_run += rjpeg__jpeg_get_bits(j, r);
                  --j->eob_run;
                  break;
               }
               k += 16; /* ZRL: sixteen zeros */
            } else {
               k += r;
               zig = rjpeg__jpeg_dezigzag[k++];
               data[zig] = (short) (rjpeg__extend_receive(j,s) << shift);
            }
         }
      } while (k <= j->spec_end);
   } else {
      /* refinement scan for these AC coefficients */

      short bit = (short) (1 << j->succ_low);

      if (j->eob_run)
      {
         /* EOB run: only refine coefficients that are already nonzero */
         --j->eob_run;
         for (k = j->spec_start; k <= j->spec_end; ++k)
         {
            short *p = &data[rjpeg__jpeg_dezigzag[k]];
            if (*p != 0)
               if (rjpeg__jpeg_get_bit(j))
                  if ((*p & bit)==0)
                  {
                     /* move the magnitude away from zero by one bit */
                     if (*p > 0)
                        *p += bit;
                     else
                        *p -= bit;
                  }
         }
      } else {
         k = j->spec_start;
         do {
            int r,s;
            int rs = rjpeg__jpeg_huff_decode(j, hac);
            if (rs < 0) return rjpeg__err("bad huffman code","Corrupt JPEG");
            s = rs & 15;
            r = rs >> 4;
            if (s == 0)
            {
               if (r < 15)
               {
                  /* start an EOB run (this block still gets its refinement pass below) */
                  j->eob_run = (1 << r) - 1;
                  if (r)
                     j->eob_run += rjpeg__jpeg_get_bits(j, r);
                  r = 64; /* force end of block */
               } else {
                  /* r=15 s=0 should write 16 0s, so we just do
                   * a run of 15 0s and then write s (which is 0),
                   * so we don't have to do anything special here */
               }
            } else {
               if (s != 1) return rjpeg__err("bad huffman code", "Corrupt JPEG");
               /* sign bit */
               if (rjpeg__jpeg_get_bit(j))
                  s = bit;
               else
                  s = -bit;
            }

            /* advance by r */
            while (k <= j->spec_end)
            {
               short *p = &data[rjpeg__jpeg_dezigzag[k++]];
               if (*p != 0)
               {
                  /* existing coefficient: consume a refinement bit */
                  if (rjpeg__jpeg_get_bit(j))
                     if ((*p & bit)==0)
                     {
                        if (*p > 0)
                           *p += bit;
                        else
                           *p -= bit;
                     }
               }
               else
               {
                  /* zero coefficient: count down the run, then place the new value */
                  if (r == 0)
                  {
                     *p = (short) s;
                     break;
                  }
                  --r;
               }
            }
         } while (k <= j->spec_end);
      }
   }
   return 1;
}
851
/* Take an out-of-range IDCT result and clamp it to the pixel range 0..255. */
static INLINE uint8_t rjpeg__clamp(int x)
{
   /* trick to use a single test to catch both cases: as unsigned, any
    * negative x becomes huge, so one comparison flags both directions */
   if ((unsigned int) x > 255)
   {
      /* fix: negative values must clamp to 0 (black), not fall through
       * to 255 (white) — matches upstream stb_image's stbi__clamp */
      if (x < 0)
         return 0;
      return 255;
   }
   return (uint8_t) x;
}
860
861
/* derived from jidctint -- DCT_ISLOW */
/* One 8-point 1-D IDCT stage in fixed point (12 fractional bits via
 * rjpeg__f2f). Declares working locals t0..t3, p1..p5, x0..x3 in the
 * enclosing scope; on completion the even-part sums are in x0..x3 and the
 * odd part in t0..t3, so outputs are (x0+t3, x1+t2, ... , x0-t3). */
#define RJPEG__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
   p2 = s2; \
   p3 = s6; \
   p1 = (p2+p3) * rjpeg__f2f(0.5411961f); \
   t2 = p1 + p3*rjpeg__f2f(-1.847759065f); \
   t3 = p1 + p2*rjpeg__f2f( 0.765366865f); \
   p2 = s0; \
   p3 = s4; \
   t0 = rjpeg__fsh(p2+p3); \
   t1 = rjpeg__fsh(p2-p3); \
   x0 = t0+t3; \
   x3 = t0-t3; \
   x1 = t1+t2; \
   x2 = t1-t2; \
   t0 = s7; \
   t1 = s5; \
   t2 = s3; \
   t3 = s1; \
   p3 = t0+t2; \
   p4 = t1+t3; \
   p1 = t0+t3; \
   p2 = t1+t2; \
   p5 = (p3+p4)*rjpeg__f2f( 1.175875602f); \
   t0 = t0*rjpeg__f2f( 0.298631336f); \
   t1 = t1*rjpeg__f2f( 2.053119869f); \
   t2 = t2*rjpeg__f2f( 3.072711026f); \
   t3 = t3*rjpeg__f2f( 1.501321110f); \
   p1 = p5 + p1*rjpeg__f2f(-0.899976223f); \
   p2 = p5 + p2*rjpeg__f2f(-2.562915447f); \
   p3 = p3*rjpeg__f2f(-1.961570560f); \
   p4 = p4*rjpeg__f2f(-0.390180644f); \
   t3 += p1+p4; \
   t2 += p2+p3; \
   t1 += p2+p4; \
   t0 += p1+p3;
899
/* Generic 2-D 8x8 inverse DCT: a column pass into the intermediate 'val'
 * array (keeping 2 extra precision bits), then a row pass writing clamped
 * 0..255 samples to 'out' (one row per 'out_stride' bytes). */
static void rjpeg__idct_block(uint8_t *out, int out_stride, short data[64])
{
   int i,val[64],*v=val;
   uint8_t *o = NULL;
   int16_t *d = data;

   /* columns */
   for (i=0; i < 8; ++i,++d, ++v)
   {
      /* if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing */
      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
            && d[40]==0 && d[48]==0 && d[56]==0)
      {
         /* no shortcut                 0     seconds
          * (1|2|3|4|5|6|7)==0          0     seconds
          * all separate               -0.047 seconds
          * 1 && 2|3 && 4|5 && 6|7:    -0.047 seconds */
         int dcterm = d[0] << 2; /* DC-only column: every output is the scaled DC */
         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
      }
      else
      {
         RJPEG__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
         /* constants scaled things up by 1<<12; let's bring them back
          * down, but keep 2 extra bits of precision */
         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
         v[ 0] = (x0+t3) >> 10;
         v[56] = (x0-t3) >> 10;
         v[ 8] = (x1+t2) >> 10;
         v[48] = (x1-t2) >> 10;
         v[16] = (x2+t1) >> 10;
         v[40] = (x2-t1) >> 10;
         v[24] = (x3+t0) >> 10;
         v[32] = (x3-t0) >> 10;
      }
   }

   for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride)
   {
      /* no fast case since the first 1D IDCT spread components out */
      RJPEG__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
      /* constants scaled things up by 1<<12, plus we had 1<<2 from first
       * loop, plus horizontal and vertical each scale by sqrt(8) so together
       * we've got an extra 1<<3, so 1<<17 total we need to remove.
       * so we want to round that, which means adding 0.5 * 1<<17,
       * aka 65536. Also, we'll end up with -128 to 127 that we want
       * to encode as 0..255 by adding 128, so we'll add that before the shift */
      x0 += 65536 + (128<<17);
      x1 += 65536 + (128<<17);
      x2 += 65536 + (128<<17);
      x3 += 65536 + (128<<17);
      /* tried computing the shifts into temps, or'ing the temps to see
       * if any were out of range, but that was slower */
      o[0] = rjpeg__clamp((x0+t3) >> 17);
      o[7] = rjpeg__clamp((x0-t3) >> 17);
      o[1] = rjpeg__clamp((x1+t2) >> 17);
      o[6] = rjpeg__clamp((x1-t2) >> 17);
      o[2] = rjpeg__clamp((x2+t1) >> 17);
      o[5] = rjpeg__clamp((x2-t1) >> 17);
      o[3] = rjpeg__clamp((x3+t0) >> 17);
      o[4] = rjpeg__clamp((x3-t0) >> 17);
   }
}
963
964 #if defined(__SSE2__)
965 /* sse2 integer IDCT. not the fastest possible implementation but it
966 * produces bit-identical results to the generic C version so it's
967 * fully "transparent".
968 */
rjpeg__idct_simd(uint8_t * out,int out_stride,short data[64])969 static void rjpeg__idct_simd(uint8_t *out, int out_stride, short data[64])
970 {
971 /* This is constructed to match our regular (generic) integer IDCT exactly. */
972 __m128i row0, row1, row2, row3, row4, row5, row6, row7;
973 __m128i tmp;
974
975 /* dot product constant: even elems=x, odd elems=y */
976 #define dct_const(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
977
978 /* out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit)
979 * out(1) = c1[even]*x + c1[odd]*y
980 */
981 #define dct_rot(out0,out1, x,y,c0,c1) \
982 __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
983 __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
984 __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
985 __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
986 __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
987 __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
988
989 /* out = in << 12 (in 16-bit, out 32-bit) */
990 #define dct_widen(out, in) \
991 __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
992 __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
993
994 /* wide add */
995 #define dct_wadd(out, a, b) \
996 __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
997 __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
998
999 /* wide sub */
1000 #define dct_wsub(out, a, b) \
1001 __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
1002 __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
1003
1004 /* butterfly a/b, add bias, then shift by "s" and pack */
1005 #define dct_bfly32o(out0, out1, a,b,bias,s) \
1006 { \
1007 __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
1008 __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
1009 dct_wadd(sum, abiased, b); \
1010 dct_wsub(dif, abiased, b); \
1011 out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
1012 out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
1013 }
1014
1015 /* 8-bit interleave step (for transposes) */
1016 #define dct_interleave8(a, b) \
1017 tmp = a; \
1018 a = _mm_unpacklo_epi8(a, b); \
1019 b = _mm_unpackhi_epi8(tmp, b)
1020
1021 /* 16-bit interleave step (for transposes) */
1022 #define dct_interleave16(a, b) \
1023 tmp = a; \
1024 a = _mm_unpacklo_epi16(a, b); \
1025 b = _mm_unpackhi_epi16(tmp, b)
1026
1027 #define dct_pass(bias,shift) \
1028 { \
1029 /* even part */ \
1030 dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
1031 __m128i sum04 = _mm_add_epi16(row0, row4); \
1032 __m128i dif04 = _mm_sub_epi16(row0, row4); \
1033 dct_widen(t0e, sum04); \
1034 dct_widen(t1e, dif04); \
1035 dct_wadd(x0, t0e, t3e); \
1036 dct_wsub(x3, t0e, t3e); \
1037 dct_wadd(x1, t1e, t2e); \
1038 dct_wsub(x2, t1e, t2e); \
1039 /* odd part */ \
1040 dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
1041 dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
1042 __m128i sum17 = _mm_add_epi16(row1, row7); \
1043 __m128i sum35 = _mm_add_epi16(row3, row5); \
1044 dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
1045 dct_wadd(x4, y0o, y4o); \
1046 dct_wadd(x5, y1o, y5o); \
1047 dct_wadd(x6, y2o, y5o); \
1048 dct_wadd(x7, y3o, y4o); \
1049 dct_bfly32o(row0,row7, x0,x7,bias,shift); \
1050 dct_bfly32o(row1,row6, x1,x6,bias,shift); \
1051 dct_bfly32o(row2,row5, x2,x5,bias,shift); \
1052 dct_bfly32o(row3,row4, x3,x4,bias,shift); \
1053 }
1054
1055 __m128i rot0_0 = dct_const(rjpeg__f2f(0.5411961f), rjpeg__f2f(0.5411961f) + rjpeg__f2f(-1.847759065f));
1056 __m128i rot0_1 = dct_const(rjpeg__f2f(0.5411961f) + rjpeg__f2f( 0.765366865f), rjpeg__f2f(0.5411961f));
1057 __m128i rot1_0 = dct_const(rjpeg__f2f(1.175875602f) + rjpeg__f2f(-0.899976223f), rjpeg__f2f(1.175875602f));
1058 __m128i rot1_1 = dct_const(rjpeg__f2f(1.175875602f), rjpeg__f2f(1.175875602f) + rjpeg__f2f(-2.562915447f));
1059 __m128i rot2_0 = dct_const(rjpeg__f2f(-1.961570560f) + rjpeg__f2f( 0.298631336f), rjpeg__f2f(-1.961570560f));
1060 __m128i rot2_1 = dct_const(rjpeg__f2f(-1.961570560f), rjpeg__f2f(-1.961570560f) + rjpeg__f2f( 3.072711026f));
1061 __m128i rot3_0 = dct_const(rjpeg__f2f(-0.390180644f) + rjpeg__f2f( 2.053119869f), rjpeg__f2f(-0.390180644f));
1062 __m128i rot3_1 = dct_const(rjpeg__f2f(-0.390180644f), rjpeg__f2f(-0.390180644f) + rjpeg__f2f( 1.501321110f));
1063
1064 /* rounding biases in column/row passes, see rjpeg__idct_block for explanation. */
1065 __m128i bias_0 = _mm_set1_epi32(512);
1066 __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
1067
1068 /* load */
1069 row0 = _mm_load_si128((const __m128i *) (data + 0*8));
1070 row1 = _mm_load_si128((const __m128i *) (data + 1*8));
1071 row2 = _mm_load_si128((const __m128i *) (data + 2*8));
1072 row3 = _mm_load_si128((const __m128i *) (data + 3*8));
1073 row4 = _mm_load_si128((const __m128i *) (data + 4*8));
1074 row5 = _mm_load_si128((const __m128i *) (data + 5*8));
1075 row6 = _mm_load_si128((const __m128i *) (data + 6*8));
1076 row7 = _mm_load_si128((const __m128i *) (data + 7*8));
1077
1078 /* column pass */
1079 dct_pass(bias_0, 10);
1080
1081 {
1082 /* 16bit 8x8 transpose pass 1 */
1083 dct_interleave16(row0, row4);
1084 dct_interleave16(row1, row5);
1085 dct_interleave16(row2, row6);
1086 dct_interleave16(row3, row7);
1087
1088 /* transpose pass 2 */
1089 dct_interleave16(row0, row2);
1090 dct_interleave16(row1, row3);
1091 dct_interleave16(row4, row6);
1092 dct_interleave16(row5, row7);
1093
1094 /* transpose pass 3 */
1095 dct_interleave16(row0, row1);
1096 dct_interleave16(row2, row3);
1097 dct_interleave16(row4, row5);
1098 dct_interleave16(row6, row7);
1099 }
1100
1101 /* row pass */
1102 dct_pass(bias_1, 17);
1103
1104 {
1105 /* pack */
1106 __m128i p0 = _mm_packus_epi16(row0, row1); /* a0a1a2a3...a7b0b1b2b3...b7 */
1107 __m128i p1 = _mm_packus_epi16(row2, row3);
1108 __m128i p2 = _mm_packus_epi16(row4, row5);
1109 __m128i p3 = _mm_packus_epi16(row6, row7);
1110
1111 /* 8bit 8x8 transpose pass 1 */
1112 dct_interleave8(p0, p2); /* a0e0a1e1... */
1113 dct_interleave8(p1, p3); /* c0g0c1g1... */
1114
1115 /* transpose pass 2 */
1116 dct_interleave8(p0, p1); /* a0c0e0g0... */
1117 dct_interleave8(p2, p3); /* b0d0f0h0... */
1118
1119 /* transpose pass 3 */
1120 dct_interleave8(p0, p2); /* a0b0c0d0... */
1121 dct_interleave8(p1, p3); /* a4b4c4d4... */
1122
1123 /* store */
1124 _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
1125 _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
1126 _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
1127 _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
1128 _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
1129 _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
1130 _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
1131 _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
1132 }
1133
1134 #undef dct_const
1135 #undef dct_rot
1136 #undef dct_widen
1137 #undef dct_wadd
1138 #undef dct_wsub
1139 #undef dct_bfly32o
1140 #undef dct_interleave8
1141 #undef dct_interleave16
1142 #undef dct_pass
1143 }
1144
1145 #endif
1146
1147 #ifdef RJPEG_NEON
1148
1149 /* NEON integer IDCT. should produce bit-identical
1150 * results to the generic C version. */
/* NEON fixed-point inverse DCT for a single 8x8 block.
 *
 * out:        pointer to the top-left byte of the 8x8 output region
 * out_stride: distance in bytes between consecutive output rows
 * data:       64 dequantized DCT coefficients, row-major
 *
 * Structure: a 1-D pass over the columns, a 16-bit 8x8 transpose,
 * a 1-D pass over the rows, then saturating pack to unsigned 8-bit
 * and an 8-bit 8x8 transpose before storing row by row. */
static void rjpeg__idct_simd(uint8_t *out, int out_stride, short data[64])
{
   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;

   /* rotation constants, converted from float to fixed point by
    * rjpeg__f2f (defined earlier in this file). */
   int16x4_t rot0_0 = vdup_n_s16(rjpeg__f2f(0.5411961f));
   int16x4_t rot0_1 = vdup_n_s16(rjpeg__f2f(-1.847759065f));
   int16x4_t rot0_2 = vdup_n_s16(rjpeg__f2f( 0.765366865f));
   int16x4_t rot1_0 = vdup_n_s16(rjpeg__f2f( 1.175875602f));
   int16x4_t rot1_1 = vdup_n_s16(rjpeg__f2f(-0.899976223f));
   int16x4_t rot1_2 = vdup_n_s16(rjpeg__f2f(-2.562915447f));
   int16x4_t rot2_0 = vdup_n_s16(rjpeg__f2f(-1.961570560f));
   int16x4_t rot2_1 = vdup_n_s16(rjpeg__f2f(-0.390180644f));
   int16x4_t rot3_0 = vdup_n_s16(rjpeg__f2f( 0.298631336f));
   int16x4_t rot3_1 = vdup_n_s16(rjpeg__f2f( 2.053119869f));
   int16x4_t rot3_2 = vdup_n_s16(rjpeg__f2f( 3.072711026f));
   int16x4_t rot3_3 = vdup_n_s16(rjpeg__f2f( 1.501321110f));

/* widening 16x16 -> 32-bit multiply of the low and high halves of inq */
#define dct_long_mul(out, inq, coeff) \
   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)

/* widening multiply-accumulate onto an existing 32-bit pair */
#define dct_long_mac(out, acc, inq, coeff) \
   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)

/* widen 16-bit lanes to 32-bit with a 12-bit left shift (fixed-point scale) */
#define dct_widen(out, inq) \
   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)

/* wide add */
#define dct_wadd(out, a, b) \
   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)

/* wide sub */
#define dct_wsub(out, a, b) \
   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)

/* butterfly a/b, then shift using "shiftop" by "s" and pack */
#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
   { \
      dct_wadd(sum, a, b); \
      dct_wsub(dif, a, b); \
      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
   }

/* one full 1-D IDCT pass over the 8 rowN registers; "shiftop"/"shift"
 * select the descaling (rounding narrow for the column pass, plain
 * narrow for the row pass, see call sites below). */
#define dct_pass(shiftop, shift) \
   { \
      /* even part */ \
      int16x8_t sum26 = vaddq_s16(row2, row6); \
      dct_long_mul(p1e, sum26, rot0_0); \
      dct_long_mac(t2e, p1e, row6, rot0_1); \
      dct_long_mac(t3e, p1e, row2, rot0_2); \
      int16x8_t sum04 = vaddq_s16(row0, row4); \
      int16x8_t dif04 = vsubq_s16(row0, row4); \
      dct_widen(t0e, sum04); \
      dct_widen(t1e, dif04); \
      dct_wadd(x0, t0e, t3e); \
      dct_wsub(x3, t0e, t3e); \
      dct_wadd(x1, t1e, t2e); \
      dct_wsub(x2, t1e, t2e); \
      /* odd part */ \
      int16x8_t sum15 = vaddq_s16(row1, row5); \
      int16x8_t sum17 = vaddq_s16(row1, row7); \
      int16x8_t sum35 = vaddq_s16(row3, row5); \
      int16x8_t sum37 = vaddq_s16(row3, row7); \
      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
      dct_long_mul(p5o, sumodd, rot1_0); \
      dct_long_mac(p1o, p5o, sum17, rot1_1); \
      dct_long_mac(p2o, p5o, sum35, rot1_2); \
      dct_long_mul(p3o, sum37, rot2_0); \
      dct_long_mul(p4o, sum15, rot2_1); \
      dct_wadd(sump13o, p1o, p3o); \
      dct_wadd(sump24o, p2o, p4o); \
      dct_wadd(sump23o, p2o, p3o); \
      dct_wadd(sump14o, p1o, p4o); \
      dct_long_mac(x4, sump13o, row7, rot3_0); \
      dct_long_mac(x5, sump24o, row5, rot3_1); \
      dct_long_mac(x6, sump23o, row3, rot3_2); \
      dct_long_mac(x7, sump14o, row1, rot3_3); \
      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
   }

   /* load */
   row0 = vld1q_s16(data + 0*8);
   row1 = vld1q_s16(data + 1*8);
   row2 = vld1q_s16(data + 2*8);
   row3 = vld1q_s16(data + 3*8);
   row4 = vld1q_s16(data + 4*8);
   row5 = vld1q_s16(data + 5*8);
   row6 = vld1q_s16(data + 6*8);
   row7 = vld1q_s16(data + 7*8);

   /* add DC bias (into lane 0 of row0 only) */
   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));

   /* column pass */
   dct_pass(vrshrn_n_s32, 10);

   /* 16bit 8x8 transpose */
   {
/* these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
 * whether compilers actually get this is another story, sadly. */
#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }

      /* pass 1 */
      dct_trn16(row0, row1); /* a0b0a2b2a4b4a6b6 */
      dct_trn16(row2, row3);
      dct_trn16(row4, row5);
      dct_trn16(row6, row7);

      /* pass 2 */
      dct_trn32(row0, row2); /* a0b0c0d0a4b4c4d4 */
      dct_trn32(row1, row3);
      dct_trn32(row4, row6);
      dct_trn32(row5, row7);

      /* pass 3 */
      dct_trn64(row0, row4); /* a0b0c0d0e0f0g0h0 */
      dct_trn64(row1, row5);
      dct_trn64(row2, row6);
      dct_trn64(row3, row7);

#undef dct_trn16
#undef dct_trn32
#undef dct_trn64
   }

   /* row pass
    * vrshrn_n_s32 only supports shifts up to 16, we need
    * 17. so do a non-rounding shift of 16 first then follow
    * up with a rounding shift by 1. */
   dct_pass(vshrn_n_s32, 16);

   {
      /* pack and round (the extra rounding shift by 1, see above) */
      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);

/* again, these can translate into one instruction, but often don't. */
#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }

      /* sadly can't use interleaved stores here since we only write
       * 8 bytes to each scan line! */

      /* 8x8 8-bit transpose pass 1 */
      dct_trn8_8(p0, p1);
      dct_trn8_8(p2, p3);
      dct_trn8_8(p4, p5);
      dct_trn8_8(p6, p7);

      /* pass 2 */
      dct_trn8_16(p0, p2);
      dct_trn8_16(p1, p3);
      dct_trn8_16(p4, p6);
      dct_trn8_16(p5, p7);

      /* pass 3 */
      dct_trn8_32(p0, p4);
      dct_trn8_32(p1, p5);
      dct_trn8_32(p2, p6);
      dct_trn8_32(p3, p7);

      /* store one 8-byte output row at a time */
      vst1_u8(out, p0); out += out_stride;
      vst1_u8(out, p1); out += out_stride;
      vst1_u8(out, p2); out += out_stride;
      vst1_u8(out, p3); out += out_stride;
      vst1_u8(out, p4); out += out_stride;
      vst1_u8(out, p5); out += out_stride;
      vst1_u8(out, p6); out += out_stride;
      vst1_u8(out, p7);

#undef dct_trn8_8
#undef dct_trn8_16
#undef dct_trn8_32
   }

#undef dct_long_mul
#undef dct_long_mac
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_pass
}
1352
1353 #endif /* RJPEG_NEON */
1354
/* Fetch the next JPEG marker code.
 * If a marker was consumed ahead of time (cached in j->marker by the
 * entropy decoder), hand that back first. Otherwise read from the
 * stream: a marker is one or more 0xff fill bytes followed by the
 * marker code. Returns RJPEG__MARKER_none when the stream is not
 * positioned at a marker. */
static uint8_t rjpeg__get_marker(rjpeg__jpeg *j)
{
   uint8_t c;

   /* hand back a pending cached marker, if any */
   if (j->marker != RJPEG__MARKER_none)
   {
      c         = j->marker;
      j->marker = RJPEG__MARKER_none;
      return c;
   }

   c = rjpeg__get8(j->s);
   if (c != 0xff)
      return RJPEG__MARKER_none;

   /* skip 0xff fill bytes; the first non-0xff byte is the marker code */
   do
   {
      c = rjpeg__get8(j->s);
   } while (c == 0xff);

   return c;
}
1372
1373
1374 /* after a restart interval, rjpeg__jpeg_reset the entropy decoder and
1375 * the dc prediction
1376 */
/* Reset the entropy decoder and the DC predictions; used at the start
 * of a scan and after every restart (RSTn) marker. */
static void rjpeg__jpeg_reset(rjpeg__jpeg *j)
{
   j->code_bits           = 0;
   j->code_buffer         = 0;
   j->nomore              = 0;
   j->marker              = RJPEG__MARKER_none;
   j->eob_run             = 0;
   j->img_comp[0].dc_pred = 0;
   j->img_comp[1].dc_pred = 0;
   j->img_comp[2].dc_pred = 0;
   /* no more than 1<<31 MCUs if no restart_interal? that's plenty safe,
    * since we don't even allow 1<<30 pixels */
   j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
}
1389
/* Parse one scan's worth of entropy-coded data.
 * Baseline (sequential) scans decode each 8x8 block and IDCT it
 * directly into the component's output plane; progressive scans only
 * accumulate coefficients into img_comp[].coeff, which are IDCT'd
 * later by rjpeg__jpeg_finish().
 * Within each mode, scan_n == 1 means a non-interleaved scan (one
 * component, blocks in scanline order); otherwise blocks of all scan
 * components are interleaved per MCU.
 * Returns 1 on success -- including an early bail-out when an expected
 * restart marker is missing, which yields corrupt-but-usable data --
 * and 0 on a decode error. */
static int rjpeg__parse_entropy_coded_data(rjpeg__jpeg *z)
{
   rjpeg__jpeg_reset(z);
   if (!z->progressive)
   {
      if (z->scan_n == 1)
      {
         int i,j;
         /* coefficient scratch block, aligned for the SIMD IDCT kernels */
         RJPEG_SIMD_ALIGN(short, data[64]);
         int n = z->order[0];
         /* non-interleaved data, we just need to process one block at a time,
          * in trivial scanline order
          * number of blocks to do just depends on how many actual "pixels" this
          * component has, independent of interleaved MCU blocking and such */
         int w = (z->img_comp[n].x+7) >> 3;
         int h = (z->img_comp[n].y+7) >> 3;

         for (j=0; j < h; ++j)
         {
            for (i=0; i < w; ++i)
            {
               int ha = z->img_comp[n].ha;
               if (!rjpeg__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
               /* every data block is an MCU, so countdown the restart interval */
               if (--z->todo <= 0)
               {
                  if (z->code_bits < 24) rjpeg__grow_buffer_unsafe(z);
                  /* if it's NOT a restart, then just bail,
                   * so we get corrupt data rather than no data */
                  if (!RJPEG__RESTART(z->marker)) return 1;
                  rjpeg__jpeg_reset(z);
               }
            }
         }
      }
      else
      {
         /* interleaved */
         int i,j,k,x,y;
         RJPEG_SIMD_ALIGN(short, data[64]);
         for (j=0; j < z->img_mcu_y; ++j)
         {
            for (i=0; i < z->img_mcu_x; ++i)
            {
               /* scan an interleaved mcu...
                * process scan_n components in order */
               for (k=0; k < z->scan_n; ++k)
               {
                  int n = z->order[k];
                  /* scan out an mcu's worth of this component;
                   * that's just determined by the basic H
                   * and V specified for the component */
                  for (y=0; y < z->img_comp[n].v; ++y)
                  {
                     for (x=0; x < z->img_comp[n].h; ++x)
                     {
                        /* (x2,y2): pixel position of this block inside the plane */
                        int x2 = (i*z->img_comp[n].h + x)*8;
                        int y2 = (j*z->img_comp[n].v + y)*8;
                        int ha = z->img_comp[n].ha;
                        if (!rjpeg__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq]))
                           return 0;
                        z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data);
                     }
                  }
               }
               /* after all interleaved components, that's an interleaved MCU,
                * so now count down the restart interval */
               if (--z->todo <= 0)
               {
                  if (z->code_bits < 24) rjpeg__grow_buffer_unsafe(z);
                  if (!RJPEG__RESTART(z->marker)) return 1;
                  rjpeg__jpeg_reset(z);
               }
            }
         }
      }
      return 1;
   }
   else
   {
      if (z->scan_n == 1)
      {
         int i,j;
         int n = z->order[0];
         int w = (z->img_comp[n].x+7) >> 3;
         int h = (z->img_comp[n].y+7) >> 3;

         /* non-interleaved data, we just need to process one block at a time,
          * in trivial scanline order
          * number of blocks to do just depends on how many actual "pixels" this
          * component has, independent of interleaved MCU blocking and such */

         for (j=0; j < h; ++j)
         {
            for (i=0; i < w; ++i)
            {
               /* progressive: decode into the persistent coefficient buffer */
               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
               if (z->spec_start == 0)
               {
                  /* spectral selection starting at 0 => this scan carries DC */
                  if (!rjpeg__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
                     return 0;
               } else {
                  int ha = z->img_comp[n].ha;
                  if (!rjpeg__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
                     return 0;
               }

               /* every data block is an MCU, so countdown the restart interval */
               if (--z->todo <= 0)
               {
                  if (z->code_bits < 24) rjpeg__grow_buffer_unsafe(z);
                  if (!RJPEG__RESTART(z->marker)) return 1;
                  rjpeg__jpeg_reset(z);
               }
            }
         }
      }
      else
      {
         /* interleaved */
         int i,j,k,x,y;

         for (j=0; j < z->img_mcu_y; ++j)
         {
            for (i=0; i < z->img_mcu_x; ++i)
            {
               /* scan an interleaved MCU... process scan_n components in order */
               for (k=0; k < z->scan_n; ++k)
               {
                  int n = z->order[k];
                  /* scan out an MCU's worth of this component; that's just determined
                   * by the basic H and V specified for the component */
                  for (y=0; y < z->img_comp[n].v; ++y)
                  {
                     for (x=0; x < z->img_comp[n].h; ++x)
                     {
                        int x2 = (i*z->img_comp[n].h + x);
                        int y2 = (j*z->img_comp[n].v + y);
                        short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
                        /* only the DC decode is invoked here: interleaved
                         * progressive scans carry DC coefficients only */
                        if (!rjpeg__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
                           return 0;
                     }
                  }
               }
               /* after all interleaved components, that's an interleaved MCU,
                * so now count down the restart interval */
               if (--z->todo <= 0)
               {
                  if (z->code_bits < 24) rjpeg__grow_buffer_unsafe(z);
                  if (!RJPEG__RESTART(z->marker)) return 1;
                  rjpeg__jpeg_reset(z);
               }
            }
         }
      }
      return 1;
   }
}
1549
/* Multiply each of the 64 coefficients in a block by its
 * quantization-table entry (in place). */
static void rjpeg__jpeg_dequantize(short *data, uint8_t *dequant)
{
   int k;
   for (k = 63; k >= 0; --k)
      data[k] = (short)(data[k] * dequant[k]);
}
1556
/* Finish a progressive decode: once all scans have been parsed,
 * dequantize the accumulated coefficients and IDCT every 8x8 block of
 * every component into its output plane. Baseline images are IDCT'd
 * on the fly, so this is a no-op for them. */
static void rjpeg__jpeg_finish(rjpeg__jpeg *z)
{
   int comp;

   if (!z->progressive)
      return;

   /* dequantize and IDCT the data */
   for (comp = 0; comp < z->s->img_n; ++comp)
   {
      int bx, by;
      int blocks_w = (z->img_comp[comp].x + 7) >> 3;
      int blocks_h = (z->img_comp[comp].y + 7) >> 3;

      for (by = 0; by < blocks_h; ++by)
      {
         for (bx = 0; bx < blocks_w; ++bx)
         {
            short *coeffs = z->img_comp[comp].coeff
               + 64 * (bx + by * z->img_comp[comp].coeff_w);
            rjpeg__jpeg_dequantize(coeffs, z->dequant[z->img_comp[comp].tq]);
            z->idct_block_kernel(
                  z->img_comp[comp].data + z->img_comp[comp].w2 * by * 8 + bx * 8,
                  z->img_comp[comp].w2, coeffs);
         }
      }
   }
}
1579
/* Process a single marker segment that is not SOI/SOF/SOS/EOI.
 * m is the marker code (without its 0xff prefix).
 * DRI sets the restart interval; DQT loads quantization tables (stored
 * in zig-zag order, 8-bit entries only); DHT builds huffman decode
 * tables plus the fast-AC lookup for AC tables. APPn (0xE0-0xEF) and
 * COM (0xFE) segments are skipped wholesale.
 * Returns 1 on success, 0 on error or on an unrecognized marker. */
static int rjpeg__process_marker(rjpeg__jpeg *z, int m)
{
   int L;   /* remaining payload bytes of the current segment */
   switch (m)
   {
      case RJPEG__MARKER_none: /* no marker found */
         return rjpeg__err("expected marker","Corrupt JPEG");

      case 0xDD: /* DRI - specify restart interval */
         /* payload is exactly one 16-bit value (length field counts itself) */
         if (rjpeg__get16be(z->s) != 4) return rjpeg__err("bad DRI len","Corrupt JPEG");
         z->restart_interval = rjpeg__get16be(z->s);
         return 1;

      case 0xDB: /* DQT - define quantization table */
         L = rjpeg__get16be(z->s)-2;
         /* a single DQT segment may hold several 65-byte tables */
         while (L > 0)
         {
            int q = rjpeg__get8(z->s);
            int p = q >> 4;      /* precision nibble: only 0 (8-bit) supported */
            int t = q & 15,i;    /* table id (0..3) */
            if (p != 0)
               return rjpeg__err("bad DQT type","Corrupt JPEG");
            if (t > 3)
               return rjpeg__err("bad DQT table","Corrupt JPEG");
            /* entries arrive in zig-zag order; de-zigzag while loading */
            for (i=0; i < 64; ++i)
               z->dequant[t][rjpeg__jpeg_dezigzag[i]] = rjpeg__get8(z->s);
            L -= 65;
         }
         /* L != 0 means the declared length didn't match the table data */
         return L==0;

      case 0xC4: /* DHT - define huffman table */
         L = rjpeg__get16be(z->s)-2;
         while (L > 0)
         {
            /* sizes[k]: number of codes of bit length k+1; n: total codes */
            int sizes[16],i,n=0;
            uint8_t *v = NULL;
            int q = rjpeg__get8(z->s);
            int tc = q >> 4;     /* table class: 0 = DC, 1 = AC */
            int th = q & 15;     /* table id (0..3) */
            if (tc > 1 || th > 3)
               return rjpeg__err("bad DHT header","Corrupt JPEG");

            for (i=0; i < 16; ++i)
            {
               sizes[i] = rjpeg__get8(z->s);
               n += sizes[i];
            }
            L -= 17;

            if (tc == 0)
            {
               if (!rjpeg__build_huffman(z->huff_dc+th, sizes))
                  return 0;
               v = z->huff_dc[th].values;
            }
            else
            {
               if (!rjpeg__build_huffman(z->huff_ac+th, sizes))
                  return 0;
               v = z->huff_ac[th].values;
            }
            /* read the n symbol values the codes decode to */
            for (i=0; i < n; ++i)
               v[i] = rjpeg__get8(z->s);
            /* AC tables additionally get a fast-path lookup table */
            if (tc != 0)
               rjpeg__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
            L -= n;
         }
         return L==0;
   }

   /* check for comment block or APP blocks */
   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE)
   {
      rjpeg__skip(z->s, rjpeg__get16be(z->s)-2);
      return 1;
   }
   return 0;
}
1658
1659 /* after we see SOS */
/* Parse an SOS (start of scan) segment.
 * Reads the list of components participating in this scan (recording
 * their order in z->order[] and their DC/AC huffman table ids), then
 * the spectral-selection and successive-approximation parameters.
 * Those parameters are validated for progressive images and required
 * to be the fixed baseline values otherwise.
 * Returns 1 on success, 0 (via rjpeg__err) on a malformed header. */
static int rjpeg__process_scan_header(rjpeg__jpeg *z)
{
   int i;
   int Ls = rjpeg__get16be(z->s);   /* segment length */

   z->scan_n = rjpeg__get8(z->s);   /* number of components in this scan */

   if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n)
      return rjpeg__err("bad SOS component count","Corrupt JPEG");
   if (Ls != 6+2*z->scan_n)
      return rjpeg__err("bad SOS len","Corrupt JPEG");

   for (i=0; i < z->scan_n; ++i)
   {
      int id = rjpeg__get8(z->s), which;
      int q = rjpeg__get8(z->s);

      /* match the scan component id against the frame's components */
      for (which = 0; which < z->s->img_n; ++which)
         if (z->img_comp[which].id == id)
            break;
      if (which == z->s->img_n)
         return 0; /* no match */

      /* high nibble: DC table id, low nibble: AC table id */
      z->img_comp[which].hd = q >> 4; if (z->img_comp[which].hd > 3)
         return rjpeg__err("bad DC huff","Corrupt JPEG");
      z->img_comp[which].ha = q & 15; if (z->img_comp[which].ha > 3)
         return rjpeg__err("bad AC huff","Corrupt JPEG");
      z->order[i] = which;
   }

   {
      int aa;
      z->spec_start = rjpeg__get8(z->s);
      z->spec_end = rjpeg__get8(z->s); /* should be 63, but might be 0 */
      aa = rjpeg__get8(z->s);
      z->succ_high = (aa >> 4);
      z->succ_low = (aa & 15);
      if (z->progressive)
      {
         if ( z->spec_start > 63 ||
               z->spec_end > 63 ||
               z->spec_start > z->spec_end ||
               z->succ_high > 13 ||
               z->succ_low > 13)
            return rjpeg__err("bad SOS", "Corrupt JPEG");
      }
      else
      {
         /* baseline: full spectral range, no successive approximation */
         if (z->spec_start != 0)
            return rjpeg__err("bad SOS","Corrupt JPEG");
         if (z->succ_high != 0 || z->succ_low != 0)
            return rjpeg__err("bad SOS","Corrupt JPEG");
         z->spec_end = 63;
      }
   }

   return 1;
}
1718
/* Parse an SOF (start of frame) segment: image dimensions, component
 * count, and per-component sampling factors / quantization-table ids.
 * When scan == RJPEG_SCAN_LOAD, additionally computes the interleaved
 * MCU layout and allocates the IDCT-aligned output plane for every
 * component -- plus the coefficient buffers for progressive images.
 * Returns 1 on success, 0 (via rjpeg__err) on failure; on an
 * out-of-memory failure all buffers allocated here are released.
 *
 * Fix vs. the previous version: the progressive raw_coeff malloc was
 * unchecked, so a failed allocation produced a NULL-derived coeff
 * pointer that was later written through; the OOM cleanup also leaked
 * raw_coeff of already-allocated components. Both paths now funnel
 * through one checked cleanup. */
static int rjpeg__process_frame_header(rjpeg__jpeg *z, int scan)
{
   rjpeg__context *s = z->s;
   int Lf,p,i,q, h_max=1,v_max=1,c;
   Lf = rjpeg__get16be(s);

   /* JPEG */
   if (Lf < 11)
      return rjpeg__err("bad SOF len","Corrupt JPEG");

   p = rjpeg__get8(s);

   /* JPEG baseline: 8-bit sample precision only */
   if (p != 8)
      return rjpeg__err("only 8-bit","JPEG format not supported: 8-bit only");

   s->img_y = rjpeg__get16be(s);

   /* Legal, but we don't handle it--but neither does IJG */
   if (s->img_y == 0)
      return rjpeg__err("no header height", "JPEG format not supported: delayed height");

   s->img_x = rjpeg__get16be(s);

   if (s->img_x == 0)
      return rjpeg__err("0 width","Corrupt JPEG"); /* JPEG requires */

   c = rjpeg__get8(s);

   /* JFIF requires 1 (grey) or 3 (YCbCr) components */
   if (c != 3 && c != 1)
      return rjpeg__err("bad component count","Corrupt JPEG");

   s->img_n = c;

   for (i=0; i < c; ++i)
   {
      z->img_comp[i].data = NULL;
      z->img_comp[i].linebuf = NULL;
   }

   if (Lf != 8+3*s->img_n)
      return rjpeg__err("bad SOF len","Corrupt JPEG");

   for (i=0; i < s->img_n; ++i)
   {
      z->img_comp[i].id = rjpeg__get8(s);
      if (z->img_comp[i].id != i+1) /* JFIF requires */
         if (z->img_comp[i].id != i) /* some version of jpegtran outputs non-JFIF-compliant files! */
            return rjpeg__err("bad component ID","Corrupt JPEG");
      q = rjpeg__get8(s);
      /* high nibble: horizontal sampling factor, low nibble: vertical */
      z->img_comp[i].h = (q >> 4);
      if (!z->img_comp[i].h || z->img_comp[i].h > 4)
         return rjpeg__err("bad H","Corrupt JPEG");
      z->img_comp[i].v = q & 15;
      if (!z->img_comp[i].v || z->img_comp[i].v > 4)
         return rjpeg__err("bad V","Corrupt JPEG");
      z->img_comp[i].tq = rjpeg__get8(s);
      if (z->img_comp[i].tq > 3)
         return rjpeg__err("bad TQ","Corrupt JPEG");
   }

   if (scan != RJPEG_SCAN_LOAD)
      return 1;

   /* reject images whose pixel count could overflow downstream math */
   if ((1 << 30) / s->img_x / s->img_n < s->img_y)
      return rjpeg__err("too large", "Image too large to decode");

   for (i=0; i < s->img_n; ++i)
   {
      if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
      if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
   }

   /* compute interleaved MCU info */
   z->img_h_max = h_max;
   z->img_v_max = v_max;
   z->img_mcu_w = h_max * 8;
   z->img_mcu_h = v_max * 8;
   z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
   z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;

   for (i=0; i < s->img_n; ++i)
   {
      /* number of effective pixels (e.g. for non-interleaved MCU) */
      z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
      z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;

      /* to simplify generation, we'll allocate enough memory to decode
       * the bogus oversized data from using interleaved MCUs and their
       * big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
       * discard the extra data until colorspace conversion */
      z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
      z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;

      /* pre-null so the cleanup path below can free unconditionally */
      z->img_comp[i].raw_coeff = NULL;
      z->img_comp[i].coeff     = NULL;
      z->img_comp[i].linebuf   = NULL;

      z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
      if (z->img_comp[i].raw_data == NULL)
         goto outofmem;

      /* align blocks for IDCT using MMX/SSE */
      z->img_comp[i].data = (uint8_t*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);

      if (z->progressive)
      {
         /* progressive images buffer whole-image coefficients and run
          * the IDCT once all scans are in (rjpeg__jpeg_finish) */
         z->img_comp[i].coeff_w = (z->img_comp[i].w2 + 7) >> 3;
         z->img_comp[i].coeff_h = (z->img_comp[i].h2 + 7) >> 3;
         z->img_comp[i].raw_coeff = malloc(z->img_comp[i].coeff_w * z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);
         if (z->img_comp[i].raw_coeff == NULL)
            goto outofmem;  /* was previously unchecked */
         z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
      }
   }

   return 1;

outofmem:
   /* free everything allocated so far, including the partially set up
    * component i (free(NULL) is a no-op) */
   for (; i >= 0; --i)
   {
      free(z->img_comp[i].raw_data);
      z->img_comp[i].raw_data  = NULL;
      z->img_comp[i].data      = NULL;
      free(z->img_comp[i].raw_coeff);
      z->img_comp[i].raw_coeff = NULL;
      z->img_comp[i].coeff     = NULL;
   }
   return rjpeg__err("outofmem", "Out of memory");
}
1839
1840
/* Parse the JPEG header: the SOI marker, then every marker segment up
 * to and including SOF (start of frame).
 * scan selects the amount of work: RJPEG_SCAN_TYPE stops right after
 * validating the SOI; otherwise the frame header is fully processed.
 * Returns 1 on success, 0 on error. */
static int rjpeg__decode_jpeg_header(rjpeg__jpeg *z, int scan)
{
   int m;

   z->marker = RJPEG__MARKER_none; /* initialize cached marker to empty */

   m = rjpeg__get_marker(z);
   if (!rjpeg__SOI(m))
      return rjpeg__err("no SOI","Corrupt JPEG");

   if (scan == RJPEG_SCAN_TYPE)
      return 1;

   m = rjpeg__get_marker(z);
   while (!rjpeg__SOF(m))
   {
      /* not the frame header yet: handle this marker and keep looking */
      if (!rjpeg__process_marker(z, m))
         return 0;

      m = rjpeg__get_marker(z);
      while (m == RJPEG__MARKER_none)
      {
         /* some files have extra padding after their blocks, so ok, we'll scan */
         if (rjpeg__at_eof(z->s))
            return rjpeg__err("no SOF", "Corrupt JPEG");
         m = rjpeg__get_marker(z);
      }
   }

   z->progressive = rjpeg__SOF_progressive(m);
   if (!rjpeg__process_frame_header(z, scan))
      return 0;
   return 1;
}
1871
1872 /* decode image to YCbCr format */
/* Decode the whole image into the per-component YCbCr planes:
 * header first, then every scan until EOI. Progressive images get a
 * final dequantize+IDCT pass via rjpeg__jpeg_finish().
 * Returns 1 on success, 0 on error. */
static int rjpeg__decode_jpeg_image(rjpeg__jpeg *j)
{
   int m;

   /* make the allocation state well-defined before any early exit */
   for (m = 0; m < 4; m++)
   {
      j->img_comp[m].raw_data  = NULL;
      j->img_comp[m].raw_coeff = NULL;
   }

   j->restart_interval = 0;

   if (!rjpeg__decode_jpeg_header(j, RJPEG_SCAN_LOAD))
      return 0;

   for (m = rjpeg__get_marker(j); !rjpeg__EOI(m); m = rjpeg__get_marker(j))
   {
      if (!rjpeg__SOS(m))
      {
         /* ordinary marker segment between scans */
         if (!rjpeg__process_marker(j, m))
            return 0;
         continue;
      }

      if (!rjpeg__process_scan_header(j))
         return 0;
      if (!rjpeg__parse_entropy_coded_data(j))
         return 0;

      if (j->marker == RJPEG__MARKER_none)
      {
         /* handle 0s at the end of image data from IP Kamera 9060 */
         while (!rjpeg__at_eof(j->s))
         {
            int b = rjpeg__get8(j->s);
            if (b == 255)
            {
               j->marker = rjpeg__get8(j->s);
               break;
            }
            if (b != 0)
               return rjpeg__err("junk before marker", "Corrupt JPEG");
         }
         /* if we reach eof without hitting a marker, rjpeg__get_marker()
          * below will fail and we'll eventually return 0 */
      }
   }

   if (j->progressive)
      rjpeg__jpeg_finish(j);
   return 1;
}
1924
1925 /* static jfif-centered resampling (across block boundaries) */
1926
/* 1:1 "resampler": the input row is already at full resolution, so
 * just hand the near row back untouched. */
static uint8_t *rjpeg_resample_row_1(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
{
   /* only in_near is used */
   (void)hs;
   (void)w;
   (void)in_far;
   (void)out;
   return in_near;
}
1935
/* Vertical 2x upsample: each output sample is a rounded 3:1 blend of
 * the nearer and farther source rows. */
static uint8_t* rjpeg__resample_row_v_2(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
{
   int col;

   (void)hs; /* unused */

   /* need to generate two samples vertically for every one in input */
   for (col = 0; col < w; ++col)
      out[col] = rjpeg__div4(in_near[col] * 3 + in_far[col] + 2);

   return out;
}
1945
/* Horizontal 2x upsample: two output samples per input sample, each a
 * rounded 3:1 blend of a pixel with its left/right neighbor; the row
 * edges replicate. */
static uint8_t* rjpeg__resample_row_h_2(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
{
   int col;
   uint8_t *src = in_near;

   (void)in_far; /* the horizontal pass only reads the near row */
   (void)hs;

   if (w == 1)
   {
      /* a single sample has nothing to interpolate against */
      out[0] = out[1] = src[0];
      return out;
   }

   /* left edge */
   out[0] = src[0];
   out[1] = rjpeg__div4(src[0] * 3 + src[1] + 2);

   for (col = 1; col < w - 1; ++col)
   {
      int center          = 3 * src[col] + 2;
      out[col * 2 + 0] = rjpeg__div4(center + src[col - 1]);
      out[col * 2 + 1] = rjpeg__div4(center + src[col + 1]);
   }

   /* right edge (col == w-1 here) */
   out[col * 2 + 0] = rjpeg__div4(src[w - 2] * 3 + src[w - 1] + 2);
   out[col * 2 + 1] = src[w - 1];

   return out;
}
1976
1977
/* Combined 2x2 upsample: a vertical 3:1 blend of the two source rows
 * followed by a horizontal 3:1 blend of adjacent blended columns. */
static uint8_t *rjpeg__resample_row_hv_2(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
{
   int col;
   int prev, cur;

   (void)hs;

   if (w == 1)
   {
      /* single column: only the vertical blend applies */
      out[0] = out[1] = rjpeg__div4(3 * in_near[0] + in_far[0] + 2);
      return out;
   }

   /* vertically blended first column */
   cur    = 3 * in_near[0] + in_far[0];
   out[0] = rjpeg__div4(cur + 2);

   for (col = 1; col < w; ++col)
   {
      prev = cur;
      cur  = 3 * in_near[col] + in_far[col];
      out[col * 2 - 1] = rjpeg__div16(3 * prev + cur + 8);
      out[col * 2]     = rjpeg__div16(3 * cur + prev + 8);
   }
   out[w * 2 - 1] = rjpeg__div4(cur + 2);

   return out;
}
2003
#if defined(__SSE2__) || defined(RJPEG_NEON)
/* SIMD (SSE2 / NEON) variant of rjpeg__resample_row_hv_2: 2x2 chroma
 * upsampling.  Processes 8 input pixels per vector iteration; the scalar
 * code at the bottom handles the partial tail group and the last pixel. */
static uint8_t *rjpeg__resample_row_hv_2_simd(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
{
   /* need to generate 2x2 samples for every one in input */
   int i=0,t0,t1;

   if (w == 1)
   {
      out[0] = out[1] = rjpeg__div4(3*in_near[0] + in_far[0] + 2);
      return out;
   }

   t1 = 3*in_near[0] + in_far[0];
   /* process groups of 8 pixels for as long as we can.
    * note we can't handle the last pixel in a row in this loop
    * because we need to handle the filter boundary conditions.
    * (the bound (w-1) & ~7 also keeps the in_near[i+8]/in_far[i+8]
    * lookahead reads below inside the row.)
    */
   for (; i < ((w-1) & ~7); i += 8)
   {
#if defined(__SSE2__)
      /* load and perform the vertical filtering pass
       * this uses 3*x + y = 4*x + (y - x) */
      __m128i zero = _mm_setzero_si128();
      __m128i farb = _mm_loadl_epi64((__m128i *) (in_far + i));
      __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
      __m128i farw = _mm_unpacklo_epi8(farb, zero);
      __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
      __m128i diff = _mm_sub_epi16(farw, nearw);
      __m128i nears = _mm_slli_epi16(nearw, 2);
      __m128i curr = _mm_add_epi16(nears, diff); /* current row */

      /* horizontal filter works the same based on shifted vers of current
       * row. "prev" is current row shifted right by 1 pixel; we need to
       * insert the previous pixel value (from t1).
       * "next" is current row shifted left by 1 pixel, with first pixel
       * of next block of 8 pixels added in.
       */
      __m128i prv0 = _mm_slli_si128(curr, 2);
      __m128i nxt0 = _mm_srli_si128(curr, 2);
      __m128i prev = _mm_insert_epi16(prv0, t1, 0);
      __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);

      /* horizontal filter, polyphase implementation since it's convenient:
       * even pixels = 3*cur + prev = cur*4 + (prev - cur)
       * odd pixels = 3*cur + next = cur*4 + (next - cur)
       * note the shared term. */
      __m128i bias = _mm_set1_epi16(8);
      __m128i curs = _mm_slli_epi16(curr, 2);
      __m128i prvd = _mm_sub_epi16(prev, curr);
      __m128i nxtd = _mm_sub_epi16(next, curr);
      __m128i curb = _mm_add_epi16(curs, bias);
      __m128i even = _mm_add_epi16(prvd, curb);
      __m128i odd = _mm_add_epi16(nxtd, curb);

      /* interleave even and odd pixels, then undo scaling. */
      __m128i int0 = _mm_unpacklo_epi16(even, odd);
      __m128i int1 = _mm_unpackhi_epi16(even, odd);
      __m128i de0 = _mm_srli_epi16(int0, 4);
      __m128i de1 = _mm_srli_epi16(int1, 4);

      /* pack and write output */
      __m128i outv = _mm_packus_epi16(de0, de1);
      _mm_storeu_si128((__m128i *) (out + i*2), outv);
#elif defined(RJPEG_NEON)
      /* load and perform the vertical filtering pass
       * this uses 3*x + y = 4*x + (y - x) */
      uint8x8_t farb = vld1_u8(in_far + i);
      uint8x8_t nearb = vld1_u8(in_near + i);
      int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
      int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
      int16x8_t curr = vaddq_s16(nears, diff); /* current row */

      /* horizontal filter works the same based on shifted vers of current
       * row. "prev" is current row shifted right by 1 pixel; we need to
       * insert the previous pixel value (from t1).
       * "next" is current row shifted left by 1 pixel, with first pixel
       * of next block of 8 pixels added in. */
      int16x8_t prv0 = vextq_s16(curr, curr, 7);
      int16x8_t nxt0 = vextq_s16(curr, curr, 1);
      int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
      int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);

      /* horizontal filter, polyphase implementation since it's convenient:
       * even pixels = 3*cur + prev = cur*4 + (prev - cur)
       * odd pixels = 3*cur + next = cur*4 + (next - cur)
       * note the shared term.
       */
      int16x8_t curs = vshlq_n_s16(curr, 2);
      int16x8_t prvd = vsubq_s16(prev, curr);
      int16x8_t nxtd = vsubq_s16(next, curr);
      int16x8_t even = vaddq_s16(curs, prvd);
      int16x8_t odd = vaddq_s16(curs, nxtd);

      /* undo scaling and round, then store with even/odd phases interleaved */
      uint8x8x2_t o;
      o.val[0] = vqrshrun_n_s16(even, 4);
      o.val[1] = vqrshrun_n_s16(odd, 4);
      vst2_u8(out + i*2, o);
#endif

      /* "previous" value for next iteration */
      t1 = 3*in_near[i+7] + in_far[i+7];
   }

   /* scalar tail: i is now the first unprocessed column */
   t0 = t1;
   t1 = 3*in_near[i] + in_far[i];
   out[i*2] = rjpeg__div16(3*t1 + t0 + 8);

   for (++i; i < w; ++i)
   {
      t0 = t1;
      t1 = 3*in_near[i]+in_far[i];
      out[i*2-1] = rjpeg__div16(3*t0 + t1 + 8);
      out[i*2 ] = rjpeg__div16(3*t1 + t0 + 8);
   }
   out[w*2-1] = rjpeg__div4(t1+2);

   (void)hs;

   return out;
}
#endif
2126
/* Fallback resampler: nearest-neighbor, replicating each of the w input
 * samples hs times into out.  The far row is unused. */
static uint8_t *rjpeg__resample_row_generic(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
{
   int col, rep;

   (void)in_far;

   for (col = 0; col < w; ++col)
   {
      uint8_t sample = in_near[col];
      for (rep = 0; rep < hs; ++rep)
         out[col*hs + rep] = sample;
   }
   return out;
}
2138
/* this is a reduced-precision calculation of YCbCr-to-RGB introduced
 * to make sure the code produces the same results in both SIMD and scalar */
#ifndef float2fixed
#define float2fixed(x) (((int) ((x) * 4096.0f + 0.5f)) << 8)
#endif

/* Scalar JFIF YCbCr -> RGB conversion in 20-bit fixed point.  Writes
 * count pixels, advancing out by step bytes per pixel; byte 3 of every
 * pixel is set to 255 (alpha). */
static void rjpeg__YCbCr_to_RGB_row(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step)
{
   int n;

   for (n = 0; n < count; ++n)
   {
      int cr_v    = pcr[n] - 128;
      int cb_v    = pcb[n] - 128;
      int y_fixed = (y[n] << 20) + (1 << 19); /* +0.5 for rounding */
      int r       = y_fixed + cr_v *  float2fixed(1.40200f);
      int g       = y_fixed + (cr_v * -float2fixed(0.71414f))
                            + ((cb_v * -float2fixed(0.34414f)) & 0xffff0000);
      int b       = y_fixed + cb_v *  float2fixed(1.77200f);

      r >>= 20;
      g >>= 20;
      b >>= 20;

      /* any value outside [0,255] (negatives included) is forced to 255 */
      if ((unsigned)r > 255)
         r = 255;
      if ((unsigned)g > 255)
         g = 255;
      if ((unsigned)b > 255)
         b = 255;

      out[0] = (uint8_t)r;
      out[1] = (uint8_t)g;
      out[2] = (uint8_t)b;
      out[3] = 255;
      out   += step;
   }
}
2172
#if defined(__SSE2__) || defined(RJPEG_NEON)
/* SIMD (SSE2 / NEON) YCbCr -> RGBA conversion, 8 pixels per iteration.
 * Only step == 4 is vectorized; leftover pixels -- and every pixel when
 * step != 4 -- fall through to the scalar loop at the bottom, which uses
 * the same fixed-point math as rjpeg__YCbCr_to_RGB_row. */
static void rjpeg__YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step)
{
   int i = 0;

#if defined(__SSE2__)
   /* step == 3 is pretty ugly on the final interleave, and i'm not convinced
    * it's useful in practice (you wouldn't use it for textures, for example).
    * so just accelerate step == 4 case.
    */
   if (step == 4)
   {
      /* this is a fairly straightforward implementation and not super-optimized. */
      __m128i signflip = _mm_set1_epi8(-0x80);
      __m128i cr_const0 = _mm_set1_epi16( (short) ( 1.40200f*4096.0f+0.5f));
      __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
      __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
      __m128i cb_const1 = _mm_set1_epi16( (short) ( 1.77200f*4096.0f+0.5f));
      __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
      __m128i xw = _mm_set1_epi16(255); /* alpha channel */

      for (; i+7 < count; i += 8)
      {
         /* load */
         __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
         __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
         __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); /* -128 */
         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); /* -128 */

         /* unpack to short (and left-shift cr, cb by 8) */
         __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes);
         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);

         /* color transform */
         __m128i yws = _mm_srli_epi16(yw, 4);
         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
         __m128i rws = _mm_add_epi16(cr0, yws);
         __m128i gwt = _mm_add_epi16(cb0, yws);
         __m128i bws = _mm_add_epi16(yws, cb1);
         __m128i gws = _mm_add_epi16(gwt, cr1);

         /* descale */
         __m128i rw = _mm_srai_epi16(rws, 4);
         __m128i bw = _mm_srai_epi16(bws, 4);
         __m128i gw = _mm_srai_epi16(gws, 4);

         /* back to byte, set up for transpose */
         __m128i brb = _mm_packus_epi16(rw, bw);
         __m128i gxb = _mm_packus_epi16(gw, xw);

         /* transpose to interleave channels */
         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
         __m128i o1 = _mm_unpackhi_epi16(t0, t1);

         /* store */
         _mm_storeu_si128((__m128i *) (out + 0), o0);
         _mm_storeu_si128((__m128i *) (out + 16), o1);
         out += 32;
      }
   }
#endif

#ifdef RJPEG_NEON
   /* in this version, step=3 support would be easy to add. but is there demand? */
   if (step == 4)
   {
      /* this is a fairly straightforward implementation and not super-optimized. */
      uint8x8_t signflip = vdup_n_u8(0x80);
      int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f));
      int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
      int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
      int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f));

      for (; i+7 < count; i += 8)
      {
         uint8x8x4_t o;

         /* load */
         uint8x8_t y_bytes = vld1_u8(y + i);
         uint8x8_t cr_bytes = vld1_u8(pcr + i);
         uint8x8_t cb_bytes = vld1_u8(pcb + i);
         int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
         int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));

         /* expand to s16 */
         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
         int16x8_t crw = vshll_n_s8(cr_biased, 7);
         int16x8_t cbw = vshll_n_s8(cb_biased, 7);

         /* color transform */
         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
         int16x8_t rws = vaddq_s16(yws, cr0);
         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
         int16x8_t bws = vaddq_s16(yws, cb1);

         /* undo scaling, round, convert to byte */
         o.val[0] = vqrshrun_n_s16(rws, 4);
         o.val[1] = vqrshrun_n_s16(gws, 4);
         o.val[2] = vqrshrun_n_s16(bws, 4);
         o.val[3] = vdup_n_u8(255);

         /* store, interleaving r/g/b/a */
         vst4_u8(out, o);
         out += 8*4;
      }
   }
#endif

   /* scalar cleanup for whatever the vector loops did not cover */
   for (; i < count; ++i)
   {
      int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
      int cr = pcr[i] - 128;
      int cb = pcb[i] - 128;
      int r = y_fixed + cr* float2fixed(1.40200f);
      int g = y_fixed + cr*-float2fixed(0.71414f) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
      int b = y_fixed + cb* float2fixed(1.77200f);
      r >>= 20;
      g >>= 20;
      b >>= 20;
      if ((unsigned) r > 255)
         r = 255;
      if ((unsigned) g > 255)
         g = 255;
      if ((unsigned) b > 255)
         b = 255;
      out[0] = (uint8_t)r;
      out[1] = (uint8_t)g;
      out[2] = (uint8_t)b;
      out[3] = 255;
      out += step;
   }
}
#endif
2316
2317 /* set up the kernels */
rjpeg__setup_jpeg(rjpeg__jpeg * j)2318 static void rjpeg__setup_jpeg(rjpeg__jpeg *j)
2319 {
2320 uint64_t mask = cpu_features_get();
2321
2322 (void)mask;
2323
2324 j->idct_block_kernel = rjpeg__idct_block;
2325 j->YCbCr_to_RGB_kernel = rjpeg__YCbCr_to_RGB_row;
2326 j->resample_row_hv_2_kernel = rjpeg__resample_row_hv_2;
2327
2328
2329 #if defined(__SSE2__)
2330 if (mask & RETRO_SIMD_SSE2)
2331 {
2332 j->idct_block_kernel = rjpeg__idct_simd;
2333 j->YCbCr_to_RGB_kernel = rjpeg__YCbCr_to_RGB_simd;
2334 j->resample_row_hv_2_kernel = rjpeg__resample_row_hv_2_simd;
2335 }
2336 #endif
2337
2338 #ifdef RJPEG_NEON
2339 j->idct_block_kernel = rjpeg__idct_simd;
2340 j->YCbCr_to_RGB_kernel = rjpeg__YCbCr_to_RGB_simd;
2341 j->resample_row_hv_2_kernel = rjpeg__resample_row_hv_2_simd;
2342 #endif
2343 }
2344
2345 /* clean up the temporary component buffers */
rjpeg__cleanup_jpeg(rjpeg__jpeg * j)2346 static void rjpeg__cleanup_jpeg(rjpeg__jpeg *j)
2347 {
2348 int i;
2349 for (i=0; i < j->s->img_n; ++i)
2350 {
2351 if (j->img_comp[i].raw_data)
2352 {
2353 free(j->img_comp[i].raw_data);
2354 j->img_comp[i].raw_data = NULL;
2355 j->img_comp[i].data = NULL;
2356 }
2357
2358 if (j->img_comp[i].raw_coeff)
2359 {
2360 free(j->img_comp[i].raw_coeff);
2361 j->img_comp[i].raw_coeff = 0;
2362 j->img_comp[i].coeff = 0;
2363 }
2364
2365 if (j->img_comp[i].linebuf)
2366 {
2367 free(j->img_comp[i].linebuf);
2368 j->img_comp[i].linebuf = NULL;
2369 }
2370 }
2371 }
2372
/* Decode the JPEG held by z->s into an interleaved 8-bit buffer.
 *
 * req_comp selects the number of output components (1..4); 0 keeps the
 * source component count.  On success returns a malloc'd buffer the
 * caller must free, stores the dimensions in *out_x / *out_y, and (if
 * comp is non-NULL) the ORIGINAL component count in *comp.  Returns NULL
 * on decode error or allocation failure. */
static uint8_t *rjpeg_load_jpeg_image(rjpeg__jpeg *z, unsigned *out_x, unsigned *out_y, int *comp, int req_comp)
{
   int n, decode_n;
   int k;
   unsigned int i,j;
   rjpeg__resample res_comp[4];
   uint8_t *coutput[4] = {0};
   uint8_t *output = NULL;
   z->s->img_n = 0; /* make rjpeg__cleanup_jpeg safe */

   /* validate req_comp */
   if (req_comp < 0 || req_comp > 4)
      return rjpeg__errpuc("bad req_comp", "Internal error");

   /* load a jpeg image from whichever source, but leave in YCbCr format */
   if (!rjpeg__decode_jpeg_image(z))
      goto error;

   /* determine actual number of components to generate */
   n = req_comp ? req_comp : z->s->img_n;

   /* when a 3-component image is reduced to grey output, only the luma
    * plane (component 0) needs resampling */
   if (z->s->img_n == 3 && n < 3)
      decode_n = 1;
   else
      decode_n = z->s->img_n;

   /* resample and color-convert */
   for (k=0; k < decode_n; ++k)
   {
      rjpeg__resample *r = &res_comp[k];

      /* allocate line buffer big enough for upsampling off the edges
       * with upsample factor of 4 */
      z->img_comp[k].linebuf = (uint8_t *) malloc(z->s->img_x + 3);
      if (!z->img_comp[k].linebuf)
         goto error;

      /* hs/vs: horizontal/vertical subsampling ratios of this component */
      r->hs = z->img_h_max / z->img_comp[k].h;
      r->vs = z->img_v_max / z->img_comp[k].v;
      r->ystep = r->vs >> 1;
      r->w_lores = (z->s->img_x + r->hs-1) / r->hs;
      r->ypos = 0;
      r->line0 = r->line1 = z->img_comp[k].data;
      r->resample = rjpeg__resample_row_generic;

      /* pick a specialized resampler when the ratios allow it */
      if (r->hs == 1 && r->vs == 1)
         r->resample = rjpeg_resample_row_1;
      else if (r->hs == 1 && r->vs == 2)
         r->resample = rjpeg__resample_row_v_2;
      else if (r->hs == 2 && r->vs == 1)
         r->resample = rjpeg__resample_row_h_2;
      else if (r->hs == 2 && r->vs == 2)
         r->resample = z->resample_row_hv_2_kernel;
   }

   /* allocate the final output buffer: n bytes per pixel plus one byte
    * of slack.
    * NOTE(review): n * img_x * img_y is computed in int/unsigned
    * arithmetic and could overflow for very large images -- confirm the
    * decoder enforces dimension limits before relying on this. */
   output = (uint8_t *) malloc(n * z->s->img_x * z->s->img_y + 1);

   if (!output)
      goto error;

   /* now go ahead and resample */
   for (j=0; j < z->s->img_y; ++j)
   {
      uint8_t *out = output + n * z->s->img_x * j;
      for (k=0; k < decode_n; ++k)
      {
         rjpeg__resample *r = &res_comp[k];
         /* which of line0/line1 is vertically nearer to this output row */
         int y_bot = r->ystep >= (r->vs >> 1);

         coutput[k] = r->resample(z->img_comp[k].linebuf,
               y_bot ? r->line1 : r->line0,
               y_bot ? r->line0 : r->line1,
               r->w_lores, r->hs);

         /* step to the next low-res input row once all of its output
          * rows have been produced */
         if (++r->ystep >= r->vs)
         {
            r->ystep = 0;
            r->line0 = r->line1;
            if (++r->ypos < z->img_comp[k].y)
               r->line1 += z->img_comp[k].w2;
         }
      }

      if (n >= 3)
      {
         uint8_t *y = coutput[0];
         if (z->s->img_n == 3)
            z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
         else
            /* greyscale source expanded to RGB(A) */
            for (i=0; i < z->s->img_x; ++i)
            {
               out[0] = out[1] = out[2] = y[i];
               out[3] = 255; /* not used if n==3 */
               out += n;
            }
      }
      else
      {
         /* grey (n==1) or grey + constant alpha (n==2) output */
         uint8_t *y = coutput[0];
         if (n == 1)
            for (i=0; i < z->s->img_x; ++i)
               out[i] = y[i];
         else
            for (i=0; i < z->s->img_x; ++i)
               *out++ = y[i], *out++ = 255;
      }
   }

   rjpeg__cleanup_jpeg(z);
   *out_x = z->s->img_x;
   *out_y = z->s->img_y;

   if (comp)
      *comp = z->s->img_n; /* report original components, not output */
   return output;

error:
   rjpeg__cleanup_jpeg(z);
   return NULL;
}
2494
/* Convenience entry point: build a decoder state around the context,
 * install the (possibly SIMD) kernels, then run the full decode. */
static unsigned char *rjpeg__jpeg_load(rjpeg__context *s, unsigned *x, unsigned *y, int *comp, int req_comp)
{
   rjpeg__jpeg jpeg_ctx;

   jpeg_ctx.s = s;
   rjpeg__setup_jpeg(&jpeg_ctx);

   return rjpeg_load_jpeg_image(&jpeg_ctx, x, y, comp, req_comp);
}
2502
/* Decode the JPEG previously attached via rjpeg_set_buf_ptr() into a
 * newly allocated ARGB8888 buffer stored in *buf_data (caller frees).
 * *width / *height receive the image dimensions.
 *
 * Returns IMAGE_PROCESS_END on success, IMAGE_PROCESS_ERROR on failure.
 *
 * Fixes: removed a stray empty statement after the conversion loop
 * (`};`) and a redundant `(unsigned int*)` cast of the already-typed
 * uint32_t pixel buffer. */
int rjpeg_process_image(rjpeg_t *rjpeg, void **buf_data,
      size_t size, unsigned *width, unsigned *height)
{
   int comp;
   uint32_t *img      = NULL;
   uint32_t *pixels   = NULL;
   unsigned size_tex  = 0;

   if (!rjpeg)
      return IMAGE_PROCESS_ERROR;

   /* decode to 8-bit RGBA (4 components requested) */
   img = (uint32_t*)rjpeg_load_from_memory(rjpeg->buff_data, size, width, height, &comp, 4);

   if (!img)
      return IMAGE_PROCESS_ERROR;

   size_tex = (*width) * (*height);
   pixels   = (uint32_t*)malloc(size_tex * sizeof(uint32_t));

   if (!pixels)
   {
      free(img);
      return IMAGE_PROCESS_ERROR;
   }

   *buf_data = pixels;

   /* Convert RGBA to ARGB: swap the R and B channels, keep A and G */
   while (size_tex--)
   {
      unsigned int texel = img[size_tex];
      unsigned int A     = texel & 0xFF000000;
      unsigned int B     = texel & 0x00FF0000;
      unsigned int G     = texel & 0x0000FF00;
      unsigned int R     = texel & 0x000000FF;
      pixels[size_tex]   = A | (R << 16) | G | (B >> 16);
   }

   free(img);

   return IMAGE_PROCESS_END;
}
2545
/* Point the decoder at caller-owned input memory; rjpeg does not take
 * ownership of the buffer (rjpeg_free() leaves it untouched).
 * Returns false only when rjpeg is NULL. */
bool rjpeg_set_buf_ptr(rjpeg_t *rjpeg, void *data)
{
   if (rjpeg)
   {
      rjpeg->buff_data = (uint8_t*)data;
      return true;
   }
   return false;
}
2555
/* Release a decoder handle allocated by rjpeg_alloc().
 * free(NULL) is a no-op per the C standard, so the previous NULL guard
 * was redundant and has been removed.  Does not free the input buffer
 * set via rjpeg_set_buf_ptr() -- that memory belongs to the caller. */
void rjpeg_free(rjpeg_t *rjpeg)
{
   free(rjpeg);
}
2563
rjpeg_alloc(void)2564 rjpeg_t *rjpeg_alloc(void)
2565 {
2566 rjpeg_t *rjpeg = (rjpeg_t*)calloc(1, sizeof(*rjpeg));
2567 if (!rjpeg)
2568 return NULL;
2569 return rjpeg;
2570 }
2571