1 /* Copyright (C) 2010-2018 The RetroArch team
2 *
3 * ---------------------------------------------------------------------------------------
4 * The following license statement only applies to this file (rjpeg.c).
5 * ---------------------------------------------------------------------------------------
6 *
7 * Permission is hereby granted, free of charge,
8 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation the rights to
10 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23 /* Modified version of stb_image's JPEG sources. */
24
25 #include <stdint.h>
26 #include <stdarg.h>
27 #include <stddef.h> /* ptrdiff_t on osx */
28 #include <stdlib.h>
29 #include <string.h>
30
31 #include <retro_assert.h>
32 #include <retro_inline.h>
33 #include <boolean.h>
34 #include <formats/image.h>
35 #include <formats/rjpeg.h>
36 #include <features/features_cpu.h>
37
/* Number of output components requested by the caller (req_comp);
 * RJPEG_DEFAULT means "whatever the file contains". */
enum
{
   RJPEG_DEFAULT = 0, /* only used for req_comp */
   RJPEG_GREY,
   RJPEG_GREY_ALPHA,
   RJPEG_RGB,
   RJPEG_RGB_ALPHA
};
46
/* What the decoder is asked to do with the stream:
 * fully decode, probe the file type, or parse only the header. */
enum
{
   RJPEG_SCAN_LOAD = 0,
   RJPEG_SCAN_TYPE,
   RJPEG_SCAN_HEADER
};
53
/* Row resampler: expands one chroma row to full horizontal resolution.
 * in0/in1 are the two nearest source rows, w is the low-res width,
 * hs the horizontal expansion factor. */
typedef uint8_t *(*rjpeg_resample_row_func)(uint8_t *out, uint8_t *in0, uint8_t *in1,
      int w, int hs);

/* Per-component upsampling state used while emitting output scanlines. */
typedef struct
{
   rjpeg_resample_row_func resample;
   uint8_t *line0;  /* two input scanlines currently being interpolated */
   uint8_t *line1;
   int hs,vs;       /* expansion factor in each axis */
   int w_lores;     /* horizontal pixels pre-expansion */
   int ystep;       /* how far through vertical expansion we are */
   int ypos;        /* which pre-expansion row we're on */
} rjpeg__resample;
67
/* Public decoder handle; owns the input file buffer. */
struct rjpeg
{
   uint8_t *buff_data;
};
72
73 #ifdef _MSC_VER
74 #define RJPEG_HAS_LROTL
75 #endif
76
77 #ifdef RJPEG_HAS_LROTL
78 #define rjpeg_lrot(x,y) _lrotl(x,y)
79 #else
80 #define rjpeg_lrot(x,y) (((x) << (y)) | ((x) >> (32 - (y))))
81 #endif
82
83 /* x86/x64 detection */
84 #if defined(__x86_64__) || defined(_M_X64)
85 #define RJPEG__X64_TARGET
86 #elif defined(__i386) || defined(_M_IX86)
87 #define RJPEG__X86_TARGET
88 #endif
89
90 #if defined(__GNUC__) && (defined(RJPEG__X86_TARGET) || defined(RJPEG__X64_TARGET)) && !defined(__SSE2__) && !defined(RJPEG_NO_SIMD)
91 /* NOTE: not clear do we actually need this for the 64-bit path?
92 * gcc doesn't support sse2 intrinsics unless you compile with -msse2,
93 * (but compiling with -msse2 allows the compiler to use SSE2 everywhere;
94 * this is just broken and gcc are jerks for not fixing it properly
95 * http://www.virtualdub.org/blog/pivot/entry.php?id=363 )
96 */
97 #define RJPEG_NO_SIMD
98 #endif
99
100 #if defined(__MINGW32__) && defined(RJPEG__X86_TARGET) && !defined(RJPEG_MINGW_ENABLE_SSE2) && !defined(RJPEG_NO_SIMD)
101 /* Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid RJPEG__X64_TARGET
102 *
103 * 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
104 * Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
105 * As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
106 * simultaneously enabling "-mstackrealign".
107 *
108 * See https://github.com/nothings/stb/issues/81 for more information.
109 *
110 * So default to no SSE2 on 32-bit MinGW. If you've read this far and added
111 * -mstackrealign to your build settings, feel free to #define RJPEG_MINGW_ENABLE_SSE2.
112 */
113 #define RJPEG_NO_SIMD
114 #endif
115
116 #if defined(__SSE2__)
117 #include <emmintrin.h>
118
119 #ifdef _MSC_VER
120 #define RJPEG_SIMD_ALIGN(type, name) __declspec(align(16)) type name
121 #else
122 #define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
123 #endif
124
125 #endif
126
127 /* ARM NEON */
128 #if defined(RJPEG_NO_SIMD) && defined(RJPEG_NEON)
129 #undef RJPEG_NEON
130 #endif
131
132 #ifdef RJPEG_NEON
133 #include <arm_neon.h>
134 /* assume GCC or Clang on ARM targets */
135 #define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
136 #endif
137
138 #ifndef RJPEG_SIMD_ALIGN
139 #define RJPEG_SIMD_ALIGN(type, name) type name
140 #endif
141
/* Input stream context: image geometry plus a read cursor over the
 * in-memory JPEG data. */
typedef struct
{
   uint32_t img_x;  /* image width in pixels */
   uint32_t img_y;  /* image height in pixels */
   int img_n;       /* number of components in the file */
   int img_out_n;   /* number of components produced on output */

   int buflen;
   uint8_t buffer_start[128];

   uint8_t *img_buffer;          /* current read position */
   uint8_t *img_buffer_end;      /* one past the last valid byte */
   uint8_t *img_buffer_original; /* start of the buffer, kept for rewind */
} rjpeg__context;
156
rjpeg__get8(rjpeg__context * s)157 static INLINE uint8_t rjpeg__get8(rjpeg__context *s)
158 {
159 if (s->img_buffer < s->img_buffer_end)
160 return *s->img_buffer++;
161
162 return 0;
163 }
164
165 #define RJPEG__AT_EOF(s) ((s)->img_buffer >= (s)->img_buffer_end)
166
167 #define RJPEG__GET16BE(s) ((rjpeg__get8((s)) << 8) + rjpeg__get8((s)))
168
169 #define RJPEG__BYTECAST(x) ((uint8_t) ((x) & 255)) /* truncate int to byte without warnings */
170
171 /* huffman decoding acceleration */
172 #define FAST_BITS 9 /* larger handles more cases; smaller stomps less cache */
173
/* Canonical Huffman table plus a FAST_BITS-wide acceleration lookup. */
typedef struct
{
   /* fast[p] = symbol index for bit pattern p, or 255 if the code is
    * longer than FAST_BITS and must be decoded the slow way */
   uint8_t fast[1 << FAST_BITS];
   /* weirdly, repacking this into AoS is a 10% speed loss, instead of a win */
   uint16_t code[256];       /* canonical code per symbol */
   uint8_t values[256];      /* decoded symbol values */
   uint8_t size[257];        /* code length per symbol, 0-terminated */
   unsigned int maxcode[18]; /* (largest code + 1) per length, preshifted to 16 bits */
   int delta[17]; /* old 'firstsymbol' - old 'firstcode' */
} rjpeg__huffman;
184
/* Full JPEG decoder state: entropy tables, per-component layout,
 * bit reservoir and the pluggable (scalar/SIMD) kernels. */
typedef struct
{
   rjpeg__context *s;
   rjpeg__huffman huff_dc[4];
   rjpeg__huffman huff_ac[4];
   uint8_t dequant[4][64];
   int16_t fast_ac[4][1 << FAST_BITS];

   /* sizes for components, interleaved MCUs */
   int img_h_max, img_v_max;
   int img_mcu_x, img_mcu_y;
   int img_mcu_w, img_mcu_h;

   /* definition of jpeg image component */
   struct
   {
      int id;
      int h,v;        /* sampling factors */
      int tq;         /* quantization table index */
      int hd,ha;      /* DC/AC huffman table indices */
      int dc_pred;    /* DC predictor for differential coding */

      int x,y,w2,h2;  /* logical size and padded (MCU-rounded) size */
      uint8_t *data;
      void *raw_data, *raw_coeff;
      uint8_t *linebuf;
      short *coeff;   /* progressive only */
      int coeff_w;    /* number of 8x8 coefficient blocks */
      int coeff_h;    /* number of 8x8 coefficient blocks */
   } img_comp[4];

   uint32_t code_buffer; /* jpeg entropy-coded buffer */
   int code_bits;        /* number of valid bits */
   unsigned char marker; /* marker seen while filling entropy buffer */
   int nomore;           /* flag if we saw a marker so must stop */

   int progressive;
   int spec_start;
   int spec_end;
   int succ_high;
   int succ_low;
   int eob_run;

   int scan_n, order[4];
   int restart_interval, todo;

   /* kernels */
   void (*idct_block_kernel)(uint8_t *out, int out_stride, short data[64]);
   void (*YCbCr_to_RGB_kernel)(uint8_t *out, const uint8_t *y, const uint8_t *pcb,
         const uint8_t *pcr, int count, int step);
   uint8_t *(*resample_row_hv_2_kernel)(uint8_t *out, uint8_t *in_near,
         uint8_t *in_far, int w, int hs);
} rjpeg__jpeg;
238
239 #define rjpeg__f2f(x) ((int) (((x) * 4096 + 0.5)))
240 #define rjpeg__fsh(x) ((x) << 12)
241
242 #define RJPEG__MARKER_NONE 0xff
243 /* if there's a pending marker from the entropy stream, return that
244 * otherwise, fetch from the stream and get a marker. if there's no
245 * marker, return 0xff, which is never a valid marker value
246 */
247
248 /* in each scan, we'll have scan_n components, and the order
249 * of the components is specified by order[]
250 */
251 #define RJPEG__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7)
252
253 #define JPEG_MARKER 0xFF
254 #define JPEG_MARKER_SOI 0xD8
255 #define JPEG_MARKER_SOS 0xDA
256 #define JPEG_MARKER_EOI 0xD9
257 #define JPEG_MARKER_APP1 0xE1
258 #define JPEG_MARKER_APP2 0xE2
259
260 /* use comparisons since in some cases we handle more than one case (e.g. SOF) */
261 #define rjpeg__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
262
263 #define rjpeg__SOF_progressive(x) ((x) == 0xc2)
264 #define rjpeg__div4(x) ((uint8_t) ((x) >> 2))
265 #define rjpeg__div16(x) ((uint8_t) ((x) >> 4))
266
/* Build the canonical Huffman decoding tables from the 16 code-length
 * counts of a DHT segment, plus the non-spec 'fast' lookup table for
 * codes no longer than FAST_BITS.
 * Returns 1 on success, 0 on corrupt input (impossible code lengths). */
static int rjpeg__build_huffman(rjpeg__huffman *h, int *count)
{
   int i,j,k=0,code;

   /* build size list for each symbol (from JPEG spec) */
   for (i=0; i < 16; ++i)
      for (j=0; j < count[i]; ++j)
         h->size[k++] = (uint8_t) (i+1);

   h->size[k] = 0;
   /* compute actual symbols (from jpeg spec) */
   code = 0;
   k = 0;

   for(j=1; j <= 16; ++j)
   {
      /* compute delta to add to code to compute symbol id */
      h->delta[j] = k - code;
      if (h->size[k] == j)
      {
         while (h->size[k] == j)
            h->code[k++] = (uint16_t) (code++);

         /* Bad code lengths, corrupt JPEG? */
         if (code-1 >= (1 << j))
            return 0;
      }
      /* compute largest code + 1 for this size, preshifted as needed later */
      h->maxcode[j] = code << (16-j);
      code <<= 1;
   }
   /* j == 17 here; sentinel terminates the decode search loop */
   h->maxcode[j] = 0xffffffff;

   /* build non-spec acceleration table; 255 is flag for not-accelerated */
   memset(h->fast, 255, 1 << FAST_BITS);
   for (i=0; i < k; ++i)
   {
      int s = h->size[i];
      if (s <= FAST_BITS)
      {
         /* every FAST_BITS pattern whose prefix is this code maps to symbol i */
         int c = h->code[i] << (FAST_BITS-s);
         int m = 1 << (FAST_BITS-s);
         for (j=0; j < m; ++j)
            h->fast[c+j] = (uint8_t) i;
      }
   }
   return 1;
}
315
316 /* build a table that decodes both magnitude and value of small ACs in
317 * one go. */
/* Build a table that decodes both magnitude and value of small ACs in
 * one go: for each FAST_BITS-wide bit pattern i, precompute the decoded
 * coefficient (high byte), run length (bits 4-7) and total bit length
 * consumed (bits 0-3); 0 means "not decodable via the fast path". */
static void rjpeg__build_fast_ac(int16_t *fast_ac, rjpeg__huffman *h)
{
   int i;

   for (i=0; i < (1 << FAST_BITS); ++i)
   {
      uint8_t fast = h->fast[i];

      fast_ac[i] = 0;

      if (fast < 255)
      {
         int rs      = h->values[fast];
         int run     = (rs >> 4) & 15;
         int magbits = rs & 15;
         int len     = h->size[fast];

         if (magbits && len + magbits <= FAST_BITS)
         {
            /* magnitude code followed by receive_extend code */
            int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
            int m = 1 << (magbits - 1);

            /* Was '(-1 << magbits) + 1': left-shifting a negative value
             * is undefined behavior. Shift unsigned instead (matches
             * upstream stb_image). */
            if (k < m)
               k += (int)(~0U << magbits) + 1;

            /* if the result is small enough, we can fit it in fast_ac table.
             * Use 'k * 256' rather than 'k << 8': k may be negative here
             * and left-shifting a negative value is undefined behavior. */
            if (k >= -128 && k <= 127)
               fast_ac[i] = (int16_t) ((k * 256) + (run * 16) + (len + magbits));
         }
      }
   }
}
350
/* Refill the 32-bit entropy bit reservoir until at least 25 bits are
 * valid. Stuffed zero bytes (FF 00) are unescaped; a real marker stops
 * refilling and is latched into j->marker / j->nomore. */
static void rjpeg__grow_buffer_unsafe(rjpeg__jpeg *j)
{
   do
   {
      int b = j->nomore ? 0 : rjpeg__get8(j->s);
      if (b == 0xff)
      {
         int c = rjpeg__get8(j->s);

         /* JPEG permits any number of 0xFF fill bytes before a marker
          * id; consume them (upstream stb_image fix). */
         while (c == 0xff)
            c = rjpeg__get8(j->s);

         if (c != 0)
         {
            /* real marker: remember it and stop consuming input */
            j->marker = (unsigned char) c;
            j->nomore = 1;
            return;
         }
         /* FF 00 is a stuffed data byte 0xFF; fall through with b == 0xff */
      }
      j->code_buffer |= b << (24 - j->code_bits);
      j->code_bits += 8;
   } while (j->code_bits <= 24);
}
371
/* rjpeg__bmask[n] == (1 << n) - 1: mask selecting the n lowest bits */
static uint32_t rjpeg__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
374
375 /* decode a JPEG huffman value from the bitstream */
/* Decode a JPEG huffman value from the bitstream.
 * Tries the FAST_BITS acceleration table first, then falls back to the
 * canonical per-length search. Returns the symbol value, or -1 on a
 * bad/underflowing code. */
static INLINE int rjpeg__jpeg_huff_decode(rjpeg__jpeg *j, rjpeg__huffman *h)
{
   unsigned int temp;
   int c,k;

   if (j->code_bits < 16)
      rjpeg__grow_buffer_unsafe(j);

   /* look at the top FAST_BITS and determine what symbol ID it is,
    * if the code is <= FAST_BITS */
   c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
   k = h->fast[c];

   if (k < 255)
   {
      /* fast path hit: consume the code length and return the value */
      int s = h->size[k];
      if (s > j->code_bits)
         return -1;
      j->code_buffer <<= s;
      j->code_bits -= s;
      return h->values[k];
   }

   /* naive test is to shift the code_buffer down so k bits are
    * valid, then test against maxcode. To speed this up, we've
    * preshifted maxcode left so that it has (16-k) 0s at the
    * end; in other words, regardless of the number of bits, it
    * wants to be compared against something shifted to have 16;
    * that way we don't need to shift inside the loop. */
   temp = j->code_buffer >> 16;
   for (k=FAST_BITS+1 ; ; ++k)
      if (temp < h->maxcode[k])
         break;

   if (k == 17)
   {
      /* error! code not found */
      j->code_bits -= 16;
      return -1;
   }

   if (k > j->code_bits)
      return -1;

   /* convert the huffman code to the symbol id */
   c = ((j->code_buffer >> (32 - k)) & rjpeg__bmask[k]) + h->delta[k];
   retro_assert((((j->code_buffer) >> (32 - h->size[c])) & rjpeg__bmask[h->size[c]]) == h->code[c]);

   /* convert the id to a symbol */
   j->code_bits -= k;
   j->code_buffer <<= k;
   return h->values[c];
}
429
/* rjpeg__jbias[n] == -(2^n) + 1: bias added when the received value's
 * sign bit is clear (JPEG 'extend' step) */
static int const rjpeg__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
432
433 /* combined JPEG 'receive' and JPEG 'extend', since baseline
434 * always extends everything it receives. */
rjpeg__extend_receive(rjpeg__jpeg * j,int n)435 static INLINE int rjpeg__extend_receive(rjpeg__jpeg *j, int n)
436 {
437 unsigned int k;
438 int sgn;
439 if (j->code_bits < n)
440 rjpeg__grow_buffer_unsafe(j);
441
442 sgn = (int32_t)j->code_buffer >> 31; /* sign bit is always in MSB */
443 k = rjpeg_lrot(j->code_buffer, n);
444 retro_assert(n >= 0 && n < (int) (sizeof(rjpeg__bmask)/sizeof(*rjpeg__bmask)));
445 j->code_buffer = k & ~rjpeg__bmask[n];
446 k &= rjpeg__bmask[n];
447 j->code_bits -= n;
448 return k + (rjpeg__jbias[n] & ~sgn);
449 }
450
451 /* get some unsigned bits */
rjpeg__jpeg_get_bits(rjpeg__jpeg * j,int n)452 static INLINE int rjpeg__jpeg_get_bits(rjpeg__jpeg *j, int n)
453 {
454 unsigned int k;
455 if (j->code_bits < n)
456 rjpeg__grow_buffer_unsafe(j);
457 k = rjpeg_lrot(j->code_buffer, n);
458 j->code_buffer = k & ~rjpeg__bmask[n];
459 k &= rjpeg__bmask[n];
460 j->code_bits -= n;
461 return k;
462 }
463
rjpeg__jpeg_get_bit(rjpeg__jpeg * j)464 static INLINE int rjpeg__jpeg_get_bit(rjpeg__jpeg *j)
465 {
466 unsigned int k;
467 if (j->code_bits < 1)
468 rjpeg__grow_buffer_unsafe(j);
469
470 k = j->code_buffer;
471 j->code_buffer <<= 1;
472 --j->code_bits;
473 return k & 0x80000000;
474 }
475
/* given a value that's at position X in the zigzag stream,
 * where does it appear in the 8x8 matrix coded as row-major?
 * The table is over-allocated by 15 entries of 63 so that runs decoded
 * from corrupt input can index past 63 without going out of bounds. */
static uint8_t rjpeg__jpeg_dezigzag[64+15] =
{
    0,  1,  8, 16,  9,  2,  3, 10,
   17, 24, 32, 25, 18, 11,  4,  5,
   12, 19, 26, 33, 40, 48, 41, 34,
   27, 20, 13,  6,  7, 14, 21, 28,
   35, 42, 49, 56, 57, 50, 43, 36,
   29, 22, 15, 23, 30, 37, 44, 51,
   58, 59, 52, 45, 38, 31, 39, 46,
   53, 60, 61, 54, 47, 55, 62, 63,
   /* let corrupt input sample past end */
   63, 63, 63, 63, 63, 63, 63, 63,
   63, 63, 63, 63, 63, 63, 63
};
492
493 /* decode one 64-entry block-- */
/* Decode one 64-entry baseline block: DC coefficient (differentially
 * predicted per component b) followed by run-length coded AC
 * coefficients, dequantized and written in de-zigzagged order.
 * Returns 1 on success, 0 on a bad huffman code (corrupt JPEG). */
static int rjpeg__jpeg_decode_block(
      rjpeg__jpeg *j, short data[64],
      rjpeg__huffman *hdc,
      rjpeg__huffman *hac,
      int16_t *fac,
      int b,
      uint8_t *dequant)
{
   int diff,dc,k;
   int t;

   if (j->code_bits < 16)
      rjpeg__grow_buffer_unsafe(j);
   t = rjpeg__jpeg_huff_decode(j, hdc);

   /* Bad huffman code. Corrupt JPEG? */
   if (t < 0)
      return 0;

   /* 0 all the ac values now so we can do it 32-bits at a time */
   memset(data,0,64*sizeof(data[0]));

   /* DC: magnitude category t, then t extend bits, added to predictor */
   diff = t ? rjpeg__extend_receive(j, t) : 0;
   dc = j->img_comp[b].dc_pred + diff;
   j->img_comp[b].dc_pred = dc;
   data[0] = (short) (dc * dequant[0]);

   /* decode AC components, see JPEG spec */
   k = 1;
   do
   {
      unsigned int zig;
      int c,r,s;
      if (j->code_bits < 16)
         rjpeg__grow_buffer_unsafe(j);
      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
      r = fac[c];
      if (r)
      {
         /* fast-AC path: run, value and bit length were precomputed */
         k += (r >> 4) & 15; /* run */
         s = r & 15; /* combined length */
         j->code_buffer <<= s;
         j->code_bits -= s;
         /* decode into unzigzag'd location */
         zig = rjpeg__jpeg_dezigzag[k++];
         data[zig] = (short) ((r >> 8) * dequant[zig]);
      }
      else
      {
         int rs = rjpeg__jpeg_huff_decode(j, hac);

         /* Bad huffman code. Corrupt JPEG? */
         if (rs < 0)
            return 0;

         s = rs & 15;
         r = rs >> 4;
         if (s == 0)
         {
            /* 0x00 is EOB; 0xF0 (ZRL) is a run of sixteen zeros */
            if (rs != 0xf0)
               break; /* end block */
            k += 16;
         }
         else
         {
            k += r;
            /* decode into unzigzag'd location */
            zig = rjpeg__jpeg_dezigzag[k++];
            data[zig] = (short) (rjpeg__extend_receive(j,s) * dequant[zig]);
         }
      }
   } while (k < 64);
   return 1;
}
569
/* Decode the DC coefficient of one block in a progressive scan:
 * either the first (magnitude) scan or a refinement (one-bit) scan.
 * Returns 1 on success, 0 on corrupt input. */
static int rjpeg__jpeg_decode_block_prog_dc(
      rjpeg__jpeg *j,
      short data[64],
      rjpeg__huffman *hdc,
      int b)
{
   /* Can't merge DC and AC. Corrupt JPEG? */
   if (j->spec_end != 0)
      return 0;

   if (j->code_bits < 16)
      rjpeg__grow_buffer_unsafe(j);

   if (j->succ_high == 0)
   {
      int t;
      int diff,dc;

      /* first scan for DC coefficient, must be first */
      memset(data,0,64*sizeof(data[0])); /* 0 all the ac values now */
      t = rjpeg__jpeg_huff_decode(j, hdc);

      /* Bad huffman code or out-of-range DC magnitude category.
       * Previously a -1 from huff_decode was passed straight to
       * rjpeg__extend_receive (undefined behavior); the baseline
       * decoder checks this, so check here too (upstream stb fix). */
      if (t < 0 || t > 15)
         return 0;

      diff = t ? rjpeg__extend_receive(j, t) : 0;

      dc = j->img_comp[b].dc_pred + diff;
      j->img_comp[b].dc_pred = dc;
      data[0] = (short) (dc << j->succ_low);
   }
   else
   {
      /* refinement scan for DC coefficient: one correction bit */
      if (rjpeg__jpeg_get_bit(j))
         data[0] += (short) (1 << j->succ_low);
   }
   return 1;
}
605
/* Decode the AC coefficients of one block in a progressive scan, for
 * the spectral band [spec_start, spec_end]: either the first
 * (magnitude) scan, including end-of-band run handling, or a refinement
 * scan that adds one correction bit per already-nonzero coefficient.
 * Returns 1 on success, 0 on corrupt input. */
static int rjpeg__jpeg_decode_block_prog_ac(
      rjpeg__jpeg *j,
      short data[64],
      rjpeg__huffman *hac,
      int16_t *fac)
{
   int k;

   /* Can't merge DC and AC. Corrupt JPEG? */
   if (j->spec_start == 0)
      return 0;

   if (j->succ_high == 0)
   {
      /* first scan: coefficients arrive shifted left by succ_low */
      int shift = j->succ_low;

      /* inside an end-of-band run: the whole band is zero */
      if (j->eob_run)
      {
         --j->eob_run;
         return 1;
      }

      k = j->spec_start;
      do
      {
         unsigned int zig;
         int c,r,s;
         if (j->code_bits < 16) rjpeg__grow_buffer_unsafe(j);
         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
         r = fac[c];
         if (r)
         { /* fast-AC path */
            k += (r >> 4) & 15; /* run */
            s = r & 15; /* combined length */
            j->code_buffer <<= s;
            j->code_bits -= s;
            zig = rjpeg__jpeg_dezigzag[k++];
            data[zig] = (short) ((r >> 8) << shift);
         }
         else
         {
            int rs = rjpeg__jpeg_huff_decode(j, hac);

            /* Bad huffman code. Corrupt JPEG? */
            if (rs < 0)
               return 0;

            s = rs & 15;
            r = rs >> 4;
            if (s == 0)
            {
               if (r < 15)
               {
                  /* EOBn: run of (1 << r) + extra all-zero bands */
                  j->eob_run = (1 << r);
                  if (r)
                     j->eob_run += rjpeg__jpeg_get_bits(j, r);
                  --j->eob_run;
                  break;
               }
               /* ZRL: skip sixteen zero coefficients */
               k += 16;
            }
            else
            {
               k += r;
               zig = rjpeg__jpeg_dezigzag[k++];
               data[zig] = (short) (rjpeg__extend_receive(j,s) << shift);
            }
         }
      } while (k <= j->spec_end);
   }
   else
   {
      /* refinement scan for these AC coefficients */

      short bit = (short) (1 << j->succ_low);

      if (j->eob_run)
      {
         /* end-of-band run: only existing nonzero coefficients get a
          * correction bit; no new coefficients appear */
         --j->eob_run;
         for (k = j->spec_start; k <= j->spec_end; ++k)
         {
            short *p = &data[rjpeg__jpeg_dezigzag[k]];
            if (*p != 0)
               if (rjpeg__jpeg_get_bit(j))
                  if ((*p & bit)==0)
                  {
                     /* move the coefficient away from zero by 'bit' */
                     if (*p > 0)
                        *p += bit;
                     else
                        *p -= bit;
                  }
         }
      }
      else
      {
         k = j->spec_start;
         do
         {
            int r,s;
            int rs = rjpeg__jpeg_huff_decode(j, hac);

            /* Bad huffman code. Corrupt JPEG? */
            if (rs < 0)
               return 0;

            s = rs & 15;
            r = rs >> 4;
            if (s == 0)
            {
               if (r < 15)
               {
                  /* EOBn: start an end-of-band run covering this band */
                  j->eob_run = (1 << r) - 1;
                  if (r)
                     j->eob_run += rjpeg__jpeg_get_bits(j, r);
                  r = 64; /* force end of block */
               }
               else
               {
                  /* r=15 s=0 should write 16 0s, so we just do
                   * a run of 15 0s and then write s (which is 0),
                   * so we don't have to do anything special here */
               }
            }
            else
            {
               /* Bad huffman code. Corrupt JPEG? */
               if (s != 1)
                  return 0;

               /* sign bit */
               if (rjpeg__jpeg_get_bit(j))
                  s = bit;
               else
                  s = -bit;
            }

            /* advance by r, emitting correction bits for nonzero
             * coefficients passed along the way */
            while (k <= j->spec_end)
            {
               short *p = &data[rjpeg__jpeg_dezigzag[k++]];
               if (*p != 0)
               {
                  if (rjpeg__jpeg_get_bit(j))
                     if ((*p & bit)==0)
                     {
                        if (*p > 0)
                           *p += bit;
                        else
                           *p -= bit;
                     }
               }
               else
               {
                  if (r == 0)
                  {
                     /* run exhausted: this is the new coefficient */
                     *p = (short) s;
                     break;
                  }
                  --r;
               }
            }
         } while (k <= j->spec_end);
      }
   }
   return 1;
}
772
773 /* take a -128..127 value and rjpeg__clamp it and convert to 0..255 */
rjpeg__clamp(int x)774 static INLINE uint8_t rjpeg__clamp(int x)
775 {
776 /* trick to use a single test to catch both cases */
777 if ((unsigned int) x > 255)
778 return 255;
779 return (uint8_t) x;
780 }
781
782 /* derived from jidctint -- DCT_ISLOW */
/* One 8-point 1-D inverse DCT (AAN-style, fixed point with a 1<<12
 * scale from rjpeg__f2f); leaves results in x0..x3 (even butterfly)
 * and t0..t3 (odd butterfly). Derived from jidctint -- DCT_ISLOW. */
#define RJPEG__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
   p2 = s2;                                    \
   p3 = s6;                                    \
   p1 = (p2+p3) * rjpeg__f2f(0.5411961f);       \
   t2 = p1 + p3*rjpeg__f2f(-1.847759065f);      \
   t3 = p1 + p2*rjpeg__f2f( 0.765366865f);      \
   p2 = s0;                                    \
   p3 = s4;                                    \
   t0 = rjpeg__fsh(p2+p3);                      \
   t1 = rjpeg__fsh(p2-p3);                      \
   x0 = t0+t3;                                 \
   x3 = t0-t3;                                 \
   x1 = t1+t2;                                 \
   x2 = t1-t2;                                 \
   t0 = s7;                                    \
   t1 = s5;                                    \
   t2 = s3;                                    \
   t3 = s1;                                    \
   p3 = t0+t2;                                 \
   p4 = t1+t3;                                 \
   p1 = t0+t3;                                 \
   p2 = t1+t2;                                 \
   p5 = (p3+p4)*rjpeg__f2f( 1.175875602f);      \
   t0 = t0*rjpeg__f2f( 0.298631336f);           \
   t1 = t1*rjpeg__f2f( 2.053119869f);           \
   t2 = t2*rjpeg__f2f( 3.072711026f);           \
   t3 = t3*rjpeg__f2f( 1.501321110f);           \
   p1 = p5 + p1*rjpeg__f2f(-0.899976223f);      \
   p2 = p5 + p2*rjpeg__f2f(-2.562915447f);      \
   p3 = p3*rjpeg__f2f(-1.961570560f);           \
   p4 = p4*rjpeg__f2f(-0.390180644f);           \
   t3 += p1+p4;                                \
   t2 += p2+p3;                                \
   t1 += p2+p4;                                \
   t0 += p1+p3;
819
/* Scalar 8x8 inverse DCT: column pass into 'val' with 2 extra bits of
 * precision, then row pass writing clamped 0..255 bytes to 'out'
 * (out_stride bytes per row). */
static void rjpeg__idct_block(uint8_t *out, int out_stride, short data[64])
{
   int i,val[64],*v=val;
   uint8_t *o = NULL;
   int16_t *d = data;

   /* columns */
   for (i=0; i < 8; ++i,++d, ++v)
   {
      /* if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing */
      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
            && d[40]==0 && d[48]==0 && d[56]==0)
      {
         /* no shortcut                 0 seconds
          * (1|2|3|4|5|6|7)==0          0 seconds
          * all separate               -0.047 seconds
          * 1 && 2|3 && 4|5 && 6|7:    -0.047 seconds */
         int dcterm = d[0] << 2;
         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
      }
      else
      {
         RJPEG__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
         /* constants scaled things up by 1<<12; let's bring them back
          * down, but keep 2 extra bits of precision */
         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
         v[ 0] = (x0+t3) >> 10;
         v[56] = (x0-t3) >> 10;
         v[ 8] = (x1+t2) >> 10;
         v[48] = (x1-t2) >> 10;
         v[16] = (x2+t1) >> 10;
         v[40] = (x2-t1) >> 10;
         v[24] = (x3+t0) >> 10;
         v[32] = (x3-t0) >> 10;
      }
   }

   for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride)
   {
      /* no fast case since the first 1D IDCT spread components out */
      RJPEG__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
      /* constants scaled things up by 1<<12, plus we had 1<<2 from first
       * loop, plus horizontal and vertical each scale by sqrt(8) so together
       * we've got an extra 1<<3, so 1<<17 total we need to remove.
       * so we want to round that, which means adding 0.5 * 1<<17,
       * aka 65536. Also, we'll end up with -128 to 127 that we want
       * to encode as 0..255 by adding 128, so we'll add that before the shift */
      x0 += 65536 + (128<<17);
      x1 += 65536 + (128<<17);
      x2 += 65536 + (128<<17);
      x3 += 65536 + (128<<17);
      /* tried computing the shifts into temps, or'ing the temps to see
       * if any were out of range, but that was slower */
      o[0] = rjpeg__clamp((x0+t3) >> 17);
      o[7] = rjpeg__clamp((x0-t3) >> 17);
      o[1] = rjpeg__clamp((x1+t2) >> 17);
      o[6] = rjpeg__clamp((x1-t2) >> 17);
      o[2] = rjpeg__clamp((x2+t1) >> 17);
      o[5] = rjpeg__clamp((x2-t1) >> 17);
      o[3] = rjpeg__clamp((x3+t0) >> 17);
      o[4] = rjpeg__clamp((x3-t0) >> 17);
   }
}
883
884 #if defined(__SSE2__)
885 /* sse2 integer IDCT. not the fastest possible implementation but it
886 * produces bit-identical results to the generic C version so it's
887 * fully "transparent".
888 */
/* SSE2 integer 8x8 IDCT. Not the fastest possible implementation but it
 * produces bit-identical results to the generic C version so it's
 * fully "transparent": column pass, 16-bit transpose, row pass, then an
 * 8-bit transpose while packing and storing the output rows. */
static void rjpeg__idct_simd(uint8_t *out, int out_stride, short data[64])
{
   /* This is constructed to match our regular (generic) integer IDCT exactly. */
   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
   __m128i tmp;

   /* dot product constant: even elems=x, odd elems=y */
   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))

   /* out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
    * out(1) = c1[even]*x + c1[odd]*y
    */
   #define dct_rot(out0,out1, x,y,c0,c1) \
      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)

   /* out = in << 12  (in 16-bit, out 32-bit) */
   #define dct_widen(out, in) \
      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)

   /* wide add */
   #define dct_wadd(out, a, b) \
      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)

   /* wide sub */
   #define dct_wsub(out, a, b) \
      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)

   /* butterfly a/b, add bias, then shift by "s" and pack */
   #define dct_bfly32o(out0, out1, a,b,bias,s) \
      { \
         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
         dct_wadd(sum, abiased, b); \
         dct_wsub(dif, abiased, b); \
         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
      }

   /* 8-bit interleave step (for transposes) */
   #define dct_interleave8(a, b) \
      tmp = a; \
      a = _mm_unpacklo_epi8(a, b); \
      b = _mm_unpackhi_epi8(tmp, b)

   /* 16-bit interleave step (for transposes) */
   #define dct_interleave16(a, b) \
      tmp = a; \
      a = _mm_unpacklo_epi16(a, b); \
      b = _mm_unpackhi_epi16(tmp, b)

   #define dct_pass(bias,shift) \
      { \
         /* even part */ \
         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
         __m128i sum04 = _mm_add_epi16(row0, row4); \
         __m128i dif04 = _mm_sub_epi16(row0, row4); \
         dct_widen(t0e, sum04); \
         dct_widen(t1e, dif04); \
         dct_wadd(x0, t0e, t3e); \
         dct_wsub(x3, t0e, t3e); \
         dct_wadd(x1, t1e, t2e); \
         dct_wsub(x2, t1e, t2e); \
         /* odd part */ \
         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
         __m128i sum17 = _mm_add_epi16(row1, row7); \
         __m128i sum35 = _mm_add_epi16(row3, row5); \
         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
         dct_wadd(x4, y0o, y4o); \
         dct_wadd(x5, y1o, y5o); \
         dct_wadd(x6, y2o, y5o); \
         dct_wadd(x7, y3o, y4o); \
         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
      }

   /* rotation constants; sums of rjpeg__f2f pairs match the scalar IDCT */
   __m128i rot0_0 = dct_const(rjpeg__f2f(0.5411961f), rjpeg__f2f(0.5411961f) + rjpeg__f2f(-1.847759065f));
   __m128i rot0_1 = dct_const(rjpeg__f2f(0.5411961f) + rjpeg__f2f( 0.765366865f), rjpeg__f2f(0.5411961f));
   __m128i rot1_0 = dct_const(rjpeg__f2f(1.175875602f) + rjpeg__f2f(-0.899976223f), rjpeg__f2f(1.175875602f));
   __m128i rot1_1 = dct_const(rjpeg__f2f(1.175875602f), rjpeg__f2f(1.175875602f) + rjpeg__f2f(-2.562915447f));
   __m128i rot2_0 = dct_const(rjpeg__f2f(-1.961570560f) + rjpeg__f2f( 0.298631336f), rjpeg__f2f(-1.961570560f));
   __m128i rot2_1 = dct_const(rjpeg__f2f(-1.961570560f), rjpeg__f2f(-1.961570560f) + rjpeg__f2f( 3.072711026f));
   __m128i rot3_0 = dct_const(rjpeg__f2f(-0.390180644f) + rjpeg__f2f( 2.053119869f), rjpeg__f2f(-0.390180644f));
   __m128i rot3_1 = dct_const(rjpeg__f2f(-0.390180644f), rjpeg__f2f(-0.390180644f) + rjpeg__f2f( 1.501321110f));

   /* rounding biases in column/row passes, see rjpeg__idct_block for explanation. */
   __m128i bias_0 = _mm_set1_epi32(512);
   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));

   /* load */
   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
   row7 = _mm_load_si128((const __m128i *) (data + 7*8));

   /* column pass */
   dct_pass(bias_0, 10);

   {
      /* 16bit 8x8 transpose pass 1 */
      dct_interleave16(row0, row4);
      dct_interleave16(row1, row5);
      dct_interleave16(row2, row6);
      dct_interleave16(row3, row7);

      /* transpose pass 2 */
      dct_interleave16(row0, row2);
      dct_interleave16(row1, row3);
      dct_interleave16(row4, row6);
      dct_interleave16(row5, row7);

      /* transpose pass 3 */
      dct_interleave16(row0, row1);
      dct_interleave16(row2, row3);
      dct_interleave16(row4, row5);
      dct_interleave16(row6, row7);
   }

   /* row pass */
   dct_pass(bias_1, 17);

   {
      /* pack */
      __m128i p0 = _mm_packus_epi16(row0, row1); /* a0a1a2a3...a7b0b1b2b3...b7 */
      __m128i p1 = _mm_packus_epi16(row2, row3);
      __m128i p2 = _mm_packus_epi16(row4, row5);
      __m128i p3 = _mm_packus_epi16(row6, row7);

      /* 8bit 8x8 transpose pass 1 */
      dct_interleave8(p0, p2); /* a0e0a1e1... */
      dct_interleave8(p1, p3); /* c0g0c1g1... */

      /* transpose pass 2 */
      dct_interleave8(p0, p1); /* a0c0e0g0... */
      dct_interleave8(p2, p3); /* b0d0f0h0... */

      /* transpose pass 3 */
      dct_interleave8(p0, p2); /* a0b0c0d0... */
      dct_interleave8(p1, p3); /* a4b4c4d4... */

      /* store */
      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
   }

   #undef dct_const
   #undef dct_rot
   #undef dct_widen
   #undef dct_wadd
   #undef dct_wsub
   #undef dct_bfly32o
   #undef dct_interleave8
   #undef dct_interleave16
   #undef dct_pass
}
1064
1065 #endif
1066
1067 #ifdef RJPEG_NEON
1068
1069 /* NEON integer IDCT. should produce bit-identical
1070 * results to the generic C version. */
rjpeg__idct_simd(uint8_t * out,int out_stride,short data[64])1071 static void rjpeg__idct_simd(uint8_t *out, int out_stride, short data[64])
1072 {
1073 int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
1074
1075 int16x4_t rot0_0 = vdup_n_s16(rjpeg__f2f(0.5411961f));
1076 int16x4_t rot0_1 = vdup_n_s16(rjpeg__f2f(-1.847759065f));
1077 int16x4_t rot0_2 = vdup_n_s16(rjpeg__f2f( 0.765366865f));
1078 int16x4_t rot1_0 = vdup_n_s16(rjpeg__f2f( 1.175875602f));
1079 int16x4_t rot1_1 = vdup_n_s16(rjpeg__f2f(-0.899976223f));
1080 int16x4_t rot1_2 = vdup_n_s16(rjpeg__f2f(-2.562915447f));
1081 int16x4_t rot2_0 = vdup_n_s16(rjpeg__f2f(-1.961570560f));
1082 int16x4_t rot2_1 = vdup_n_s16(rjpeg__f2f(-0.390180644f));
1083 int16x4_t rot3_0 = vdup_n_s16(rjpeg__f2f( 0.298631336f));
1084 int16x4_t rot3_1 = vdup_n_s16(rjpeg__f2f( 2.053119869f));
1085 int16x4_t rot3_2 = vdup_n_s16(rjpeg__f2f( 3.072711026f));
1086 int16x4_t rot3_3 = vdup_n_s16(rjpeg__f2f( 1.501321110f));
1087
1088 #define dct_long_mul(out, inq, coeff) \
1089 int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
1090 int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
1091
1092 #define dct_long_mac(out, acc, inq, coeff) \
1093 int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
1094 int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
1095
1096 #define dct_widen(out, inq) \
1097 int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
1098 int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
1099
1100 /* wide add */
1101 #define dct_wadd(out, a, b) \
1102 int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
1103 int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
1104
1105 /* wide sub */
1106 #define dct_wsub(out, a, b) \
1107 int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
1108 int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
1109
1110 /* butterfly a/b, then shift using "shiftop" by "s" and pack */
1111 #define dct_bfly32o(out0,out1, a,b,shiftop,s) \
1112 { \
1113 dct_wadd(sum, a, b); \
1114 dct_wsub(dif, a, b); \
1115 out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
1116 out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
1117 }
1118
1119 #define dct_pass(shiftop, shift) \
1120 { \
1121 /* even part */ \
1122 int16x8_t sum26 = vaddq_s16(row2, row6); \
1123 dct_long_mul(p1e, sum26, rot0_0); \
1124 dct_long_mac(t2e, p1e, row6, rot0_1); \
1125 dct_long_mac(t3e, p1e, row2, rot0_2); \
1126 int16x8_t sum04 = vaddq_s16(row0, row4); \
1127 int16x8_t dif04 = vsubq_s16(row0, row4); \
1128 dct_widen(t0e, sum04); \
1129 dct_widen(t1e, dif04); \
1130 dct_wadd(x0, t0e, t3e); \
1131 dct_wsub(x3, t0e, t3e); \
1132 dct_wadd(x1, t1e, t2e); \
1133 dct_wsub(x2, t1e, t2e); \
1134 /* odd part */ \
1135 int16x8_t sum15 = vaddq_s16(row1, row5); \
1136 int16x8_t sum17 = vaddq_s16(row1, row7); \
1137 int16x8_t sum35 = vaddq_s16(row3, row5); \
1138 int16x8_t sum37 = vaddq_s16(row3, row7); \
1139 int16x8_t sumodd = vaddq_s16(sum17, sum35); \
1140 dct_long_mul(p5o, sumodd, rot1_0); \
1141 dct_long_mac(p1o, p5o, sum17, rot1_1); \
1142 dct_long_mac(p2o, p5o, sum35, rot1_2); \
1143 dct_long_mul(p3o, sum37, rot2_0); \
1144 dct_long_mul(p4o, sum15, rot2_1); \
1145 dct_wadd(sump13o, p1o, p3o); \
1146 dct_wadd(sump24o, p2o, p4o); \
1147 dct_wadd(sump23o, p2o, p3o); \
1148 dct_wadd(sump14o, p1o, p4o); \
1149 dct_long_mac(x4, sump13o, row7, rot3_0); \
1150 dct_long_mac(x5, sump24o, row5, rot3_1); \
1151 dct_long_mac(x6, sump23o, row3, rot3_2); \
1152 dct_long_mac(x7, sump14o, row1, rot3_3); \
1153 dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
1154 dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
1155 dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
1156 dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
1157 }
1158
1159 /* load */
1160 row0 = vld1q_s16(data + 0*8);
1161 row1 = vld1q_s16(data + 1*8);
1162 row2 = vld1q_s16(data + 2*8);
1163 row3 = vld1q_s16(data + 3*8);
1164 row4 = vld1q_s16(data + 4*8);
1165 row5 = vld1q_s16(data + 5*8);
1166 row6 = vld1q_s16(data + 6*8);
1167 row7 = vld1q_s16(data + 7*8);
1168
1169 /* add DC bias */
1170 row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
1171
1172 /* column pass */
1173 dct_pass(vrshrn_n_s32, 10);
1174
1175 /* 16bit 8x8 transpose */
1176 {
1177 /* these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
1178 * whether compilers actually get this is another story, sadly. */
1179 #define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
1180 #define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
1181 #define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
1182
1183 /* pass 1 */
1184 dct_trn16(row0, row1); /* a0b0a2b2a4b4a6b6 */
1185 dct_trn16(row2, row3);
1186 dct_trn16(row4, row5);
1187 dct_trn16(row6, row7);
1188
1189 /* pass 2 */
1190 dct_trn32(row0, row2); /* a0b0c0d0a4b4c4d4 */
1191 dct_trn32(row1, row3);
1192 dct_trn32(row4, row6);
1193 dct_trn32(row5, row7);
1194
1195 /* pass 3 */
1196 dct_trn64(row0, row4); /* a0b0c0d0e0f0g0h0 */
1197 dct_trn64(row1, row5);
1198 dct_trn64(row2, row6);
1199 dct_trn64(row3, row7);
1200
1201 #undef dct_trn16
1202 #undef dct_trn32
1203 #undef dct_trn64
1204 }
1205
1206 /* row pass
1207 * vrshrn_n_s32 only supports shifts up to 16, we need
1208 * 17. so do a non-rounding shift of 16 first then follow
1209 * up with a rounding shift by 1. */
1210 dct_pass(vshrn_n_s32, 16);
1211
1212 {
1213 /* pack and round */
1214 uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
1215 uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
1216 uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
1217 uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
1218 uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
1219 uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
1220 uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
1221 uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
1222
1223 /* again, these can translate into one instruction, but often don't. */
1224 #define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
1225 #define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
1226 #define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
1227
1228 /* sadly can't use interleaved stores here since we only write
1229 * 8 bytes to each scan line! */
1230
1231 /* 8x8 8-bit transpose pass 1 */
1232 dct_trn8_8(p0, p1);
1233 dct_trn8_8(p2, p3);
1234 dct_trn8_8(p4, p5);
1235 dct_trn8_8(p6, p7);
1236
1237 /* pass 2 */
1238 dct_trn8_16(p0, p2);
1239 dct_trn8_16(p1, p3);
1240 dct_trn8_16(p4, p6);
1241 dct_trn8_16(p5, p7);
1242
1243 /* pass 3 */
1244 dct_trn8_32(p0, p4);
1245 dct_trn8_32(p1, p5);
1246 dct_trn8_32(p2, p6);
1247 dct_trn8_32(p3, p7);
1248
1249 /* store */
1250 vst1_u8(out, p0); out += out_stride;
1251 vst1_u8(out, p1); out += out_stride;
1252 vst1_u8(out, p2); out += out_stride;
1253 vst1_u8(out, p3); out += out_stride;
1254 vst1_u8(out, p4); out += out_stride;
1255 vst1_u8(out, p5); out += out_stride;
1256 vst1_u8(out, p6); out += out_stride;
1257 vst1_u8(out, p7);
1258
1259 #undef dct_trn8_8
1260 #undef dct_trn8_16
1261 #undef dct_trn8_32
1262 }
1263
1264 #undef dct_long_mul
1265 #undef dct_long_mac
1266 #undef dct_widen
1267 #undef dct_wadd
1268 #undef dct_wsub
1269 #undef dct_bfly32o
1270 #undef dct_pass
1271 }
1272
1273 #endif /* RJPEG_NEON */
1274
rjpeg__get_marker(rjpeg__jpeg * j)1275 static uint8_t rjpeg__get_marker(rjpeg__jpeg *j)
1276 {
1277 uint8_t x;
1278
1279 if (j->marker != RJPEG__MARKER_NONE)
1280 {
1281 x = j->marker;
1282 j->marker = RJPEG__MARKER_NONE;
1283 return x;
1284 }
1285
1286 x = rjpeg__get8(j->s);
1287 if (x != 0xff)
1288 return RJPEG__MARKER_NONE;
1289 while (x == 0xff)
1290 x = rjpeg__get8(j->s);
1291 return x;
1292 }
1293
1294 /* after a restart interval, rjpeg__jpeg_reset the entropy decoder and
1295 * the dc prediction
1296 */
rjpeg__jpeg_reset(rjpeg__jpeg * j)1297 static void rjpeg__jpeg_reset(rjpeg__jpeg *j)
1298 {
1299 j->code_bits = 0;
1300 j->code_buffer = 0;
1301 j->nomore = 0;
1302 j->img_comp[0].dc_pred = 0;
1303 j->img_comp[1].dc_pred = 0;
1304 j->img_comp[2].dc_pred = 0;
1305 j->marker = RJPEG__MARKER_NONE;
1306 j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
1307 j->eob_run = 0;
1308
1309 /* no more than 1<<31 MCUs if no restart_interal? that's plenty safe,
1310 * since we don't even allow 1<<30 pixels */
1311 }
1312
rjpeg__parse_entropy_coded_data(rjpeg__jpeg * z)1313 static int rjpeg__parse_entropy_coded_data(rjpeg__jpeg *z)
1314 {
1315 rjpeg__jpeg_reset(z);
1316
1317 if (z->scan_n == 1)
1318 {
1319 int i,j;
1320 int n = z->order[0];
1321 int w = (z->img_comp[n].x+7) >> 3;
1322 int h = (z->img_comp[n].y+7) >> 3;
1323
1324 /* non-interleaved data, we just need to process one block at a time,
1325 * in trivial scanline order
1326 * number of blocks to do just depends on how many actual "pixels" this
1327 * component has, independent of interleaved MCU blocking and such */
1328
1329 if (z->progressive)
1330 {
1331 for (j=0; j < h; ++j)
1332 {
1333 for (i=0; i < w; ++i)
1334 {
1335 short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
1336
1337 if (z->spec_start == 0)
1338 {
1339 if (!rjpeg__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
1340 return 0;
1341 }
1342 else
1343 {
1344 int ha = z->img_comp[n].ha;
1345 if (!rjpeg__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
1346 return 0;
1347 }
1348
1349 /* every data block is an MCU, so countdown the restart interval */
1350 if (--z->todo <= 0)
1351 {
1352 if (z->code_bits < 24)
1353 rjpeg__grow_buffer_unsafe(z);
1354
1355 if (!RJPEG__RESTART(z->marker))
1356 return 1;
1357 rjpeg__jpeg_reset(z);
1358 }
1359 }
1360 }
1361 }
1362 else
1363 {
1364 RJPEG_SIMD_ALIGN(short, data[64]);
1365
1366 for (j=0; j < h; ++j)
1367 {
1368 for (i=0; i < w; ++i)
1369 {
1370 int ha = z->img_comp[n].ha;
1371 if (!rjpeg__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd,
1372 z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq]))
1373 return 0;
1374
1375 z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8,
1376 z->img_comp[n].w2, data);
1377
1378 /* every data block is an MCU, so countdown the restart interval */
1379 if (--z->todo <= 0)
1380 {
1381 if (z->code_bits < 24)
1382 rjpeg__grow_buffer_unsafe(z);
1383
1384 /* if it's NOT a restart, then just bail,
1385 * so we get corrupt data rather than no data */
1386 if (!RJPEG__RESTART(z->marker))
1387 return 1;
1388 rjpeg__jpeg_reset(z);
1389 }
1390 }
1391 }
1392 }
1393 }
1394 else
1395 {
1396 /* interleaved */
1397 int i,j,k,x,y;
1398
1399 if (z->progressive)
1400 {
1401 for (j=0; j < z->img_mcu_y; ++j)
1402 {
1403 for (i=0; i < z->img_mcu_x; ++i)
1404 {
1405 /* scan an interleaved MCU... process scan_n components in order */
1406 for (k=0; k < z->scan_n; ++k)
1407 {
1408 int n = z->order[k];
1409 /* scan out an MCU's worth of this component; that's just determined
1410 * by the basic H and V specified for the component */
1411 for (y=0; y < z->img_comp[n].v; ++y)
1412 {
1413 for (x=0; x < z->img_comp[n].h; ++x)
1414 {
1415 int x2 = (i*z->img_comp[n].h + x);
1416 int y2 = (j*z->img_comp[n].v + y);
1417 short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
1418 if (!rjpeg__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
1419 return 0;
1420 }
1421 }
1422 }
1423
1424 /* after all interleaved components, that's an interleaved MCU,
1425 * so now count down the restart interval */
1426 if (--z->todo <= 0)
1427 {
1428 if (z->code_bits < 24)
1429 rjpeg__grow_buffer_unsafe(z);
1430 if (!RJPEG__RESTART(z->marker))
1431 return 1;
1432 rjpeg__jpeg_reset(z);
1433 }
1434 }
1435 }
1436 }
1437 else
1438 {
1439 RJPEG_SIMD_ALIGN(short, data[64]);
1440
1441 for (j=0; j < z->img_mcu_y; ++j)
1442 {
1443 for (i=0; i < z->img_mcu_x; ++i)
1444 {
1445 /* scan an interleaved MCU... process scan_n components in order */
1446 for (k=0; k < z->scan_n; ++k)
1447 {
1448 int n = z->order[k];
1449 /* scan out an MCU's worth of this component; that's just determined
1450 * by the basic H and V specified for the component */
1451 for (y=0; y < z->img_comp[n].v; ++y)
1452 {
1453 for (x=0; x < z->img_comp[n].h; ++x)
1454 {
1455 int x2 = (i*z->img_comp[n].h + x)*8;
1456 int y2 = (j*z->img_comp[n].v + y)*8;
1457 int ha = z->img_comp[n].ha;
1458
1459 if (!rjpeg__jpeg_decode_block(z, data,
1460 z->huff_dc+z->img_comp[n].hd,
1461 z->huff_ac+ha, z->fast_ac[ha],
1462 n, z->dequant[z->img_comp[n].tq]))
1463 return 0;
1464
1465 z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2,
1466 z->img_comp[n].w2, data);
1467 }
1468 }
1469 }
1470
1471 /* after all interleaved components, that's an interleaved MCU,
1472 * so now count down the restart interval */
1473 if (--z->todo <= 0)
1474 {
1475 if (z->code_bits < 24)
1476 rjpeg__grow_buffer_unsafe(z);
1477 if (!RJPEG__RESTART(z->marker))
1478 return 1;
1479 rjpeg__jpeg_reset(z);
1480 }
1481 }
1482 }
1483 }
1484 }
1485
1486 return 1;
1487 }
1488
rjpeg__jpeg_dequantize(short * data,uint8_t * dequant)1489 static void rjpeg__jpeg_dequantize(short *data, uint8_t *dequant)
1490 {
1491 int i;
1492 for (i=0; i < 64; ++i)
1493 data[i] *= dequant[i];
1494 }
1495
rjpeg__jpeg_finish(rjpeg__jpeg * z)1496 static void rjpeg__jpeg_finish(rjpeg__jpeg *z)
1497 {
1498 int i,j,n;
1499
1500 if (!z->progressive)
1501 return;
1502
1503 /* dequantize and IDCT the data */
1504 for (n=0; n < z->s->img_n; ++n)
1505 {
1506 int w = (z->img_comp[n].x+7) >> 3;
1507 int h = (z->img_comp[n].y+7) >> 3;
1508 for (j=0; j < h; ++j)
1509 {
1510 for (i=0; i < w; ++i)
1511 {
1512 short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
1513 rjpeg__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
1514 z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8,
1515 z->img_comp[n].w2, data);
1516 }
1517 }
1518 }
1519 }
1520
rjpeg__process_marker(rjpeg__jpeg * z,int m)1521 static int rjpeg__process_marker(rjpeg__jpeg *z, int m)
1522 {
1523 int L;
1524 switch (m)
1525 {
1526 case RJPEG__MARKER_NONE: /* no marker found */
1527 /* Expected marker. Corrupt JPEG? */
1528 return 0;
1529
1530 case 0xDD: /* DRI - specify restart interval */
1531
1532 /* Bad DRI length. Corrupt JPEG? */
1533 if (RJPEG__GET16BE(z->s) != 4)
1534 return 0;
1535
1536 z->restart_interval = RJPEG__GET16BE(z->s);
1537 return 1;
1538
1539 case 0xDB: /* DQT - define quantization table */
1540 L = RJPEG__GET16BE(z->s)-2;
1541 while (L > 0)
1542 {
1543 int q = rjpeg__get8(z->s);
1544 int p = q >> 4;
1545 int t = q & 15,i;
1546
1547 /* Bad DQT type. Corrupt JPEG? */
1548 if (p != 0)
1549 return 0;
1550
1551 /* Bad DQT table. Corrupt JPEG? */
1552 if (t > 3)
1553 return 0;
1554
1555 for (i=0; i < 64; ++i)
1556 z->dequant[t][rjpeg__jpeg_dezigzag[i]] = rjpeg__get8(z->s);
1557 L -= 65;
1558 }
1559 return L==0;
1560
1561 case 0xC4: /* DHT - define huffman table */
1562 L = RJPEG__GET16BE(z->s)-2;
1563 while (L > 0)
1564 {
1565 int sizes[16],i,n=0;
1566 uint8_t *v = NULL;
1567 int q = rjpeg__get8(z->s);
1568 int tc = q >> 4;
1569 int th = q & 15;
1570
1571 /* Bad DHT header. Corrupt JPEG? */
1572 if (tc > 1 || th > 3)
1573 return 0;
1574
1575 for (i=0; i < 16; ++i)
1576 {
1577 sizes[i] = rjpeg__get8(z->s);
1578 n += sizes[i];
1579 }
1580 L -= 17;
1581
1582 if (tc == 0)
1583 {
1584 if (!rjpeg__build_huffman(z->huff_dc+th, sizes))
1585 return 0;
1586 v = z->huff_dc[th].values;
1587 }
1588 else
1589 {
1590 if (!rjpeg__build_huffman(z->huff_ac+th, sizes))
1591 return 0;
1592 v = z->huff_ac[th].values;
1593 }
1594 for (i=0; i < n; ++i)
1595 v[i] = rjpeg__get8(z->s);
1596 if (tc != 0)
1597 rjpeg__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
1598 L -= n;
1599 }
1600 return L==0;
1601 }
1602
1603 /* check for comment block or APP blocks */
1604 if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE)
1605 {
1606 int n = RJPEG__GET16BE(z->s)-2;
1607
1608 if (n < 0)
1609 z->s->img_buffer = z->s->img_buffer_end;
1610 else
1611 z->s->img_buffer += n;
1612
1613 return 1;
1614 }
1615 return 0;
1616 }
1617
1618 /* after we see SOS */
rjpeg__process_scan_header(rjpeg__jpeg * z)1619 static int rjpeg__process_scan_header(rjpeg__jpeg *z)
1620 {
1621 int i;
1622 int aa;
1623 int Ls = RJPEG__GET16BE(z->s);
1624
1625 z->scan_n = rjpeg__get8(z->s);
1626
1627 /* Bad SOS component count. Corrupt JPEG? */
1628 if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n)
1629 return 0;
1630
1631 /* Bad SOS length. Corrupt JPEG? */
1632 if (Ls != 6+2*z->scan_n)
1633 return 0;
1634
1635 for (i=0; i < z->scan_n; ++i)
1636 {
1637 int which;
1638 int id = rjpeg__get8(z->s);
1639 int q = rjpeg__get8(z->s);
1640
1641 for (which = 0; which < z->s->img_n; ++which)
1642 if (z->img_comp[which].id == id)
1643 break;
1644 if (which == z->s->img_n)
1645 return 0; /* no match */
1646
1647 /* Bad DC huff. Corrupt JPEG? */
1648 z->img_comp[which].hd = q >> 4; if (z->img_comp[which].hd > 3)
1649 return 0;
1650
1651 /* Bad AC huff. Corrupt JPEG? */
1652 z->img_comp[which].ha = q & 15; if (z->img_comp[which].ha > 3)
1653 return 0;
1654
1655 z->order[i] = which;
1656 }
1657
1658 z->spec_start = rjpeg__get8(z->s);
1659 z->spec_end = rjpeg__get8(z->s); /* should be 63, but might be 0 */
1660 aa = rjpeg__get8(z->s);
1661 z->succ_high = (aa >> 4);
1662 z->succ_low = (aa & 15);
1663
1664 if (z->progressive)
1665 {
1666 /* Bad SOS. Corrupt JPEG? */
1667 if ( z->spec_start > 63 ||
1668 z->spec_end > 63 ||
1669 z->spec_start > z->spec_end ||
1670 z->succ_high > 13 ||
1671 z->succ_low > 13)
1672 return 0;
1673 }
1674 else
1675 {
1676 /* Bad SOS. Corrupt JPEG? */
1677 if (z->spec_start != 0)
1678 return 0;
1679 if (z->succ_high != 0 || z->succ_low != 0)
1680 return 0;
1681
1682 z->spec_end = 63;
1683 }
1684
1685 return 1;
1686 }
1687
rjpeg__process_frame_header(rjpeg__jpeg * z,int scan)1688 static int rjpeg__process_frame_header(rjpeg__jpeg *z, int scan)
1689 {
1690 rjpeg__context *s = z->s;
1691 int Lf,p,i,q, h_max=1,v_max=1,c;
1692 Lf = RJPEG__GET16BE(s);
1693
1694 /* JPEG */
1695
1696 /* Bad SOF len. Corrupt JPEG? */
1697 if (Lf < 11)
1698 return 0;
1699
1700 p = rjpeg__get8(s);
1701
1702 /* JPEG baseline */
1703
1704 /* Only 8-bit. JPEG format not supported? */
1705 if (p != 8)
1706 return 0;
1707
1708 s->img_y = RJPEG__GET16BE(s);
1709
1710 /* Legal, but we don't handle it--but neither does IJG */
1711
1712 /* No header height, JPEG format not supported? */
1713 if (s->img_y == 0)
1714 return 0;
1715
1716 s->img_x = RJPEG__GET16BE(s);
1717
1718 /* No header width. Corrupt JPEG? */
1719 if (s->img_x == 0)
1720 return 0;
1721
1722 c = rjpeg__get8(s);
1723
1724 /* JFIF requires */
1725
1726 /* Bad component count. Corrupt JPEG? */
1727 if (c != 3 && c != 1)
1728 return 0;
1729
1730 s->img_n = c;
1731
1732 for (i=0; i < c; ++i)
1733 {
1734 z->img_comp[i].data = NULL;
1735 z->img_comp[i].linebuf = NULL;
1736 }
1737
1738 /* Bad SOF length. Corrupt JPEG? */
1739 if (Lf != 8+3*s->img_n)
1740 return 0;
1741
1742 for (i=0; i < s->img_n; ++i)
1743 {
1744 z->img_comp[i].id = rjpeg__get8(s);
1745 if (z->img_comp[i].id != i+1) /* JFIF requires */
1746 if (z->img_comp[i].id != i) /* some version of jpegtran outputs non-JFIF-compliant files! */
1747 return 0;
1748
1749 q = rjpeg__get8(s);
1750 z->img_comp[i].h = (q >> 4);
1751
1752 /* Bad H. Corrupt JPEG? */
1753 if (!z->img_comp[i].h || z->img_comp[i].h > 4)
1754 return 0;
1755
1756 z->img_comp[i].v = q & 15;
1757
1758 /* Bad V. Corrupt JPEG? */
1759 if (!z->img_comp[i].v || z->img_comp[i].v > 4)
1760 return 0;
1761
1762 z->img_comp[i].tq = rjpeg__get8(s);
1763
1764 /* Bad TQ. Corrupt JPEG? */
1765 if (z->img_comp[i].tq > 3)
1766 return 0;
1767 }
1768
1769 if (scan != RJPEG_SCAN_LOAD)
1770 return 1;
1771
1772 /* Image too large to decode? */
1773 if ((1 << 30) / s->img_x / s->img_n < s->img_y)
1774 return 0;
1775
1776 for (i=0; i < s->img_n; ++i)
1777 {
1778 if (z->img_comp[i].h > h_max)
1779 h_max = z->img_comp[i].h;
1780 if (z->img_comp[i].v > v_max)
1781 v_max = z->img_comp[i].v;
1782 }
1783
1784 /* compute interleaved MCU info */
1785 z->img_h_max = h_max;
1786 z->img_v_max = v_max;
1787 z->img_mcu_w = h_max * 8;
1788 z->img_mcu_h = v_max * 8;
1789 z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
1790 z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
1791
1792 if (z->progressive)
1793 {
1794 for (i=0; i < s->img_n; ++i)
1795 {
1796 /* number of effective pixels (e.g. for non-interleaved MCU) */
1797 z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
1798 z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
1799
1800 /* to simplify generation, we'll allocate enough memory to decode
1801 * the bogus oversized data from using interleaved MCUs and their
1802 * big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
1803 * discard the extra data until colorspace conversion */
1804 z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
1805 z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
1806 z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
1807
1808 /* Out of memory? */
1809 if (!z->img_comp[i].raw_data)
1810 {
1811 for(--i; i >= 0; --i)
1812 {
1813 free(z->img_comp[i].raw_data);
1814 z->img_comp[i].data = NULL;
1815 }
1816
1817 return 0;
1818 }
1819
1820 /* align blocks for IDCT using MMX/SSE */
1821 z->img_comp[i].data = (uint8_t*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
1822 z->img_comp[i].linebuf = NULL;
1823 z->img_comp[i].coeff_w = (z->img_comp[i].w2 + 7) >> 3;
1824 z->img_comp[i].coeff_h = (z->img_comp[i].h2 + 7) >> 3;
1825 z->img_comp[i].raw_coeff = malloc(z->img_comp[i].coeff_w *
1826 z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);
1827 z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
1828 }
1829 }
1830 else
1831 {
1832 for (i=0; i < s->img_n; ++i)
1833 {
1834 /* number of effective pixels (e.g. for non-interleaved MCU) */
1835 z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
1836 z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
1837
1838 /* to simplify generation, we'll allocate enough memory to decode
1839 * the bogus oversized data from using interleaved MCUs and their
1840 * big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
1841 * discard the extra data until colorspace conversion */
1842 z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
1843 z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
1844 z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
1845
1846 /* Out of memory? */
1847 if (!z->img_comp[i].raw_data)
1848 {
1849 for(--i; i >= 0; --i)
1850 {
1851 free(z->img_comp[i].raw_data);
1852 z->img_comp[i].data = NULL;
1853 }
1854 }
1855
1856 /* align blocks for IDCT using MMX/SSE */
1857 z->img_comp[i].data = (uint8_t*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
1858 z->img_comp[i].linebuf = NULL;
1859 z->img_comp[i].coeff = 0;
1860 z->img_comp[i].raw_coeff = 0;
1861 }
1862 }
1863
1864 return 1;
1865 }
1866
rjpeg__decode_jpeg_header(rjpeg__jpeg * z,int scan)1867 static int rjpeg__decode_jpeg_header(rjpeg__jpeg *z, int scan)
1868 {
1869 int m;
1870 z->marker = RJPEG__MARKER_NONE; /* initialize cached marker to empty */
1871 m = rjpeg__get_marker(z);
1872
1873 /* No SOI. Corrupt JPEG? */
1874 if (m != JPEG_MARKER_SOI)
1875 return 0;
1876
1877 if (scan == RJPEG_SCAN_TYPE)
1878 return 1;
1879
1880 m = rjpeg__get_marker(z);
1881 while (!rjpeg__SOF(m))
1882 {
1883 if (!rjpeg__process_marker(z,m))
1884 return 0;
1885 m = rjpeg__get_marker(z);
1886 while (m == RJPEG__MARKER_NONE)
1887 {
1888 /* some files have extra padding after their blocks, so ok, we'll scan */
1889
1890 /* No SOF. Corrupt JPEG? */
1891 if (RJPEG__AT_EOF(z->s))
1892 return 0;
1893
1894 m = rjpeg__get_marker(z);
1895 }
1896 }
1897 z->progressive = rjpeg__SOF_progressive(m);
1898 if (!rjpeg__process_frame_header(z, scan))
1899 return 0;
1900 return 1;
1901 }
1902
1903 /* decode image to YCbCr format */
rjpeg__decode_jpeg_image(rjpeg__jpeg * j)1904 static int rjpeg__decode_jpeg_image(rjpeg__jpeg *j)
1905 {
1906 int m;
1907 for (m = 0; m < 4; m++)
1908 {
1909 j->img_comp[m].raw_data = NULL;
1910 j->img_comp[m].raw_coeff = NULL;
1911 }
1912 j->restart_interval = 0;
1913 if (!rjpeg__decode_jpeg_header(j, RJPEG_SCAN_LOAD))
1914 return 0;
1915 m = rjpeg__get_marker(j);
1916
1917 while (m != JPEG_MARKER_EOI)
1918 {
1919 if (m == JPEG_MARKER_SOS)
1920 {
1921 if (!rjpeg__process_scan_header(j))
1922 return 0;
1923 if (!rjpeg__parse_entropy_coded_data(j))
1924 return 0;
1925
1926 if (j->marker == RJPEG__MARKER_NONE )
1927 {
1928 /* handle 0s at the end of image data from IP Kamera 9060 */
1929
1930 while (!RJPEG__AT_EOF(j->s))
1931 {
1932 int x = rjpeg__get8(j->s);
1933 if (x == 255)
1934 {
1935 j->marker = rjpeg__get8(j->s);
1936 break;
1937 }
1938 else if (x != 0) /* Junk before marker. Corrupt JPEG? */
1939 return 0;
1940 }
1941
1942 /* if we reach eof without hitting a marker,
1943 * rjpeg__get_marker() below will fail and we'll eventually return 0 */
1944 }
1945 }
1946 else
1947 {
1948 if (!rjpeg__process_marker(j, m))
1949 return 0;
1950 }
1951 m = rjpeg__get_marker(j);
1952 }
1953
1954 if (j->progressive)
1955 rjpeg__jpeg_finish(j);
1956 return 1;
1957 }
1958
1959 /* static jfif-centered resampling (across block boundaries) */
1960
rjpeg_resample_row_1(uint8_t * out,uint8_t * in_near,uint8_t * in_far,int w,int hs)1961 static uint8_t *rjpeg_resample_row_1(uint8_t *out, uint8_t *in_near,
1962 uint8_t *in_far, int w, int hs)
1963 {
1964 (void)out;
1965 (void)in_far;
1966 (void)w;
1967 (void)hs;
1968 return in_near;
1969 }
1970
rjpeg__resample_row_v_2(uint8_t * out,uint8_t * in_near,uint8_t * in_far,int w,int hs)1971 static uint8_t* rjpeg__resample_row_v_2(uint8_t *out, uint8_t *in_near,
1972 uint8_t *in_far, int w, int hs)
1973 {
1974 /* need to generate two samples vertically for every one in input */
1975 int i;
1976 (void)hs;
1977 for (i=0; i < w; ++i)
1978 out[i] = rjpeg__div4(3*in_near[i] + in_far[i] + 2);
1979 return out;
1980 }
1981
rjpeg__resample_row_h_2(uint8_t * out,uint8_t * in_near,uint8_t * in_far,int w,int hs)1982 static uint8_t* rjpeg__resample_row_h_2(uint8_t *out, uint8_t *in_near,
1983 uint8_t *in_far, int w, int hs)
1984 {
1985 /* need to generate two samples horizontally for every one in input */
1986 int i;
1987 uint8_t *input = in_near;
1988
1989 if (w == 1)
1990 {
1991 /* if only one sample, can't do any interpolation */
1992 out[0] = out[1] = input[0];
1993 return out;
1994 }
1995
1996 out[0] = input[0];
1997 out[1] = rjpeg__div4(input[0]*3 + input[1] + 2);
1998
1999 for (i=1; i < w-1; ++i)
2000 {
2001 int n = 3*input[i]+2;
2002 out[i*2+0] = rjpeg__div4(n+input[i-1]);
2003 out[i*2+1] = rjpeg__div4(n+input[i+1]);
2004 }
2005 out[i*2+0] = rjpeg__div4(input[w-2]*3 + input[w-1] + 2);
2006 out[i*2+1] = input[w-1];
2007
2008 (void)in_far;
2009 (void)hs;
2010
2011 return out;
2012 }
2013
rjpeg__resample_row_hv_2(uint8_t * out,uint8_t * in_near,uint8_t * in_far,int w,int hs)2014 static uint8_t *rjpeg__resample_row_hv_2(uint8_t *out, uint8_t *in_near,
2015 uint8_t *in_far, int w, int hs)
2016 {
2017 /* need to generate 2x2 samples for every one in input */
2018 int i,t0,t1;
2019 if (w == 1)
2020 {
2021 out[0] = out[1] = rjpeg__div4(3*in_near[0] + in_far[0] + 2);
2022 return out;
2023 }
2024
2025 t1 = 3*in_near[0] + in_far[0];
2026 out[0] = rjpeg__div4(t1+2);
2027 for (i=1; i < w; ++i)
2028 {
2029 t0 = t1;
2030 t1 = 3*in_near[i]+in_far[i];
2031 out[i*2-1] = rjpeg__div16(3*t0 + t1 + 8);
2032 out[i*2 ] = rjpeg__div16(3*t1 + t0 + 8);
2033 }
2034 out[w*2-1] = rjpeg__div4(t1+2);
2035
2036 (void)hs;
2037
2038 return out;
2039 }
2040
2041 #if defined(__SSE2__) || defined(RJPEG_NEON)
rjpeg__resample_row_hv_2_simd(uint8_t * out,uint8_t * in_near,uint8_t * in_far,int w,int hs)2042 static uint8_t *rjpeg__resample_row_hv_2_simd(uint8_t *out, uint8_t *in_near,
2043 uint8_t *in_far, int w, int hs)
2044 {
2045 /* need to generate 2x2 samples for every one in input */
2046 int i=0,t0,t1;
2047
2048 if (w == 1)
2049 {
2050 out[0] = out[1] = rjpeg__div4(3*in_near[0] + in_far[0] + 2);
2051 return out;
2052 }
2053
2054 t1 = 3*in_near[0] + in_far[0];
2055 /* process groups of 8 pixels for as long as we can.
2056 * note we can't handle the last pixel in a row in this loop
2057 * because we need to handle the filter boundary conditions.
2058 */
2059 for (; i < ((w-1) & ~7); i += 8)
2060 {
2061 #if defined(__SSE2__)
2062 /* load and perform the vertical filtering pass
2063 * this uses 3*x + y = 4*x + (y - x) */
2064 __m128i zero = _mm_setzero_si128();
2065 __m128i farb = _mm_loadl_epi64((__m128i *) (in_far + i));
2066 __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
2067 __m128i farw = _mm_unpacklo_epi8(farb, zero);
2068 __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
2069 __m128i diff = _mm_sub_epi16(farw, nearw);
2070 __m128i nears = _mm_slli_epi16(nearw, 2);
2071 __m128i curr = _mm_add_epi16(nears, diff); /* current row */
2072
2073 /* horizontal filter works the same based on shifted vers of current
2074 * row. "prev" is current row shifted right by 1 pixel; we need to
2075 * insert the previous pixel value (from t1).
2076 * "next" is current row shifted left by 1 pixel, with first pixel
2077 * of next block of 8 pixels added in.
2078 */
2079 __m128i prv0 = _mm_slli_si128(curr, 2);
2080 __m128i nxt0 = _mm_srli_si128(curr, 2);
2081 __m128i prev = _mm_insert_epi16(prv0, t1, 0);
2082 __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);
2083
2084 /* horizontal filter, polyphase implementation since it's convenient:
2085 * even pixels = 3*cur + prev = cur*4 + (prev - cur)
2086 * odd pixels = 3*cur + next = cur*4 + (next - cur)
2087 * note the shared term. */
2088 __m128i bias = _mm_set1_epi16(8);
2089 __m128i curs = _mm_slli_epi16(curr, 2);
2090 __m128i prvd = _mm_sub_epi16(prev, curr);
2091 __m128i nxtd = _mm_sub_epi16(next, curr);
2092 __m128i curb = _mm_add_epi16(curs, bias);
2093 __m128i even = _mm_add_epi16(prvd, curb);
2094 __m128i odd = _mm_add_epi16(nxtd, curb);
2095
2096 /* interleave even and odd pixels, then undo scaling. */
2097 __m128i int0 = _mm_unpacklo_epi16(even, odd);
2098 __m128i int1 = _mm_unpackhi_epi16(even, odd);
2099 __m128i de0 = _mm_srli_epi16(int0, 4);
2100 __m128i de1 = _mm_srli_epi16(int1, 4);
2101
2102 /* pack and write output */
2103 __m128i outv = _mm_packus_epi16(de0, de1);
2104 _mm_storeu_si128((__m128i *) (out + i*2), outv);
2105 #elif defined(RJPEG_NEON)
2106 /* load and perform the vertical filtering pass
2107 * this uses 3*x + y = 4*x + (y - x) */
2108 uint8x8_t farb = vld1_u8(in_far + i);
2109 uint8x8_t nearb = vld1_u8(in_near + i);
2110 int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
2111 int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
2112 int16x8_t curr = vaddq_s16(nears, diff); /* current row */
2113
2114 /* horizontal filter works the same based on shifted vers of current
2115 * row. "prev" is current row shifted right by 1 pixel; we need to
2116 * insert the previous pixel value (from t1).
2117 * "next" is current row shifted left by 1 pixel, with first pixel
2118 * of next block of 8 pixels added in. */
2119 int16x8_t prv0 = vextq_s16(curr, curr, 7);
2120 int16x8_t nxt0 = vextq_s16(curr, curr, 1);
2121 int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
2122 int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);
2123
2124 /* horizontal filter, polyphase implementation since it's convenient:
2125 * even pixels = 3*cur + prev = cur*4 + (prev - cur)
2126 * odd pixels = 3*cur + next = cur*4 + (next - cur)
2127 * note the shared term.
2128 */
2129 int16x8_t curs = vshlq_n_s16(curr, 2);
2130 int16x8_t prvd = vsubq_s16(prev, curr);
2131 int16x8_t nxtd = vsubq_s16(next, curr);
2132 int16x8_t even = vaddq_s16(curs, prvd);
2133 int16x8_t odd = vaddq_s16(curs, nxtd);
2134
2135 /* undo scaling and round, then store with even/odd phases interleaved */
2136 uint8x8x2_t o;
2137 o.val[0] = vqrshrun_n_s16(even, 4);
2138 o.val[1] = vqrshrun_n_s16(odd, 4);
2139 vst2_u8(out + i*2, o);
2140 #endif
2141
2142 /* "previous" value for next iteration */
2143 t1 = 3*in_near[i+7] + in_far[i+7];
2144 }
2145
2146 t0 = t1;
2147 t1 = 3*in_near[i] + in_far[i];
2148 out[i*2] = rjpeg__div16(3*t1 + t0 + 8);
2149
2150 for (++i; i < w; ++i)
2151 {
2152 t0 = t1;
2153 t1 = 3*in_near[i]+in_far[i];
2154 out[i*2-1] = rjpeg__div16(3*t0 + t1 + 8);
2155 out[i*2 ] = rjpeg__div16(3*t1 + t0 + 8);
2156 }
2157 out[w*2-1] = rjpeg__div4(t1+2);
2158
2159 (void)hs;
2160
2161 return out;
2162 }
2163 #endif
2164
static uint8_t *rjpeg__resample_row_generic(uint8_t *out,
      uint8_t *in_near, uint8_t *in_far, int w, int hs)
{
   /* Nearest-neighbor horizontal upsample: each of the w input
    * samples is replicated hs times into out.  The in_far row is
    * unused by this kernel (kept for the common resampler
    * signature).  Returns out. */
   uint8_t *dst = out;
   int col;
   (void)in_far;

   for (col = 0; col < w; ++col)
   {
      int rep;
      for (rep = 0; rep < hs; ++rep)
         *dst++ = in_near[col];
   }
   return out;
}
2177
/* this is a reduced-precision calculation of YCbCr-to-RGB introduced
 * to make sure the code produces the same results in both SIMD and scalar */
#ifndef float2fixed
/* Round x to 12 fractional bits (x*4096), then shift up by 8 so the
 * constant lives at the 20-bit fixed-point scale used by the row
 * converters below (which descale with '>> 20'). */
#define float2fixed(x) (((int) ((x) * 4096.0f + 0.5f)) << 8)
#endif
2183
/* Fallback so this block is self-contained; the canonical definition
 * appears earlier in this file and takes precedence via #ifndef. */
#ifndef float2fixed
#define float2fixed(x) (((int) ((x) * 4096.0f + 0.5f)) << 8)
#endif

/* Scalar YCbCr -> RGBA row conversion in 20-bit fixed point.
 * Writes 'count' pixels, advancing out by 'step' bytes per pixel.
 * Alpha (255) is always written to out[3]; for step==3 that byte is
 * overwritten by the next pixel (the caller's output buffer is
 * allocated one byte larger to allow the final write). */
static void rjpeg__YCbCr_to_RGB_row(uint8_t *out, const uint8_t *y,
      const uint8_t *pcb, const uint8_t *pcr, int count, int step)
{
   int i;
   for (i=0; i < count; ++i)
   {
      int y_fixed = (y[i] << 20) + (1<<19); /* +0.5 for rounding */
      int cr = pcr[i] - 128;
      int cb = pcb[i] - 128;
      int r = y_fixed + cr* float2fixed(1.40200f);
      /* the '& 0xffff0000' truncates the Cb contribution to mimic the
       * reduced-precision green term of the SIMD path, keeping scalar
       * and SIMD outputs identical */
      int g = y_fixed + (cr*-float2fixed(0.71414f)) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
      int b = y_fixed + cb* float2fixed(1.77200f);
      r >>= 20;
      g >>= 20;
      b >>= 20;
      /* saturate to [0,255]: negative overshoot must clamp to 0
       * (previously the unsigned compare sent negatives to 255,
       * unlike upstream stb_image and the saturating SIMD packs) */
      if ((unsigned) r > 255)
         r = (r < 0) ? 0 : 255;
      if ((unsigned) g > 255)
         g = (g < 0) ? 0 : 255;
      if ((unsigned) b > 255)
         b = (b < 0) ? 0 : 255;
      out[0] = (uint8_t)r;
      out[1] = (uint8_t)g;
      out[2] = (uint8_t)b;
      out[3] = 255; /* not used if step==3 */
      out += step;
   }
}
2212
2213 #if defined(__SSE2__) || defined(RJPEG_NEON)
/* Fallback so this block is self-contained; the canonical definition
 * appears earlier in this file and takes precedence via #ifndef. */
#ifndef float2fixed
#define float2fixed(x) (((int) ((x) * 4096.0f + 0.5f)) << 8)
#endif

/* SIMD-accelerated YCbCr -> RGBA row conversion (SSE2/NEON), 8 pixels
 * per iteration, with a scalar tail for the remaining count % 8 pixels.
 * Only step == 4 (RGBA) takes the SIMD path; other steps fall through
 * to the scalar loop.  The SIMD packs saturate to [0,255]; the scalar
 * tail now clamps the same way so both paths agree on overshoot. */
static void rjpeg__YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
      const uint8_t *pcb, const uint8_t *pcr, int count, int step)
{
   int i = 0;

#if defined(__SSE2__)
   /* step == 3 is pretty ugly on the final interleave, and i'm not convinced
    * it's useful in practice (you wouldn't use it for textures, for example).
    * so just accelerate step == 4 case.
    */
   if (step == 4)
   {
      /* this is a fairly straightforward implementation and not super-optimized. */
      __m128i signflip = _mm_set1_epi8(-0x80);
      __m128i cr_const0 = _mm_set1_epi16( (short) ( 1.40200f*4096.0f+0.5f));
      __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
      __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
      __m128i cb_const1 = _mm_set1_epi16( (short) ( 1.77200f*4096.0f+0.5f));
      __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
      __m128i xw = _mm_set1_epi16(255); /* alpha channel */

      for (; i+7 < count; i += 8)
      {
         /* load 8 bytes of each plane */
         __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
         __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
         __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); /* -128 */
         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); /* -128 */

         /* unpack to short (and left-shift cr, cb by 8) */
         __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes);
         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);

         /* color transform; mulhi keeps the top 16 bits of the
          * 12-bit-constant products */
         __m128i yws = _mm_srli_epi16(yw, 4);
         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
         __m128i rws = _mm_add_epi16(cr0, yws);
         __m128i gwt = _mm_add_epi16(cb0, yws);
         __m128i bws = _mm_add_epi16(yws, cb1);
         __m128i gws = _mm_add_epi16(gwt, cr1);

         /* descale */
         __m128i rw = _mm_srai_epi16(rws, 4);
         __m128i bw = _mm_srai_epi16(bws, 4);
         __m128i gw = _mm_srai_epi16(gws, 4);

         /* back to byte (packus saturates to [0,255]), set up for transpose */
         __m128i brb = _mm_packus_epi16(rw, bw);
         __m128i gxb = _mm_packus_epi16(gw, xw);

         /* transpose to interleave channels */
         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
         __m128i o1 = _mm_unpackhi_epi16(t0, t1);

         /* store */
         _mm_storeu_si128((__m128i *) (out + 0), o0);
         _mm_storeu_si128((__m128i *) (out + 16), o1);
         out += 32;
      }
   }
#endif

#ifdef RJPEG_NEON
   /* in this version, step=3 support would be easy to add. but is there demand? */
   if (step == 4)
   {
      /* this is a fairly straightforward implementation and not super-optimized. */
      uint8x8_t signflip = vdup_n_u8(0x80);
      int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f));
      int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
      int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
      int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f));

      for (; i+7 < count; i += 8)
      {
         uint8x8x4_t o;

         /* load 8 bytes of each plane */
         uint8x8_t y_bytes = vld1_u8(y + i);
         uint8x8_t cr_bytes = vld1_u8(pcr + i);
         uint8x8_t cb_bytes = vld1_u8(pcb + i);
         int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
         int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));

         /* expand to s16 */
         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
         int16x8_t crw = vshll_n_s8(cr_biased, 7);
         int16x8_t cbw = vshll_n_s8(cb_biased, 7);

         /* color transform */
         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
         int16x8_t rws = vaddq_s16(yws, cr0);
         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
         int16x8_t bws = vaddq_s16(yws, cb1);

         /* undo scaling, round, convert to byte (vqrshrun saturates) */
         o.val[0] = vqrshrun_n_s16(rws, 4);
         o.val[1] = vqrshrun_n_s16(gws, 4);
         o.val[2] = vqrshrun_n_s16(bws, 4);
         o.val[3] = vdup_n_u8(255);

         /* store, interleaving r/g/b/a */
         vst4_u8(out, o);
         out += 8*4;
      }
   }
#endif

   /* scalar tail: same fixed-point math as rjpeg__YCbCr_to_RGB_row */
   for (; i < count; ++i)
   {
      int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
      int cr = pcr[i] - 128;
      int cb = pcb[i] - 128;
      int r = y_fixed + cr* float2fixed(1.40200f);
      int g = y_fixed + cr*-float2fixed(0.71414f) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
      int b = y_fixed + cb* float2fixed(1.77200f);
      r >>= 20;
      g >>= 20;
      b >>= 20;
      /* saturate to [0,255]: clamp negatives to 0 so the tail matches
       * the saturating SIMD packs above (previously negatives became
       * 255 through the unsigned compare) */
      if ((unsigned) r > 255)
         r = (r < 0) ? 0 : 255;
      if ((unsigned) g > 255)
         g = (g < 0) ? 0 : 255;
      if ((unsigned) b > 255)
         b = (b < 0) ? 0 : 255;
      out[0] = (uint8_t)r;
      out[1] = (uint8_t)g;
      out[2] = (uint8_t)b;
      out[3] = 255;
      out += step;
   }
}
2356 #endif
2357
2358 /* set up the kernels */
/* Install the decode kernels on j: scalar implementations first,
 * then upgraded to SIMD variants when available (SSE2 is gated on a
 * runtime CPU-feature check; NEON is a compile-time decision). */
static void rjpeg__setup_jpeg(rjpeg__jpeg *j)
{
   uint64_t mask = cpu_features_get();

   (void)mask; /* only read in SSE2-capable builds */

   /* scalar defaults */
   j->idct_block_kernel = rjpeg__idct_block;
   j->YCbCr_to_RGB_kernel = rjpeg__YCbCr_to_RGB_row;
   j->resample_row_hv_2_kernel = rjpeg__resample_row_hv_2;

#if defined(__SSE2__)
   if (mask & RETRO_SIMD_SSE2)
   {
      j->idct_block_kernel = rjpeg__idct_simd;
      j->YCbCr_to_RGB_kernel = rjpeg__YCbCr_to_RGB_simd;
      j->resample_row_hv_2_kernel = rjpeg__resample_row_hv_2_simd;
   }
#endif

#ifdef RJPEG_NEON
   /* NEON builds always use the SIMD kernels; no runtime check */
   j->idct_block_kernel = rjpeg__idct_simd;
   j->YCbCr_to_RGB_kernel = rjpeg__YCbCr_to_RGB_simd;
   j->resample_row_hv_2_kernel = rjpeg__resample_row_hv_2_simd;
#endif
}
2384
2385 /* clean up the temporary component buffers */
/* Release the per-component scratch buffers owned by the decoder.
 * free(NULL) is a no-op, so no guards are needed; every freed
 * pointer (and its alias) is reset to NULL, making the cleanup
 * idempotent and preventing double-free on repeated calls. */
static void rjpeg__cleanup_jpeg(rjpeg__jpeg *j)
{
   int i;
   for (i=0; i < j->s->img_n; ++i)
   {
      free(j->img_comp[i].raw_data);
      j->img_comp[i].raw_data  = NULL;
      j->img_comp[i].data      = NULL; /* alias into raw_data */

      free(j->img_comp[i].raw_coeff);
      j->img_comp[i].raw_coeff = NULL; /* was '= 0'; NULL for pointers */
      j->img_comp[i].coeff     = NULL;

      free(j->img_comp[i].linebuf);
      j->img_comp[i].linebuf   = NULL;
   }
}
2412
/* Decode a JPEG (already wired into z->s) to an interleaved 8-bit
 * image.  req_comp selects the number of output channels
 * (0 = keep the source count); on success *out_x/*out_y receive the
 * dimensions and *comp (if non-NULL) the ORIGINAL component count.
 * Returns a malloc'd buffer the caller must free, or NULL on error. */
static uint8_t *rjpeg_load_jpeg_image(rjpeg__jpeg *z,
      unsigned *out_x, unsigned *out_y, int *comp, int req_comp)
{
   int n, decode_n;
   int k;
   unsigned int i,j;
   rjpeg__resample res_comp[4];
   uint8_t *coutput[4] = {0};
   uint8_t *output = NULL;
   z->s->img_n = 0;

   /* load a jpeg image from whichever source, but leave in YCbCr format */
   if (!rjpeg__decode_jpeg_image(z))
      goto error;

   /* determine actual number of components to generate */
   n = req_comp ? req_comp : z->s->img_n;

   /* grey output from a color JPEG only needs the luma plane */
   if (z->s->img_n == 3 && n < 3)
      decode_n = 1;
   else
      decode_n = z->s->img_n;

   /* resample and color-convert */
   for (k=0; k < decode_n; ++k)
   {
      rjpeg__resample *r = &res_comp[k];

      /* allocate line buffer big enough for upsampling off the edges
       * with upsample factor of 4 */
      z->img_comp[k].linebuf = (uint8_t *) malloc(z->s->img_x + 3);
      if (!z->img_comp[k].linebuf)
         goto error;

      /* per-component upsample factors and two-row blend state */
      r->hs = z->img_h_max / z->img_comp[k].h;
      r->vs = z->img_v_max / z->img_comp[k].v;
      r->ystep = r->vs >> 1;
      r->w_lores = (z->s->img_x + r->hs-1) / r->hs;
      r->ypos = 0;
      r->line0 = r->line1 = z->img_comp[k].data;
      r->resample = rjpeg__resample_row_generic;

      /* pick a specialized kernel for the common factors */
      if (r->hs == 1 && r->vs == 1)
         r->resample = rjpeg_resample_row_1;
      else if (r->hs == 1 && r->vs == 2)
         r->resample = rjpeg__resample_row_v_2;
      else if (r->hs == 2 && r->vs == 1)
         r->resample = rjpeg__resample_row_h_2;
      else if (r->hs == 2 && r->vs == 2)
         r->resample = z->resample_row_hv_2_kernel;
   }

   /* last allocation that can fail; the +1 byte lets the converters
    * write a trailing alpha one past the end for 3-channel output */
   output = (uint8_t *) malloc(n * z->s->img_x * z->s->img_y + 1);

   if (!output)
      goto error;

   /* now go ahead and resample */
   for (j=0; j < z->s->img_y; ++j)
   {
      uint8_t *out = output + n * z->s->img_x * j;
      for (k=0; k < decode_n; ++k)
      {
         rjpeg__resample *r = &res_comp[k];
         /* y_bot selects which of the two source rows is 'near' */
         int y_bot = r->ystep >= (r->vs >> 1);

         coutput[k] = r->resample(z->img_comp[k].linebuf,
               y_bot ? r->line1 : r->line0,
               y_bot ? r->line0 : r->line1,
               r->w_lores, r->hs);

         /* advance vertical phase; step to the next source row when
          * a full output group has been emitted */
         if (++r->ystep >= r->vs)
         {
            r->ystep = 0;
            r->line0 = r->line1;
            if (++r->ypos < z->img_comp[k].y)
               r->line1 += z->img_comp[k].w2; /* w2: component row stride */
         }
      }

      if (n >= 3)
      {
         uint8_t *y = coutput[0];
         if (y)
         {
            if (z->s->img_n == 3)
               z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
            else
               /* grey source, color output: replicate luma */
               for (i=0; i < z->s->img_x; ++i)
               {
                  out[0] = out[1] = out[2] = y[i];
                  out[3] = 255; /* not used if n==3 */
                  out += n;
               }
         }
      }
      else
      {
         /* NOTE(review): unlike the n>=3 branch, y is used here
          * without a NULL check -- presumably coutput[0] is always
          * set since decode_n >= 1; confirm against the decoder */
         uint8_t *y = coutput[0];
         if (n == 1)
            for (i=0; i < z->s->img_x; ++i)
               out[i] = y[i];
         else
            /* n == 2: grey + alpha pairs */
            for (i=0; i < z->s->img_x; ++i)
            {
               *out++ = y[i];
               *out++ = 255;
            }
      }
   }

   rjpeg__cleanup_jpeg(z);
   *out_x = z->s->img_x;
   *out_y = z->s->img_y;

   if (comp)
      *comp = z->s->img_n; /* report original components, not output */
   return output;

error:
   rjpeg__cleanup_jpeg(z);
   return NULL;
}
2537
/* Decode the in-memory JPEG previously attached with
 * rjpeg_set_buf_ptr() into a newly-allocated ARGB8888 buffer returned
 * through *buf_data (caller frees).  *width/*height receive the image
 * size.  Returns IMAGE_PROCESS_END on success, IMAGE_PROCESS_ERROR on
 * any failure. */
int rjpeg_process_image(rjpeg_t *rjpeg, void **buf_data,
      size_t size, unsigned *width, unsigned *height)
{
   rjpeg__jpeg j;
   rjpeg__context s;
   int comp;
   uint32_t *img = NULL;
   uint32_t *pixels = NULL;
   unsigned size_tex = 0;

   if (!rjpeg)
      return IMAGE_PROCESS_ERROR;

   /* the context reads the caller's buffer in place; no copy is made */
   s.img_buffer = (uint8_t*)rjpeg->buff_data;
   s.img_buffer_original = (uint8_t*)rjpeg->buff_data;
   s.img_buffer_end = (uint8_t*)rjpeg->buff_data + (int)size;

   j.s = &s;

   rjpeg__setup_jpeg(&j);

   /* request 4 components (RGBA) regardless of source format */
   img = (uint32_t*)rjpeg_load_jpeg_image(&j, width, height, &comp, 4);

   if (!img)
      return IMAGE_PROCESS_ERROR;

   size_tex = (*width) * (*height);
   pixels = (uint32_t*)malloc(size_tex * sizeof(uint32_t));

   if (!pixels)
   {
      free(img);
      return IMAGE_PROCESS_ERROR;
   }

   *buf_data = pixels;

   /* Convert RGBA to ARGB: swap the red and blue channel bytes of
    * each 32-bit texel, keeping alpha and green in place */
   while (size_tex--)
   {
      unsigned int texel = img[size_tex];
      unsigned int A = texel & 0xFF000000;
      unsigned int B = texel & 0x00FF0000;
      unsigned int G = texel & 0x0000FF00;
      unsigned int R = texel & 0x000000FF;
      ((unsigned int*)pixels)[size_tex] = A | (R << 16) | G | (B >> 16);
   }

   free(img);

   return IMAGE_PROCESS_END;
}
2590
/* Point the decoder at caller-owned JPEG data; no copy is made and
 * the buffer must stay valid until rjpeg_process_image() completes.
 * Returns false only when rjpeg is NULL. */
bool rjpeg_set_buf_ptr(rjpeg_t *rjpeg, void *data)
{
   if (rjpeg)
   {
      rjpeg->buff_data = (uint8_t*)data;
      return true;
   }
   return false;
}
2600
/* Release a handle obtained from rjpeg_alloc().
 * free(NULL) is defined as a no-op, so no guard is needed. */
void rjpeg_free(rjpeg_t *rjpeg)
{
   free(rjpeg);
}
2608
rjpeg_alloc(void)2609 rjpeg_t *rjpeg_alloc(void)
2610 {
2611 rjpeg_t *rjpeg = (rjpeg_t*)calloc(1, sizeof(*rjpeg));
2612 if (!rjpeg)
2613 return NULL;
2614 return rjpeg;
2615 }
2616