1 /* Copyright (C) 2010-2020 The RetroArch team
2 *
3 * ---------------------------------------------------------------------------------------
4 * The following license statement only applies to this file (rjpeg.c).
5 * ---------------------------------------------------------------------------------------
6 *
7 * Permission is hereby granted, free of charge,
8 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation the rights to
10 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
11 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
19 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23 /* Modified version of stb_image's JPEG sources. */
24
25 #include <stdint.h>
26 #include <stdarg.h>
27 #include <stddef.h> /* ptrdiff_t on osx */
28 #include <stdlib.h>
29 #include <string.h>
30
31 #include <retro_assert.h>
32 #include <retro_inline.h>
33 #include <boolean.h>
34 #include <formats/image.h>
35 #include <formats/rjpeg.h>
36 #include <features/features_cpu.h>
37
/* Number of output components callers may request (req_comp). */
enum
{
   RJPEG_DEFAULT = 0, /* only used for req_comp */
   RJPEG_GREY,
   RJPEG_GREY_ALPHA,
   RJPEG_RGB,
   RJPEG_RGB_ALPHA
};
46
/* What a scan of the file should do: fully decode, identify the
 * file type only, or parse just the header. */
enum
{
   RJPEG_SCAN_LOAD = 0,
   RJPEG_SCAN_TYPE,
   RJPEG_SCAN_HEADER
};
53
/* Produces one output row of width w from two adjacent input rows
 * (in0/in1), expanding horizontally by hs (upsampling, e.g. for
 * subsampled chroma planes). Returns the row that was written. */
typedef uint8_t *(*rjpeg_resample_row_func)(uint8_t *out, uint8_t *in0, uint8_t *in1,
      int w, int hs);

/* Per-component upsampling state used while emitting scanlines. */
typedef struct
{
   rjpeg_resample_row_func resample;
   uint8_t *line0;      /* current pair of source rows fed to 'resample' */
   uint8_t *line1;
   int hs,vs;           /* expansion factor in each axis */
   int w_lores;         /* horizontal pixels pre-expansion */
   int ystep;           /* how far through vertical expansion we are */
   int ypos;            /* which pre-expansion row we're on */
} rjpeg_resample;
67
/* Public handle: wraps the caller-supplied compressed JPEG buffer. */
struct rjpeg
{
   uint8_t *buff_data;
};
72
#ifdef _MSC_VER
#define RJPEG_HAS_LROTL
#endif

/* 32-bit rotate-left used by the entropy bit reader. */
#ifdef RJPEG_HAS_LROTL
#define RJPEG_LROT(x,y) _lrotl(x,y)
#else
#define RJPEG_LROT(x,y) (((x) << (y)) | ((x) >> (32 - (y))))
#endif

/* x86/x64 detection */
#if defined(__x86_64__) || defined(_M_X64)
#define RJPEG_X64_TARGET
#elif defined(__i386) || defined(_M_IX86)
#define RJPEG_X86_TARGET
#endif

#if defined(__GNUC__) && (defined(RJPEG_X86_TARGET) || defined(RJPEG_X64_TARGET)) && !defined(__SSE2__) && !defined(RJPEG_NO_SIMD)
/* NOTE: not clear do we actually need this for the 64-bit path?
 * gcc doesn't support sse2 intrinsics unless you compile with -msse2,
 * (but compiling with -msse2 allows the compiler to use SSE2 everywhere;
 * this is just broken and gcc are jerks for not fixing it properly
 * http://www.virtualdub.org/blog/pivot/entry.php?id=363 )
 */
#define RJPEG_NO_SIMD
#endif

#if defined(__MINGW32__) && defined(RJPEG_X86_TARGET) && !defined(RJPEG_MINGW_ENABLE_SSE2) && !defined(RJPEG_NO_SIMD)
/* Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid RJPEG_X64_TARGET
 *
 * 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
 * Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
 * As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
 * simultaneously enabling "-mstackrealign".
 *
 * See https://github.com/nothings/stb/issues/81 for more information.
 *
 * So default to no SSE2 on 32-bit MinGW. If you've read this far and added
 * -mstackrealign to your build settings, feel free to #define RJPEG_MINGW_ENABLE_SSE2.
 */
#define RJPEG_NO_SIMD
#endif

#if defined(__SSE2__)
#include <emmintrin.h>

/* 16-byte alignment needed for _mm_load_si128 on IDCT coefficient blocks. */
#ifdef _MSC_VER
#define RJPEG_SIMD_ALIGN(type, name) __declspec(align(16)) type name
#else
#define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
#endif

#endif

/* ARM NEON */
#if defined(RJPEG_NO_SIMD) && defined(RJPEG_NEON)
#undef RJPEG_NEON
#endif

#ifdef RJPEG_NEON
#include <arm_neon.h>
/* assume GCC or Clang on ARM targets */
#define RJPEG_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
#endif

/* Fallback: no special alignment when no SIMD path is compiled in. */
#ifndef RJPEG_SIMD_ALIGN
#define RJPEG_SIMD_ALIGN(type, name) type name
#endif
141
/* Input stream state: a window [img_buffer, img_buffer_end) over the
 * compressed data, plus image dimensions parsed from the header. */
typedef struct
{
   uint8_t *img_buffer;          /* current read position */
   uint8_t *img_buffer_end;      /* one past the last valid byte */
   uint8_t *img_buffer_original; /* start of the buffer, for rewinds */
   int img_n;                    /* components in the file */
   int img_out_n;                /* components we will output */
   int buflen;
   uint32_t img_x;               /* image width in pixels */
   uint32_t img_y;               /* image height in pixels */
   uint8_t buffer_start[128];
} rjpeg_context;
154
rjpeg_get8(rjpeg_context * s)155 static INLINE uint8_t rjpeg_get8(rjpeg_context *s)
156 {
157 if (s->img_buffer < s->img_buffer_end)
158 return *s->img_buffer++;
159
160 return 0;
161 }
162
163 #define RJPEG_AT_EOF(s) ((s)->img_buffer >= (s)->img_buffer_end)
164
165 #define RJPEG_GET16BE(s) ((rjpeg_get8((s)) << 8) + rjpeg_get8((s)))
166
167 /* huffman decoding acceleration */
168 #define FAST_BITS 9 /* larger handles more cases; smaller stomps less cache */
169
/* One Huffman table (DC or AC), plus an acceleration table that
 * resolves any code of length <= FAST_BITS with a single lookup. */
typedef struct
{
   unsigned int maxcode[18];  /* largest code + 1 per length, preshifted to 16 bits */
   int delta[17];             /* old 'firstsymbol' - old 'firstcode' */
   /* weirdly, repacking this into AoS is a 10% speed loss, instead of a win */
   uint16_t code[256];        /* canonical code per symbol index */
   uint8_t fast[1 << FAST_BITS]; /* top-bits -> symbol index; 255 = not accelerated */
   uint8_t values[256];       /* symbol values as given in the DHT segment */
   uint8_t size[257];         /* code length per symbol index, 0-terminated */
} rjpeg_huffman;
180
/* Full decoder state for one JPEG image: input stream, per-component
 * buffers, Huffman/quantization tables, and the entropy bit buffer. */
typedef struct
{
   rjpeg_context *s;
   /* kernels (selected at runtime: generic C or SIMD variants) */
   void (*idct_block_kernel)(uint8_t *out, int out_stride, short data[64]);
   void (*YCbCr_to_RGB_kernel)(uint8_t *out, const uint8_t *y, const uint8_t *pcb,
         const uint8_t *pcr, int count, int step);
   uint8_t *(*resample_row_hv_2_kernel)(uint8_t *out, uint8_t *in_near,
         uint8_t *in_far, int w, int hs);

   /* definition of jpeg image component */
   struct
   {
      uint8_t *data;             /* decoded 8-bit samples for this plane */
      void *raw_data, *raw_coeff; /* unaligned allocations backing data/coeff */
      uint8_t *linebuf;
      short *coeff;              /* progressive only */
      int id;                    /* component id from the SOF segment */
      int h,v;                   /* sampling factors */
      int tq;                    /* quantization table index */
      int hd,ha;                 /* DC/AC huffman table indices */
      int dc_pred;               /* DC predictor (differential coding) */

      int x,y,w2,h2;             /* plane dimensions and row stride */
      int coeff_w;               /* number of 8x8 coefficient blocks */
      int coeff_h;               /* number of 8x8 coefficient blocks */
   } img_comp[4];

   /* sizes for components, interleaved MCUs */
   int img_h_max, img_v_max;
   int img_mcu_x, img_mcu_y;
   int img_mcu_w, img_mcu_h;

   int code_bits;                /* number of valid bits */
   int nomore;                   /* flag if we saw a marker so must stop */
   int progressive;              /* SOF2 (progressive) vs baseline */
   int spec_start;               /* progressive scan parameters (Ss/Se/Ah/Al) */
   int spec_end;
   int succ_high;
   int succ_low;
   int eob_run;                  /* pending end-of-band run (progressive AC) */
   int scan_n, order[4];
   int restart_interval, todo;   /* MCUs remaining before the next RSTn */
   uint32_t code_buffer;         /* jpeg entropy-coded buffer */
   rjpeg_huffman huff_dc[4];     /* unsigned int alignment */
   rjpeg_huffman huff_ac[4];     /* unsigned int alignment */
   int16_t fast_ac[4][1 << FAST_BITS];
   unsigned char marker;         /* marker seen while filling entropy buffer */
   uint8_t dequant[4][64];
} rjpeg_jpeg;
231
/* Convert a float IDCT constant to 12-bit fixed point. */
#define RJPEG_F2F(x) ((int) (((x) * 4096 + 0.5)))
/* Scale an integer up to 12-bit fixed point. Multiply instead of
 * '(x) << 12': left-shifting a negative value is undefined behavior
 * in C, and x can be negative here. Compilers emit the same shift
 * for the multiply whenever it is safe. */
#define RJPEG_FSH(x) ((x) * 4096)

#define RJPEG_MARKER_NONE  0xff
/* if there's a pending marker from the entropy stream, return that
 * otherwise, fetch from the stream and get a marker. if there's no
 * marker, return 0xff, which is never a valid marker value
 */

/* in each scan, we'll have scan_n components, and the order
 * of the components is specified by order[]
 */
#define RJPEG_RESTART(x)     ((x) >= 0xd0 && (x) <= 0xd7)

#define JPEG_MARKER           0xFF
#define JPEG_MARKER_SOI       0xD8
#define JPEG_MARKER_SOS       0xDA
#define JPEG_MARKER_EOI       0xD9
#define JPEG_MARKER_APP1      0xE1
#define JPEG_MARKER_APP2      0xE2

/* use comparisons since in some cases we handle more than one case (e.g. SOF) */
#define RJPEG_SOF(x)             ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)

#define RJPEG_SOF_PROGRESSIVE(x)   ((x) == 0xc2)
#define RJPEG_DIV4(x)              ((uint8_t) ((x) >> 2))
#define RJPEG_DIV16(x)             ((uint8_t) ((x) >> 4))
259
/* Build a canonical Huffman table from the 16 per-length symbol counts
 * of a DHT segment, then fill the FAST_BITS acceleration table.
 * Returns 1 on success, 0 if the code lengths are inconsistent. */
static int rjpeg_build_huffman(rjpeg_huffman *h, int *count)
{
   int len, sym, cur_code;
   int idx = 0;

   /* build size list for each symbol (from JPEG spec) */
   for (len = 0; len < 16; ++len)
   {
      for (sym = 0; sym < count[len]; ++sym)
         h->size[idx++] = (uint8_t) (len + 1);
   }
   h->size[idx] = 0;

   /* compute actual symbols (from jpeg spec) */
   cur_code = 0;
   idx      = 0;

   for (len = 1; len <= 16; ++len)
   {
      /* compute delta to add to code to compute symbol id */
      h->delta[len] = idx - cur_code;

      if (h->size[idx] == len)
      {
         while (h->size[idx] == len)
            h->code[idx++] = (uint16_t) (cur_code++);

         /* Bad code lengths, corrupt JPEG? */
         if (cur_code - 1 >= (1 << len))
            return 0;
      }
      /* compute largest code + 1 for this size, preshifted as needed later */
      h->maxcode[len] = cur_code << (16 - len);
      cur_code      <<= 1;
   }
   h->maxcode[len] = 0xffffffff; /* sentinel so the decode loop always stops */

   /* build non-spec acceleration table; 255 is flag for not-accelerated */
   memset(h->fast, 255, 1 << FAST_BITS);
   for (sym = 0; sym < idx; ++sym)
   {
      int bits = h->size[sym];
      if (bits <= FAST_BITS)
      {
         /* every FAST_BITS-wide prefix starting with this code maps to it */
         int first = h->code[sym] << (FAST_BITS - bits);
         int span  = 1 << (FAST_BITS - bits);
         int t;
         for (t = 0; t < span; ++t)
            h->fast[first + t] = (uint8_t) sym;
      }
   }
   return 1;
}
308
309 /* build a table that decodes both magnitude and value of small ACs in
310 * one go. */
rjpeg_build_fast_ac(int16_t * fast_ac,rjpeg_huffman * h)311 static void rjpeg_build_fast_ac(int16_t *fast_ac, rjpeg_huffman *h)
312 {
313 int i;
314
315 for (i = 0; i < (1 << FAST_BITS); ++i)
316 {
317 uint8_t fast = h->fast[i];
318
319 fast_ac[i] = 0;
320
321 if (fast < 255)
322 {
323 int rs = h->values[fast];
324 int run = (rs >> 4) & 15;
325 int magbits = rs & 15;
326 int len = h->size[fast];
327
328 if (magbits && len + magbits <= FAST_BITS)
329 {
330 /* magnitude code followed by receive_extend code */
331 int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
332 int m = 1 << (magbits - 1);
333 if (k < m)
334 k += (-1 << magbits) + 1;
335
336 /* if the result is small enough, we can fit it in fast_ac table */
337 if (k >= -128 && k <= 127)
338 fast_ac[i] = (int16_t) ((k << 8) + (run << 4) + (len + magbits));
339 }
340 }
341 }
342 }
343
/* Refill the 32-bit entropy bit buffer (MSB-first) until at least
 * 25 bits are valid. A 0xff byte followed by a non-zero byte is a
 * marker: record it and stop feeding bits (0xff 0x00 is byte-stuffing
 * for a literal 0xff). Once a marker is seen, zeros are fed instead. */
static void rjpeg_grow_buffer_unsafe(rjpeg_jpeg *j)
{
   do
   {
      int b = j->nomore ? 0 : rjpeg_get8(j->s);
      if (b == 0xff)
      {
         int c = rjpeg_get8(j->s);

         if (c != 0)
         {
            /* real marker, not a stuffed byte: stop the entropy stream */
            j->marker = (unsigned char) c;
            j->nomore = 1;
            return;
         }
      }
      j->code_buffer |= b << (24 - j->code_bits);
      j->code_bits += 8;
   } while (j->code_bits <= 24);
}
364
/* (1 << n) - 1 for n = 0..16: masks that keep the low n bits. */
static uint32_t rjpeg_bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
367
/* decode a JPEG huffman value from the bitstream.
 * Returns the decoded symbol value, or -1 on a bad/overlong code. */
static INLINE int rjpeg_jpeg_huff_decode(rjpeg_jpeg *j, rjpeg_huffman *h)
{
   unsigned int temp;
   int c,k;

   if (j->code_bits < 16)
      rjpeg_grow_buffer_unsafe(j);

   /* look at the top FAST_BITS and determine what symbol ID it is,
    * if the code is <= FAST_BITS */
   c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
   k = h->fast[c];

   if (k < 255)
   {
      /* fast path: acceleration table resolved the code in one lookup */
      int s = h->size[k];
      if (s > j->code_bits)
         return -1;
      j->code_buffer <<= s;
      j->code_bits -= s;
      return h->values[k];
   }

   /* naive test is to shift the code_buffer down so k bits are
    * valid, then test against maxcode. To speed this up, we've
    * preshifted maxcode left so that it has (16-k) 0s at the
    * end; in other words, regardless of the number of bits, it
    * wants to be compared against something shifted to have 16;
    * that way we don't need to shift inside the loop. */
   temp = j->code_buffer >> 16;
   for (k=FAST_BITS+1 ; ; ++k)
      if (temp < h->maxcode[k])
         break;

   if (k == 17)
   {
      /* error! code not found (only the maxcode[17] sentinel matched) */
      j->code_bits -= 16;
      return -1;
   }

   if (k > j->code_bits)
      return -1;

   /* convert the huffman code to the symbol id */
   c = ((j->code_buffer >> (32 - k)) & rjpeg_bmask[k]) + h->delta[k];
   retro_assert((((j->code_buffer) >> (32 - h->size[c])) & rjpeg_bmask[h->size[c]]) == h->code[c]);

   /* convert the id to a symbol */
   j->code_bits -= k;
   j->code_buffer <<= k;
   return h->values[c];
}
422
/* bias[n] = (-1<<n) + 1; added in rjpeg_extend_receive() to map an
 * n-bit magnitude code to its (possibly negative) coefficient value. */
static int const rjpeg_jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
425
/* combined JPEG 'receive' and JPEG 'extend', since baseline
 * always extends everything it receives.
 * Reads n bits and sign-extends them per the JPEG magnitude coding:
 * a leading 0 bit means the value is negative (biased by rjpeg_jbias). */
static INLINE int rjpeg_extend_receive(rjpeg_jpeg *j, int n)
{
   unsigned int k;
   int sgn;
   if (j->code_bits < n)
      rjpeg_grow_buffer_unsafe(j);

   sgn = (int32_t)j->code_buffer >> 31; /* sign bit is always in MSB */
   k   = RJPEG_LROT(j->code_buffer, n); /* rotate the n bits into the low end */
   retro_assert(n >= 0 && n < (int) (sizeof(rjpeg_bmask)/sizeof(*rjpeg_bmask)));
   j->code_buffer = k & ~rjpeg_bmask[n];
   k &= rjpeg_bmask[n];
   j->code_bits -= n;
   /* sgn is all-ones when the value is negative, so the bias applies only then */
   return k + (rjpeg_jbias[n] & ~sgn);
}
443
/* get some unsigned bits: consume n bits from the bit buffer,
 * MSB-first, and return them as an unsigned value. */
static INLINE int rjpeg_jpeg_get_bits(rjpeg_jpeg *j, int n)
{
   unsigned int k;
   if (j->code_bits < n)
      rjpeg_grow_buffer_unsafe(j);
   k = RJPEG_LROT(j->code_buffer, n); /* rotate the n bits into the low end */
   j->code_buffer = k & ~rjpeg_bmask[n];
   k &= rjpeg_bmask[n];
   j->code_bits -= n;
   return k;
}
456
rjpeg_jpeg_get_bit(rjpeg_jpeg * j)457 static INLINE int rjpeg_jpeg_get_bit(rjpeg_jpeg *j)
458 {
459 unsigned int k;
460 if (j->code_bits < 1)
461 rjpeg_grow_buffer_unsafe(j);
462
463 k = j->code_buffer;
464 j->code_buffer <<= 1;
465 --j->code_bits;
466 return k & 0x80000000;
467 }
468
/* given a value that's at position X in the zigzag stream,
 * where does it appear in the 8x8 matrix coded as row-major? */
static uint8_t rjpeg_jpeg_dezigzag[64+15] =
{
   0,  1,  8, 16,  9,  2,  3, 10,
   17, 24, 32, 25, 18, 11,  4,  5,
   12, 19, 26, 33, 40, 48, 41, 34,
   27, 20, 13,  6,  7, 14, 21, 28,
   35, 42, 49, 56, 57, 50, 43, 36,
   29, 22, 15, 23, 30, 37, 44, 51,
   58, 59, 52, 45, 38, 31, 39, 46,
   53, 60, 61, 54, 47, 55, 62, 63,
   /* let corrupt input sample past end: the decoders can advance the
    * zigzag index up to 15 beyond 63 before their bounds check runs */
   63, 63, 63, 63, 63, 63, 63, 63,
   63, 63, 63, 63, 63, 63, 63
};
485
/* decode one 64-entry block--
 *
 * Baseline (sequential) entropy decode of a full 8x8 coefficient block:
 * differential DC first, then run-length coded ACs written to their
 * de-zigzagged positions, dequantized on the fly.
 * Returns 1 on success, 0 on corrupt data. */
static int rjpeg_jpeg_decode_block(
      rjpeg_jpeg *j, short data[64],
      rjpeg_huffman *hdc,
      rjpeg_huffman *hac,
      int16_t *fac,
      int b,
      uint8_t *dequant)
{
   int dc,k;
   int t;
   int diff = 0;

   if (j->code_bits < 16)
      rjpeg_grow_buffer_unsafe(j);
   t = rjpeg_jpeg_huff_decode(j, hdc);

   /* Bad huffman code. Corrupt JPEG? */
   if (t < 0)
      return 0;

   /* 0 all the ac values now so we can do it 32-bits at a time */
   memset(data,0,64*sizeof(data[0]));

   /* DC is coded as a difference from the previous block's DC */
   if (t)
      diff = rjpeg_extend_receive(j, t);
   dc = j->img_comp[b].dc_pred + diff;
   j->img_comp[b].dc_pred = dc;
   data[0] = (short) (dc * dequant[0]);

   /* decode AC components, see JPEG spec */
   k = 1;
   do
   {
      unsigned int zig;
      int c,r,s;
      if (j->code_bits < 16)
         rjpeg_grow_buffer_unsafe(j);
      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
      r = fac[c];
      if (r)
      {
         /* fast-AC path: run, value and bit length all packed in fac[] */
         k += (r >> 4) & 15; /* run */
         s = r & 15; /* combined length */
         j->code_buffer <<= s;
         j->code_bits -= s;
         /* decode into unzigzag'd location */
         zig = rjpeg_jpeg_dezigzag[k++];
         data[zig] = (short) ((r >> 8) * dequant[zig]);
      }
      else
      {
         int rs = rjpeg_jpeg_huff_decode(j, hac);

         /* Bad huffman code. Corrupt JPEG? */
         if (rs < 0)
            return 0;

         s = rs & 15; /* magnitude bits */
         r = rs >> 4; /* run of zeros before this coefficient */
         if (s == 0)
         {
            if (rs != 0xf0)
               break; /* end block */
            k += 16;   /* ZRL: sixteen zero coefficients */
         }
         else
         {
            k += r;
            /* decode into unzigzag'd location */
            zig = rjpeg_jpeg_dezigzag[k++];
            data[zig] = (short) (rjpeg_extend_receive(j,s) * dequant[zig]);
         }
      }
   } while (k < 64);
   return 1;
}
564
/* Decode the DC coefficient of one 8x8 block in a progressive scan.
 * In the first DC scan (succ_high == 0) the coefficient is entropy-coded;
 * in refinement scans a single bit sharpens the existing value.
 * Returns 1 on success, 0 on corrupt data. */
static int rjpeg_jpeg_decode_block_prog_dc(
      rjpeg_jpeg *j,
      short data[64],
      rjpeg_huffman *hdc,
      int b)
{
   /* Can't merge DC and AC. Corrupt JPEG? */
   if (j->spec_end != 0)
      return 0;

   if (j->code_bits < 16)
      rjpeg_grow_buffer_unsafe(j);

   if (j->succ_high == 0)
   {
      int t;
      int dc;
      int diff = 0;

      /* first scan for DC coefficient, must be first */
      memset(data,0,64*sizeof(data[0])); /* 0 all the ac values now */
      t = rjpeg_jpeg_huff_decode(j, hdc);

      /* Bad huffman code (t < 0) or an impossible DC magnitude
       * category (t > 15). Corrupt JPEG? Without this check, a bad
       * 't' would reach rjpeg_extend_receive() and index
       * rjpeg_bmask[]/rjpeg_jbias[] out of bounds. */
      if (t < 0 || t > 15)
         return 0;

      if (t)
         diff = rjpeg_extend_receive(j, t);

      dc = j->img_comp[b].dc_pred + diff;
      j->img_comp[b].dc_pred = dc;
      data[0] = (short) (dc << j->succ_low);
   }
   else
   {
      /* refinement scan for DC coefficient */
      if (rjpeg_jpeg_get_bit(j))
         data[0] += (short) (1 << j->succ_low);
   }
   return 1;
}
602
/* Decode the AC coefficients of one 8x8 block in a progressive scan,
 * covering spectral positions spec_start..spec_end. First scans
 * (succ_high == 0) entropy-code the coefficients; refinement scans add
 * one bit of precision to coefficients already seen and place newly
 * visible ones. Returns 1 on success, 0 on corrupt data. */
static int rjpeg_jpeg_decode_block_prog_ac(
      rjpeg_jpeg *j,
      short data[64],
      rjpeg_huffman *hac,
      int16_t *fac)
{
   int k;

   /* Can't merge DC and AC. Corrupt JPEG? */
   if (j->spec_start == 0)
      return 0;

   if (j->succ_high == 0)
   {
      int shift = j->succ_low;

      /* a pending end-of-band run covers this whole block */
      if (j->eob_run)
      {
         --j->eob_run;
         return 1;
      }

      k = j->spec_start;
      do
      {
         unsigned int zig;
         int c,r,s;
         if (j->code_bits < 16)
            rjpeg_grow_buffer_unsafe(j);
         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
         r = fac[c];
         if (r)
         {
            /* fast-AC path */
            k += (r >> 4) & 15; /* run */
            s = r & 15; /* combined length */
            j->code_buffer <<= s;
            j->code_bits -= s;
            zig = rjpeg_jpeg_dezigzag[k++];
            data[zig] = (short) ((r >> 8) << shift);
         }
         else
         {
            int rs = rjpeg_jpeg_huff_decode(j, hac);

            /* Bad huffman code. Corrupt JPEG? */
            if (rs < 0)
               return 0;

            s = rs & 15; /* magnitude bits */
            r = rs >> 4; /* run of zeros, or EOB-run exponent when s == 0 */
            if (s == 0)
            {
               if (r < 15)
               {
                  /* EOBn: this and the next (2^r + extra - 1) blocks
                   * have no more coefficients in this band */
                  j->eob_run = (1 << r);
                  if (r)
                     j->eob_run += rjpeg_jpeg_get_bits(j, r);
                  --j->eob_run;
                  break;
               }
               k += 16; /* ZRL: sixteen zero coefficients */
            }
            else
            {
               k += r;
               zig = rjpeg_jpeg_dezigzag[k++];
               data[zig] = (short) (rjpeg_extend_receive(j,s) << shift);
            }
         }
      } while (k <= j->spec_end);
   }
   else
   {
      /* refinement scan for these AC coefficients */

      short bit = (short) (1 << j->succ_low);

      if (j->eob_run)
      {
         /* inside an EOB run: only refine coefficients already non-zero */
         --j->eob_run;
         for (k = j->spec_start; k <= j->spec_end; ++k)
         {
            short *p = &data[rjpeg_jpeg_dezigzag[k]];
            if (*p != 0)
               if (rjpeg_jpeg_get_bit(j))
                  if ((*p & bit) == 0)
                  {
                     /* move the magnitude away from zero by one bit */
                     if (*p > 0)
                        *p += bit;
                     else
                        *p -= bit;
                  }
         }
      }
      else
      {
         k = j->spec_start;
         do
         {
            int r,s;
            int rs = rjpeg_jpeg_huff_decode(j, hac);

            /* Bad huffman code. Corrupt JPEG? */
            if (rs < 0)
               return 0;

            s = rs & 15;
            r = rs >> 4;
            if (s == 0)
            {
               if (r < 15)
               {
                  /* EOBn: refine the rest of the band, then start the run */
                  j->eob_run = (1 << r) - 1;
                  if (r)
                     j->eob_run += rjpeg_jpeg_get_bits(j, r);
                  r = 64; /* force end of block */
               }
               else
               {
                  /* r=15 s=0 should write 16 0s, so we just do
                   * a run of 15 0s and then write s (which is 0),
                   * so we don't have to do anything special here */
               }
            }
            else
            {
               /* Bad huffman code. Corrupt JPEG? */
               if (s != 1)
                  return 0;

               /* sign bit */
               if (rjpeg_jpeg_get_bit(j))
                  s = bit;
               else
                  s = -bit;
            }

            /* advance by r, refining already-non-zero coefficients along
             * the way; a zero coefficient consumes one unit of the run
             * and receives the new value when the run is exhausted */
            while (k <= j->spec_end)
            {
               short *p = &data[rjpeg_jpeg_dezigzag[k++]];
               if (*p != 0)
               {
                  if (rjpeg_jpeg_get_bit(j))
                     if ((*p & bit) == 0)
                     {
                        if (*p > 0)
                           *p += bit;
                        else
                           *p -= bit;
                     }
               }
               else
               {
                  if (r == 0)
                  {
                     *p = (short) s;
                     break;
                  }
                  --r;
               }
            }
         } while (k <= j->spec_end);
      }
   }
   return 1;
}
771
/* take a -128..127 value and rjpeg_clamp it and convert to 0..255 */
static INLINE uint8_t rjpeg_clamp(int x)
{
   /* trick to use a single test to catch both out-of-range cases */
   if ((unsigned int) x > 255)
   {
      /* negative values must clamp to 0, not 255: returning 255 here
       * rendered IDCT undershoot as white instead of black */
      if (x < 0)
         return 0;
      return 255;
   }
   return (uint8_t) x;
}
780
/* derived from jidctint -- DCT_ISLOW */
/* One 8-point 1D inverse DCT stage in 12-bit fixed point.
 * Declares its temporaries in the enclosing scope and leaves the even
 * part in x0..x3 and the odd part in t0..t3; the caller combines them
 * as (x +/- t) with a rounding bias and shift. */
#define RJPEG_IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
   int t0,t1,p4,p5,x0,x1,x2,x3; \
   int p2 = s2;                            \
   int p3 = s6;                            \
   int p1 = (p2+p3) * RJPEG_F2F(0.5411961f);       \
   int t2 = p1 + p3 * RJPEG_F2F(-1.847759065f);\
   int t3 = p1 + p2 * RJPEG_F2F( 0.765366865f);\
   p2 = s0;                                \
   p3 = s4;                                \
   t0 = RJPEG_FSH(p2+p3);                      \
   t1 = RJPEG_FSH(p2-p3);                      \
   x0 = t0+t3;                             \
   x3 = t0-t3;                             \
   x1 = t1+t2;                             \
   x2 = t1-t2;                             \
   t0 = s7;                                \
   t1 = s5;                                \
   t2 = s3;                                \
   t3 = s1;                                \
   p3 = t0+t2;                             \
   p4 = t1+t3;                             \
   p1 = t0+t3;                             \
   p2 = t1+t2;                             \
   p5 = (p3+p4) * RJPEG_F2F( 1.175875602f);      \
   t0 = t0 * RJPEG_F2F( 0.298631336f);           \
   t1 = t1 * RJPEG_F2F( 2.053119869f);           \
   t2 = t2 * RJPEG_F2F( 3.072711026f);           \
   t3 = t3 * RJPEG_F2F( 1.501321110f);           \
   p1 = p5 + p1 * RJPEG_F2F(-0.899976223f);      \
   p2 = p5 + p2 * RJPEG_F2F(-2.562915447f);      \
   p3 = p3 * RJPEG_F2F(-1.961570560f);           \
   p4 = p4 * RJPEG_F2F(-0.390180644f);           \
   t3 += p1+p4;                            \
   t2 += p2+p3;                            \
   t1 += p2+p4;                            \
   t0 += p1+p3
818
rjpeg_idct_block(uint8_t * out,int out_stride,short data[64])819 static void rjpeg_idct_block(uint8_t *out, int out_stride, short data[64])
820 {
821 int i,val[64],*v=val;
822 uint8_t *o = NULL;
823 int16_t *d = data;
824
825 /* columns */
826 for (i = 0; i < 8; ++i,++d, ++v)
827 {
828 /* if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing */
829 if ( d[ 8] == 0
830 && d[16] == 0
831 && d[24] == 0
832 && d[32] == 0
833 && d[40] == 0
834 && d[48] == 0
835 && d[56] == 0)
836 {
837 /* no shortcut 0 seconds
838 * (1|2|3|4|5|6|7)==0 0 seconds
839 * all separate -0.047 seconds
840 * 1 && 2|3 && 4|5 && 6|7: -0.047 seconds */
841 int dcterm = d[0] << 2;
842 v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
843 }
844 else
845 {
846 RJPEG_IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56]);
847
848 /* constants scaled things up by 1<<12; let's bring them back
849 * down, but keep 2 extra bits of precision */
850 x0 += 512;
851 x1 += 512;
852 x2 += 512;
853 x3 += 512;
854
855 v[ 0] = (x0+t3) >> 10;
856 v[56] = (x0-t3) >> 10;
857 v[ 8] = (x1+t2) >> 10;
858 v[48] = (x1-t2) >> 10;
859 v[16] = (x2+t1) >> 10;
860 v[40] = (x2-t1) >> 10;
861 v[24] = (x3+t0) >> 10;
862 v[32] = (x3-t0) >> 10;
863 }
864 }
865
866 for (i = 0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride)
867 {
868 /* no fast case since the first 1D IDCT spread components out */
869 RJPEG_IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7]);
870
871 /* constants scaled things up by 1<<12, plus we had 1<<2 from first
872 * loop, plus horizontal and vertical each scale by sqrt(8) so together
873 * we've got an extra 1<<3, so 1<<17 total we need to remove.
874 * so we want to round that, which means adding 0.5 * 1<<17,
875 * aka 65536. Also, we'll end up with -128 to 127 that we want
876 * to encode as 0..255 by adding 128, so we'll add that before the shift
877 */
878 x0 += 65536 + (128<<17);
879 x1 += 65536 + (128<<17);
880 x2 += 65536 + (128<<17);
881 x3 += 65536 + (128<<17);
882
883 /* Tried computing the shifts into temps, or'ing the temps to see
884 * if any were out of range, but that was slower */
885 o[0] = rjpeg_clamp((x0+t3) >> 17);
886 o[7] = rjpeg_clamp((x0-t3) >> 17);
887 o[1] = rjpeg_clamp((x1+t2) >> 17);
888 o[6] = rjpeg_clamp((x1-t2) >> 17);
889 o[2] = rjpeg_clamp((x2+t1) >> 17);
890 o[5] = rjpeg_clamp((x2-t1) >> 17);
891 o[3] = rjpeg_clamp((x3+t0) >> 17);
892 o[4] = rjpeg_clamp((x3-t0) >> 17);
893 }
894 }
895
#if defined(__SSE2__)
/* sse2 integer IDCT. not the fastest possible implementation but it
 * produces bit-identical results to the generic C version so it's
 * fully "transparent".
 */
static void rjpeg_idct_simd(uint8_t *out, int out_stride, short data[64])
{
   /* This is constructed to match our regular (generic) integer IDCT exactly. */
   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
   __m128i tmp;

   /* dot product constant: even elems=x, odd elems=y */
   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))

   /* out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
    * out(1) = c1[even]*x + c1[odd]*y
    */
   #define dct_rot(out0,out1, x,y,c0,c1) \
      __m128i c0##lo   = _mm_unpacklo_epi16((x),(y)); \
      __m128i c0##hi   = _mm_unpackhi_epi16((x),(y)); \
      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)

   /* out = in << 12  (in 16-bit, out 32-bit) */
   #define dct_widen(out, in) \
      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)

   /* wide add */
   #define dct_wadd(out, a, b) \
      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)

   /* wide sub */
   #define dct_wsub(out, a, b) \
      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)

   /* butterfly a/b, add bias, then shift by "s" and pack */
   #define dct_bfly32o(out0, out1, a,b,bias,s) \
      { \
         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
         dct_wadd(sum, abiased, b); \
         dct_wsub(dif, abiased, b); \
         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
      }

   /* 8-bit interleave step (for transposes) */
   #define dct_interleave8(a, b) \
      tmp = a; \
      a = _mm_unpacklo_epi8(a, b); \
      b = _mm_unpackhi_epi8(tmp, b)

   /* 16-bit interleave step (for transposes) */
   #define dct_interleave16(a, b) \
      tmp = a; \
      a = _mm_unpacklo_epi16(a, b); \
      b = _mm_unpackhi_epi16(tmp, b)

   /* one full 1D IDCT pass over all eight rows (same math as RJPEG_IDCT_1D) */
   #define dct_pass(bias,shift) \
      { \
         /* even part */ \
         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
         __m128i sum04 = _mm_add_epi16(row0, row4); \
         __m128i dif04 = _mm_sub_epi16(row0, row4); \
         dct_widen(t0e, sum04); \
         dct_widen(t1e, dif04); \
         dct_wadd(x0, t0e, t3e); \
         dct_wsub(x3, t0e, t3e); \
         dct_wadd(x1, t1e, t2e); \
         dct_wsub(x2, t1e, t2e); \
         /* odd part */ \
         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
         __m128i sum17 = _mm_add_epi16(row1, row7); \
         __m128i sum35 = _mm_add_epi16(row3, row5); \
         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
         dct_wadd(x4, y0o, y4o); \
         dct_wadd(x5, y1o, y5o); \
         dct_wadd(x6, y2o, y5o); \
         dct_wadd(x7, y3o, y4o); \
         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
      }

   __m128i rot0_0 = dct_const(RJPEG_F2F(0.5411961f), RJPEG_F2F(0.5411961f) + RJPEG_F2F(-1.847759065f));
   __m128i rot0_1 = dct_const(RJPEG_F2F(0.5411961f) + RJPEG_F2F( 0.765366865f), RJPEG_F2F(0.5411961f));
   __m128i rot1_0 = dct_const(RJPEG_F2F(1.175875602f) + RJPEG_F2F(-0.899976223f), RJPEG_F2F(1.175875602f));
   __m128i rot1_1 = dct_const(RJPEG_F2F(1.175875602f), RJPEG_F2F(1.175875602f) + RJPEG_F2F(-2.562915447f));
   __m128i rot2_0 = dct_const(RJPEG_F2F(-1.961570560f) + RJPEG_F2F( 0.298631336f), RJPEG_F2F(-1.961570560f));
   __m128i rot2_1 = dct_const(RJPEG_F2F(-1.961570560f), RJPEG_F2F(-1.961570560f) + RJPEG_F2F( 3.072711026f));
   __m128i rot3_0 = dct_const(RJPEG_F2F(-0.390180644f) + RJPEG_F2F( 2.053119869f), RJPEG_F2F(-0.390180644f));
   __m128i rot3_1 = dct_const(RJPEG_F2F(-0.390180644f), RJPEG_F2F(-0.390180644f) + RJPEG_F2F( 1.501321110f));

   /* rounding biases in column/row passes, see rjpeg_idct_block for explanation. */
   __m128i bias_0 = _mm_set1_epi32(512);
   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));

   /* load */
   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
   row7 = _mm_load_si128((const __m128i *) (data + 7*8));

   /* column pass */
   dct_pass(bias_0, 10);

   {
      /* 16bit 8x8 transpose pass 1 */
      dct_interleave16(row0, row4);
      dct_interleave16(row1, row5);
      dct_interleave16(row2, row6);
      dct_interleave16(row3, row7);

      /* transpose pass 2 */
      dct_interleave16(row0, row2);
      dct_interleave16(row1, row3);
      dct_interleave16(row4, row6);
      dct_interleave16(row5, row7);

      /* transpose pass 3 */
      dct_interleave16(row0, row1);
      dct_interleave16(row2, row3);
      dct_interleave16(row4, row5);
      dct_interleave16(row6, row7);
   }

   /* row pass */
   dct_pass(bias_1, 17);

   {
      /* pack to unsigned 8-bit with saturation (this is the clamp) */
      __m128i p0 = _mm_packus_epi16(row0, row1); /* a0a1a2a3...a7b0b1b2b3...b7 */
      __m128i p1 = _mm_packus_epi16(row2, row3);
      __m128i p2 = _mm_packus_epi16(row4, row5);
      __m128i p3 = _mm_packus_epi16(row6, row7);

      /* 8bit 8x8 transpose pass 1 */
      dct_interleave8(p0, p2); /* a0e0a1e1... */
      dct_interleave8(p1, p3); /* c0g0c1g1... */

      /* transpose pass 2 */
      dct_interleave8(p0, p1); /* a0c0e0g0... */
      dct_interleave8(p2, p3); /* b0d0f0h0... */

      /* transpose pass 3 */
      dct_interleave8(p0, p2); /* a0b0c0d0... */
      dct_interleave8(p1, p3); /* a4b4c4d4... */

      /* store */
      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
   }

#undef dct_const
#undef dct_rot
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_interleave8
#undef dct_interleave16
#undef dct_pass
}

#endif
1078
1079 #ifdef RJPEG_NEON
1080
1081 /* NEON integer IDCT. should produce bit-identical
1082 * results to the generic C version. */
/* NEON integer IDCT for one 8x8 block; should produce bit-identical
 * results to the generic C version.
 *
 * out        : top-left byte of the destination 8x8 pixel region
 * out_stride : bytes between consecutive output scan lines
 * data       : 64 dequantized DCT coefficients in natural (row-major) order */
static void rjpeg_idct_simd(uint8_t *out, int out_stride, short data[64])
{
   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;

   /* rotation constants for the butterfly stages, converted to fixed
    * point by RJPEG_F2F and broadcast to 4 lanes for vmull/vmlal */
   int16x4_t rot0_0 = vdup_n_s16(RJPEG_F2F(0.5411961f));
   int16x4_t rot0_1 = vdup_n_s16(RJPEG_F2F(-1.847759065f));
   int16x4_t rot0_2 = vdup_n_s16(RJPEG_F2F( 0.765366865f));
   int16x4_t rot1_0 = vdup_n_s16(RJPEG_F2F( 1.175875602f));
   int16x4_t rot1_1 = vdup_n_s16(RJPEG_F2F(-0.899976223f));
   int16x4_t rot1_2 = vdup_n_s16(RJPEG_F2F(-2.562915447f));
   int16x4_t rot2_0 = vdup_n_s16(RJPEG_F2F(-1.961570560f));
   int16x4_t rot2_1 = vdup_n_s16(RJPEG_F2F(-0.390180644f));
   int16x4_t rot3_0 = vdup_n_s16(RJPEG_F2F( 0.298631336f));
   int16x4_t rot3_1 = vdup_n_s16(RJPEG_F2F( 2.053119869f));
   int16x4_t rot3_2 = vdup_n_s16(RJPEG_F2F( 3.072711026f));
   int16x4_t rot3_3 = vdup_n_s16(RJPEG_F2F( 1.501321110f));

/* widening multiply: 16-bit lanes * coeff -> 32-bit low/high halves */
#define dct_long_mul(out, inq, coeff) \
   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)

/* widening multiply-accumulate onto the wide pair "acc" */
#define dct_long_mac(out, acc, inq, coeff) \
   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)

/* widen 16-bit lanes to 32 bits, pre-scaled by 2^12 so they line up
 * with the fixed-point products from dct_long_mul/dct_long_mac */
#define dct_widen(out, inq) \
   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)

/* wide add */
#define dct_wadd(out, a, b) \
   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)

/* wide sub */
#define dct_wsub(out, a, b) \
   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)

/* butterfly a/b, then shift using "shiftop" by "s" and pack */
#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
   { \
      dct_wadd(sum, a, b); \
      dct_wsub(dif, a, b); \
      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
   }

/* one full 1-D IDCT over the eight rows held in row0..row7; "shiftop"
 * and "shift" control the descale/rounding applied when narrowing */
#define dct_pass(shiftop, shift) \
   { \
      /* even part */ \
      int16x8_t sum26 = vaddq_s16(row2, row6); \
      dct_long_mul(p1e, sum26, rot0_0); \
      dct_long_mac(t2e, p1e, row6, rot0_1); \
      dct_long_mac(t3e, p1e, row2, rot0_2); \
      int16x8_t sum04 = vaddq_s16(row0, row4); \
      int16x8_t dif04 = vsubq_s16(row0, row4); \
      dct_widen(t0e, sum04); \
      dct_widen(t1e, dif04); \
      dct_wadd(x0, t0e, t3e); \
      dct_wsub(x3, t0e, t3e); \
      dct_wadd(x1, t1e, t2e); \
      dct_wsub(x2, t1e, t2e); \
      /* odd part */ \
      int16x8_t sum15 = vaddq_s16(row1, row5); \
      int16x8_t sum17 = vaddq_s16(row1, row7); \
      int16x8_t sum35 = vaddq_s16(row3, row5); \
      int16x8_t sum37 = vaddq_s16(row3, row7); \
      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
      dct_long_mul(p5o, sumodd, rot1_0); \
      dct_long_mac(p1o, p5o, sum17, rot1_1); \
      dct_long_mac(p2o, p5o, sum35, rot1_2); \
      dct_long_mul(p3o, sum37, rot2_0); \
      dct_long_mul(p4o, sum15, rot2_1); \
      dct_wadd(sump13o, p1o, p3o); \
      dct_wadd(sump24o, p2o, p4o); \
      dct_wadd(sump23o, p2o, p3o); \
      dct_wadd(sump14o, p1o, p4o); \
      dct_long_mac(x4, sump13o, row7, rot3_0); \
      dct_long_mac(x5, sump24o, row5, rot3_1); \
      dct_long_mac(x6, sump23o, row3, rot3_2); \
      dct_long_mac(x7, sump14o, row1, rot3_3); \
      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
   }

   /* load */
   row0 = vld1q_s16(data + 0*8);
   row1 = vld1q_s16(data + 1*8);
   row2 = vld1q_s16(data + 2*8);
   row3 = vld1q_s16(data + 3*8);
   row4 = vld1q_s16(data + 4*8);
   row5 = vld1q_s16(data + 5*8);
   row6 = vld1q_s16(data + 6*8);
   row7 = vld1q_s16(data + 7*8);

   /* add DC bias (only to lane 0, i.e. the DC coefficient) */
   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));

   /* column pass */
   dct_pass(vrshrn_n_s32, 10);

   /* 16bit 8x8 transpose */
   {
/* these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
 * whether compilers actually get this is another story, sadly. */
#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }

      /* pass 1 */
      dct_trn16(row0, row1); /* a0b0a2b2a4b4a6b6 */
      dct_trn16(row2, row3);
      dct_trn16(row4, row5);
      dct_trn16(row6, row7);

      /* pass 2 */
      dct_trn32(row0, row2); /* a0b0c0d0a4b4c4d4 */
      dct_trn32(row1, row3);
      dct_trn32(row4, row6);
      dct_trn32(row5, row7);

      /* pass 3 */
      dct_trn64(row0, row4); /* a0b0c0d0e0f0g0h0 */
      dct_trn64(row1, row5);
      dct_trn64(row2, row6);
      dct_trn64(row3, row7);

#undef dct_trn16
#undef dct_trn32
#undef dct_trn64
   }

   /* row pass
    * vrshrn_n_s32 only supports shifts up to 16, we need
    * 17. so do a non-rounding shift of 16 first then follow
    * up with a rounding shift by 1. */
   dct_pass(vshrn_n_s32, 16);

   {
      /* pack and round (the deferred rounding shift by 1 happens here) */
      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);

/* again, these can translate into one instruction, but often don't. */
#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }

      /* sadly can't use interleaved stores here since we only write
       * 8 bytes to each scan line! */

      /* 8x8 8-bit transpose pass 1 */
      dct_trn8_8(p0, p1);
      dct_trn8_8(p2, p3);
      dct_trn8_8(p4, p5);
      dct_trn8_8(p6, p7);

      /* pass 2 */
      dct_trn8_16(p0, p2);
      dct_trn8_16(p1, p3);
      dct_trn8_16(p4, p6);
      dct_trn8_16(p5, p7);

      /* pass 3 */
      dct_trn8_32(p0, p4);
      dct_trn8_32(p1, p5);
      dct_trn8_32(p2, p6);
      dct_trn8_32(p3, p7);

      /* store one 8-byte scan line per row */
      vst1_u8(out, p0);
      out += out_stride;
      vst1_u8(out, p1);
      out += out_stride;
      vst1_u8(out, p2);
      out += out_stride;
      vst1_u8(out, p3);
      out += out_stride;
      vst1_u8(out, p4);
      out += out_stride;
      vst1_u8(out, p5);
      out += out_stride;
      vst1_u8(out, p6);
      out += out_stride;
      vst1_u8(out, p7);

#undef dct_trn8_8
#undef dct_trn8_16
#undef dct_trn8_32
   }

#undef dct_long_mul
#undef dct_long_mac
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_pass
}
1291
1292 #endif /* RJPEG_NEON */
1293
/* Return the next JPEG marker code, consuming any 0xFF fill bytes in
 * front of it. If a marker was previously pushed back into j->marker,
 * hand that back (once) instead of reading the stream. Returns
 * RJPEG_MARKER_NONE when the next byte does not start a marker. */
static uint8_t rjpeg_get_marker(rjpeg_jpeg *j)
{
   uint8_t m = j->marker;

   if (m != RJPEG_MARKER_NONE)
   {
      /* consume the cached marker exactly once */
      j->marker = RJPEG_MARKER_NONE;
      return m;
   }

   m = rjpeg_get8(j->s);
   if (m != 0xff)
      return RJPEG_MARKER_NONE;

   /* skip any run of 0xFF pad bytes preceding the marker code */
   do
   {
      m = rjpeg_get8(j->s);
   } while (m == 0xff);

   return m;
}
1312
1313 /* after a restart interval, rjpeg_jpeg_reset the entropy decoder and
1314 * the dc prediction
1315 */
/* After a restart interval, reset the entropy decoder and the DC
 * predictions so the next MCU decodes independently. */
static void rjpeg_jpeg_reset(rjpeg_jpeg *j)
{
   int c;

   /* flush the bit reader */
   j->code_bits   = 0;
   j->code_buffer = 0;
   j->nomore      = 0;
   j->eob_run     = 0;
   j->marker      = RJPEG_MARKER_NONE;

   /* DC prediction restarts from zero for every component */
   for (c = 0; c < 3; c++)
      j->img_comp[c].dc_pred = 0;

   /* when no restart interval is defined, use an effectively unbounded
    * countdown: no more than 1<<31 MCUs is plenty safe, since we don't
    * even allow 1<<30 pixels */
   j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
}
1331
/* Decode one scan's worth of entropy-coded data into the component
 * planes. Baseline scans Huffman-decode each block and IDCT it straight
 * into the pixel plane; progressive scans only accumulate coefficients
 * (the IDCT happens later in rjpeg_jpeg_finish). Handles both
 * non-interleaved (scan_n == 1) and interleaved MCU layouts, resetting
 * the decoder at restart markers. Returns 1 on success or clean
 * termination at a non-restart marker, 0 on corrupt data. */
static int rjpeg_parse_entropy_coded_data(rjpeg_jpeg *z)
{
   rjpeg_jpeg_reset(z);

   if (z->scan_n == 1)
   {
      int i, j;
      int n = z->order[0];
      /* block counts, rounded up to whole 8x8 blocks */
      int w = (z->img_comp[n].x+7) >> 3;
      int h = (z->img_comp[n].y+7) >> 3;

      /* non-interleaved data, we just need to process one block at a time,
       * in trivial scanline order
       * number of blocks to do just depends on how many actual "pixels" this
       * component has, independent of interleaved MCU blocking and such */

      if (z->progressive)
      {
         for (j = 0; j < h; ++j)
         {
            for (i = 0; i < w; ++i)
            {
               /* coefficient block for this position in the plane */
               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);

               if (z->spec_start == 0)
               {
                  /* DC-only scan (first scan for this band) */
                  if (!rjpeg_jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
                     return 0;
               }
               else
               {
                  /* AC refinement/first scan */
                  int ha = z->img_comp[n].ha;
                  if (!rjpeg_jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
                     return 0;
               }

               /* every data block is an MCU, so countdown the restart interval */
               if (--z->todo <= 0)
               {
                  if (z->code_bits < 24)
                     rjpeg_grow_buffer_unsafe(z);

                  if (!RJPEG_RESTART(z->marker))
                     return 1;
                  rjpeg_jpeg_reset(z);
               }
            }
         }
      }
      else
      {
         RJPEG_SIMD_ALIGN(short, data[64]);

         for (j = 0; j < h; ++j)
         {
            for (i = 0; i < w; ++i)
            {
               int ha = z->img_comp[n].ha;
               if (!rjpeg_jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd,
                        z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq]))
                  return 0;

               /* baseline: IDCT directly into the pixel plane */
               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8,
                     z->img_comp[n].w2, data);

               /* every data block is an MCU, so countdown the restart interval */
               if (--z->todo <= 0)
               {
                  if (z->code_bits < 24)
                     rjpeg_grow_buffer_unsafe(z);

                  /* if it's NOT a restart, then just bail,
                   * so we get corrupt data rather than no data */
                  if (!RJPEG_RESTART(z->marker))
                     return 1;
                  rjpeg_jpeg_reset(z);
               }
            }
         }
      }
   }
   else
   {
      /* interleaved */
      int i,j,k,x,y;

      if (z->progressive)
      {
         for (j = 0; j < z->img_mcu_y; ++j)
         {
            for (i = 0; i < z->img_mcu_x; ++i)
            {
               /* scan an interleaved MCU... process scan_n components in order */
               for (k = 0; k < z->scan_n; ++k)
               {
                  int n = z->order[k];
                  /* scan out an MCU's worth of this component; that's just determined
                   * by the basic H and V specified for the component */
                  for (y = 0; y < z->img_comp[n].v; ++y)
                  {
                     for (x = 0; x < z->img_comp[n].h; ++x)
                     {
                        int x2 = (i*z->img_comp[n].h + x);
                        int y2 = (j*z->img_comp[n].v + y);
                        short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
                        /* interleaved progressive scans carry DC only */
                        if (!rjpeg_jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
                           return 0;
                     }
                  }
               }

               /* after all interleaved components, that's an interleaved MCU,
                * so now count down the restart interval */
               if (--z->todo <= 0)
               {
                  if (z->code_bits < 24)
                     rjpeg_grow_buffer_unsafe(z);
                  if (!RJPEG_RESTART(z->marker))
                     return 1;
                  rjpeg_jpeg_reset(z);
               }
            }
         }
      }
      else
      {
         RJPEG_SIMD_ALIGN(short, data[64]);

         for (j = 0; j < z->img_mcu_y; ++j)
         {
            for (i = 0; i < z->img_mcu_x; ++i)
            {
               /* scan an interleaved MCU... process scan_n components in order */
               for (k = 0; k < z->scan_n; ++k)
               {
                  int n = z->order[k];
                  /* scan out an MCU's worth of this component; that's just determined
                   * by the basic H and V specified for the component */
                  for (y = 0; y < z->img_comp[n].v; ++y)
                  {
                     for (x = 0; x < z->img_comp[n].h; ++x)
                     {
                        /* pixel-plane offsets for this 8x8 block */
                        int x2 = (i*z->img_comp[n].h + x)*8;
                        int y2 = (j*z->img_comp[n].v + y)*8;
                        int ha = z->img_comp[n].ha;

                        if (!rjpeg_jpeg_decode_block(z, data,
                                 z->huff_dc+z->img_comp[n].hd,
                                 z->huff_ac+ha, z->fast_ac[ha],
                                 n, z->dequant[z->img_comp[n].tq]))
                           return 0;

                        z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2,
                              z->img_comp[n].w2, data);
                     }
                  }
               }

               /* after all interleaved components, that's an interleaved MCU,
                * so now count down the restart interval */
               if (--z->todo <= 0)
               {
                  if (z->code_bits < 24)
                     rjpeg_grow_buffer_unsafe(z);
                  if (!RJPEG_RESTART(z->marker))
                     return 1;
                  rjpeg_jpeg_reset(z);
               }
            }
         }
      }
   }

   return 1;
}
1507
/* Scale all 64 coefficients of one block by its quantization table. */
static void rjpeg_jpeg_dequantize(short *data, uint8_t *dequant)
{
   int k;

   for (k = 63; k >= 0; --k)
      data[k] = (short)(data[k] * dequant[k]);
}
1514
/* Final pass for progressive images: all scans have accumulated the
 * coefficients, so dequantize each 8x8 block and IDCT it into the
 * component pixel plane. No-op for baseline images, which IDCT during
 * entropy decoding instead. */
static void rjpeg_jpeg_finish(rjpeg_jpeg *z)
{
   int i,j,n;

   if (!z->progressive)
      return;

   /* dequantize and IDCT the data */
   for (n = 0; n < z->s->img_n; ++n)
   {
      /* per-component block counts, rounded up to whole 8x8 blocks */
      int w = (z->img_comp[n].x+7) >> 3;
      int h = (z->img_comp[n].y+7) >> 3;
      for (j = 0; j < h; ++j)
      {
         for (i = 0; i < w; ++i)
         {
            short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
            rjpeg_jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
            z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8,
                  z->img_comp[n].w2, data);
         }
      }
   }
}
1539
/* Process a single marker segment other than SOF/SOS/EOI: DRI (restart
 * interval), DQT (quantization tables), DHT (huffman tables), plus
 * skippable APPn/COM blocks.
 * Returns 1 on success, 0 on corrupt or unsupported data. */
static int rjpeg_process_marker(rjpeg_jpeg *z, int m)
{
   int L;
   switch (m)
   {
      case RJPEG_MARKER_NONE: /* no marker found */
         /* Expected marker. Corrupt JPEG? */
         return 0;

      case 0xDD: /* DRI - specify restart interval */

         /* Bad DRI length. Corrupt JPEG? */
         if (RJPEG_GET16BE(z->s) != 4)
            return 0;

         z->restart_interval = RJPEG_GET16BE(z->s);
         return 1;

      case 0xDB: /* DQT - define quantization table */
         L = RJPEG_GET16BE(z->s)-2;
         while (L > 0)
         {
            int q = rjpeg_get8(z->s);
            int p = q >> 4;   /* table precision: only 8-bit (0) supported */
            int t = q & 15,i; /* table slot */

            /* Bad DQT type. Corrupt JPEG? */
            if (p != 0)
               return 0;

            /* Bad DQT table. Corrupt JPEG? */
            if (t > 3)
               return 0;

            /* tables are transmitted in zigzag order; undo it on load */
            for (i = 0; i < 64; ++i)
               z->dequant[t][rjpeg_jpeg_dezigzag[i]] = rjpeg_get8(z->s);
            L -= 65;
         }
         return L == 0;

      case 0xC4: /* DHT - define huffman table */
         L = RJPEG_GET16BE(z->s)-2;
         while (L > 0)
         {
            int sizes[16],i,n = 0;
            uint8_t *v = NULL;
            int q = rjpeg_get8(z->s);
            int tc = q >> 4; /* table class: 0 = DC, 1 = AC */
            int th = q & 15; /* table slot */

            /* Bad DHT header. Corrupt JPEG? */
            if (tc > 1 || th > 3)
               return 0;

            for (i = 0; i < 16; ++i)
            {
               sizes[i] = rjpeg_get8(z->s);
               n += sizes[i];
            }

            /* A huffman table defines at most 256 symbols; a corrupt
             * count would overflow the values array below. */
            if (n > 256)
               return 0;

            L -= 17;

            if (tc == 0)
            {
               if (!rjpeg_build_huffman(z->huff_dc+th, sizes))
                  return 0;
               v = z->huff_dc[th].values;
            }
            else
            {
               if (!rjpeg_build_huffman(z->huff_ac+th, sizes))
                  return 0;
               v = z->huff_ac[th].values;
            }
            for (i = 0; i < n; ++i)
               v[i] = rjpeg_get8(z->s);
            /* AC tables additionally get a fast-path lookup table */
            if (tc != 0)
               rjpeg_build_fast_ac(z->fast_ac[th], z->huff_ac + th);
            L -= n;
         }
         return L == 0;
   }

   /* check for comment block or APP blocks */
   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE)
   {
      int n = RJPEG_GET16BE(z->s)-2;

      /* clamp the skip so a truncated segment cannot advance the read
       * pointer past the end of the input buffer */
      if (n < 0 || n > (int)(z->s->img_buffer_end - z->s->img_buffer))
         z->s->img_buffer = z->s->img_buffer_end;
      else
         z->s->img_buffer += n;

      return 1;
   }
   return 0;
}
1636
1637 /* after we see SOS */
/* Process an SOS (start of scan) header: which components participate
 * in this scan, which huffman tables they use, and -- for progressive
 * images -- the spectral band and successive-approximation parameters.
 * Returns 1 on success, 0 on corrupt data. */
static int rjpeg_process_scan_header(rjpeg_jpeg *z)
{
   int i;
   int aa;
   int Ls = RJPEG_GET16BE(z->s);

   z->scan_n = rjpeg_get8(z->s);

   /* Bad SOS component count. Corrupt JPEG? */
   if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n)
      return 0;

   /* Bad SOS length. Corrupt JPEG? */
   if (Ls != 6+2*z->scan_n)
      return 0;

   for (i = 0; i < z->scan_n; ++i)
   {
      int which;
      int id = rjpeg_get8(z->s); /* component selector */
      int q = rjpeg_get8(z->s);  /* DC/AC huffman table selectors */

      /* map the component id back to an index from the frame header */
      for (which = 0; which < z->s->img_n; ++which)
         if (z->img_comp[which].id == id)
            break;
      if (which == z->s->img_n)
         return 0; /* no match */

      /* Bad DC huff. Corrupt JPEG? */
      z->img_comp[which].hd = q >> 4;   if (z->img_comp[which].hd > 3)
         return 0;

      /* Bad AC huff. Corrupt JPEG? */
      z->img_comp[which].ha = q & 15;   if (z->img_comp[which].ha > 3)
         return 0;

      /* remember the order components appear in this scan */
      z->order[i] = which;
   }

   z->spec_start = rjpeg_get8(z->s);
   z->spec_end   = rjpeg_get8(z->s); /* should be 63, but might be 0 */
   aa            = rjpeg_get8(z->s);
   z->succ_high  = (aa >> 4);
   z->succ_low   = (aa & 15);

   if (z->progressive)
   {
      /* Bad SOS. Corrupt JPEG? */
      if (  z->spec_start > 63 ||
            z->spec_end > 63   ||
            z->spec_start > z->spec_end ||
            z->succ_high > 13  ||
            z->succ_low > 13)
         return 0;
   }
   else
   {
      /* baseline scans always cover the full 0..63 band with no
       * successive approximation */
      /* Bad SOS. Corrupt JPEG? */
      if (z->spec_start != 0)
         return 0;
      if (z->succ_high != 0 || z->succ_low != 0)
         return 0;

      z->spec_end = 63;
   }

   return 1;
}
1706
/* Process an SOF (start of frame) header: image dimensions, component
 * sampling factors and quantization table ids. When scan ==
 * RJPEG_SCAN_LOAD, also computes the interleaved-MCU geometry and
 * allocates the per-component output planes (plus, for progressive
 * images, the coefficient planes).
 * Returns 1 on success, 0 on corrupt data or out-of-memory.
 *
 * Fixes vs. the previous version: the non-progressive out-of-memory
 * path was missing its "return 0" (it fell through and built an
 * aligned data pointer from a NULL allocation), the progressive
 * raw_coeff allocation was never checked, and freed pointers were left
 * dangling (risking a double free in the caller's cleanup). */
static int rjpeg_process_frame_header(rjpeg_jpeg *z, int scan)
{
   rjpeg_context *s = z->s;
   int Lf,p,i,q, h_max=1,v_max=1,c;
   Lf = RJPEG_GET16BE(s);

   /* Bad SOF len. Corrupt JPEG? */
   if (Lf < 11)
      return 0;

   p = rjpeg_get8(s);

   /* Only 8-bit sample precision (JPEG baseline) is supported. */
   if (p != 8)
      return 0;

   s->img_y = RJPEG_GET16BE(s);

   /* Height 0 (DNL-delayed) is legal, but we don't handle it --
    * but neither does IJG */
   if (s->img_y == 0)
      return 0;

   s->img_x = RJPEG_GET16BE(s);

   /* No header width. Corrupt JPEG? */
   if (s->img_x == 0)
      return 0;

   c = rjpeg_get8(s);

   /* JFIF requires 1 (grey) or 3 (YCbCr) components. */
   if (c != 3 && c != 1)
      return 0;

   s->img_n = c;

   for (i = 0; i < c; ++i)
   {
      z->img_comp[i].data    = NULL;
      z->img_comp[i].linebuf = NULL;
   }

   /* Bad SOF length. Corrupt JPEG? */
   if (Lf != 8+3*s->img_n)
      return 0;

   for (i = 0; i < s->img_n; ++i)
   {
      z->img_comp[i].id = rjpeg_get8(s);
      if (z->img_comp[i].id != i+1)  /* JFIF requires ids 1..n */
         if (z->img_comp[i].id != i) /* some versions of jpegtran output non-JFIF-compliant files! */
            return 0;

      q                = rjpeg_get8(s);
      z->img_comp[i].h = (q >> 4);

      /* Bad H (horizontal sampling factor). Corrupt JPEG? */
      if (!z->img_comp[i].h || z->img_comp[i].h > 4)
         return 0;

      z->img_comp[i].v = q & 15;

      /* Bad V (vertical sampling factor). Corrupt JPEG? */
      if (!z->img_comp[i].v || z->img_comp[i].v > 4)
         return 0;

      z->img_comp[i].tq = rjpeg_get8(s);

      /* Bad TQ (quantization table index). Corrupt JPEG? */
      if (z->img_comp[i].tq > 3)
         return 0;
   }

   if (scan != RJPEG_SCAN_LOAD)
      return 1;

   /* Image too large to decode? */
   if ((1 << 30) / s->img_x / s->img_n < s->img_y)
      return 0;

   for (i = 0; i < s->img_n; ++i)
   {
      if (z->img_comp[i].h > h_max)
         h_max = z->img_comp[i].h;
      if (z->img_comp[i].v > v_max)
         v_max = z->img_comp[i].v;
   }

   /* compute interleaved MCU info */
   z->img_h_max = h_max;
   z->img_v_max = v_max;
   z->img_mcu_w = h_max * 8;
   z->img_mcu_h = v_max * 8;
   z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
   z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;

   for (i = 0; i < s->img_n; ++i)
   {
      /* number of effective pixels (e.g. for non-interleaved MCU) */
      z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
      z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;

      /* to simplify generation, we'll allocate enough memory to decode
       * the bogus oversized data from using interleaved MCUs and their
       * big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
       * discard the extra data until colorspace conversion.
       * +15 leaves room for the 16-byte alignment below. */
      z->img_comp[i].w2        = z->img_mcu_x * z->img_comp[i].h * 8;
      z->img_comp[i].h2        = z->img_mcu_y * z->img_comp[i].v * 8;
      z->img_comp[i].linebuf   = NULL;
      z->img_comp[i].coeff     = 0;
      z->img_comp[i].raw_coeff = 0;
      z->img_comp[i].raw_data  = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);

      if (!z->img_comp[i].raw_data)
         goto error; /* out of memory */

      /* align blocks for IDCT using MMX/SSE */
      z->img_comp[i].data = (uint8_t*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);

      if (z->progressive)
      {
         /* progressive images buffer all coefficients before the final
          * IDCT, so each component also needs a coefficient plane */
         z->img_comp[i].coeff_w   = (z->img_comp[i].w2 + 7) >> 3;
         z->img_comp[i].coeff_h   = (z->img_comp[i].h2 + 7) >> 3;
         z->img_comp[i].raw_coeff = malloc(z->img_comp[i].coeff_w *
               z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);

         if (!z->img_comp[i].raw_coeff)
            goto error; /* out of memory */

         z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
      }
   }

   return 1;

error:
   /* Out of memory: release everything allocated so far, including the
    * partially-set-up component i, and clear the pointers so any later
    * cleanup pass does not free them again. */
   for (; i >= 0; --i)
   {
      free(z->img_comp[i].raw_coeff);
      free(z->img_comp[i].raw_data);
      z->img_comp[i].raw_coeff = NULL;
      z->img_comp[i].coeff     = NULL;
      z->img_comp[i].raw_data  = NULL;
      z->img_comp[i].data      = NULL;
   }
   return 0;
}
1885
/* Parse the JPEG stream from SOI up to (and including) the SOF frame
 * header, processing or skipping all intervening marker segments.
 * scan == RJPEG_SCAN_TYPE merely verifies the SOI signature.
 * Returns 1 on success, 0 on corrupt/unsupported data. */
static int rjpeg_decode_jpeg_header(rjpeg_jpeg *z, int scan)
{
   int m;
   z->marker = RJPEG_MARKER_NONE; /* initialize cached marker to empty */
   m = rjpeg_get_marker(z);

   /* No SOI. Corrupt JPEG? */
   if (m != JPEG_MARKER_SOI)
      return 0;

   if (scan == RJPEG_SCAN_TYPE)
      return 1;

   m = rjpeg_get_marker(z);
   while (!RJPEG_SOF(m))
   {
      if (!rjpeg_process_marker(z,m))
         return 0;
      m = rjpeg_get_marker(z);
      while (m == RJPEG_MARKER_NONE)
      {
         /* some files have extra padding after their blocks, so ok, we'll scan */

         /* No SOF. Corrupt JPEG? */
         if (RJPEG_AT_EOF(z->s))
            return 0;

         m = rjpeg_get_marker(z);
      }
   }
   /* whether the frame is progressive decides the whole decode strategy */
   z->progressive = RJPEG_SOF_PROGRESSIVE(m);
   if (!rjpeg_process_frame_header(z, scan))
      return 0;
   return 1;
}
1921
1922 /* decode image to YCbCr format */
/* Top-level decode to YCbCr planes: parse the header, then loop over
 * marker segments, decoding every SOS scan, until EOI. Progressive
 * images get their buffered coefficients IDCT'd at the end.
 * Returns 1 on success, 0 on corrupt data. */
static int rjpeg_decode_jpeg_image(rjpeg_jpeg *j)
{
   int m;
   /* clear allocation state first so cleanup is safe on early failure */
   for (m = 0; m < 4; m++)
   {
      j->img_comp[m].raw_data = NULL;
      j->img_comp[m].raw_coeff = NULL;
   }
   j->restart_interval = 0;
   if (!rjpeg_decode_jpeg_header(j, RJPEG_SCAN_LOAD))
      return 0;
   m = rjpeg_get_marker(j);

   while (m != JPEG_MARKER_EOI)
   {
      if (m == JPEG_MARKER_SOS)
      {
         if (!rjpeg_process_scan_header(j))
            return 0;
         if (!rjpeg_parse_entropy_coded_data(j))
            return 0;

         if (j->marker == RJPEG_MARKER_NONE )
         {
            /* handle 0s at the end of image data from IP Kamera 9060 */

            while (!RJPEG_AT_EOF(j->s))
            {
               int x = rjpeg_get8(j->s);
               if (x == 255)
               {
                  /* found the next marker; push it back for the loop */
                  j->marker = rjpeg_get8(j->s);
                  break;
               }
               else if (x != 0) /* Junk before marker. Corrupt JPEG? */
                  return 0;
            }

            /* if we reach eof without hitting a marker,
             * rjpeg_get_marker() below will fail and we'll eventually return 0 */
         }
      }
      else
      {
         if (!rjpeg_process_marker(j, m))
            return 0;
      }
      m = rjpeg_get_marker(j);
   }

   /* progressive images still need dequantize + IDCT of the coefficients */
   if (j->progressive)
      rjpeg_jpeg_finish(j);
   return 1;
}
1977
1978 /* static jfif-centered resampling (across block boundaries) */
1979
/* 1:1 "resample": the input row is already at output resolution, so no
 * work is done -- the near row itself is returned, untouched. */
static uint8_t *rjpeg_resample_row_1(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   /* all other parameters are unused for the identity case */
   (void)out;
   (void)in_far;
   (void)w;
   (void)hs;

   return in_near;
}
1989
/* Vertical 2x upsample: every output sample is a 3:1 weighted blend of
 * the nearer and farther source rows, (3*near + far + 2) / 4. */
static uint8_t* rjpeg_resample_row_v_2(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   int k;

   (void)hs;

   for (k = 0; k < w; k++)
   {
      int blend = 3*in_near[k] + in_far[k] + 2;
      out[k]    = RJPEG_DIV4(blend);
   }
   return out;
}
2000
/* Horizontal 2x upsample with JFIF-centered filtering: each input pixel
 * yields two outputs, blended 3:1 with its left/right neighbour. Edge
 * pixels are replicated. */
static uint8_t* rjpeg_resample_row_h_2(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   int col;
   uint8_t *src = in_near;

   (void)in_far;
   (void)hs;

   /* a single sample cannot be interpolated -- just replicate it */
   if (w == 1)
   {
      out[0] = out[1] = src[0];
      return out;
   }

   /* left edge: replicate, then blend toward the right neighbour */
   out[0] = src[0];
   out[1] = RJPEG_DIV4(src[0]*3 + src[1] + 2);

   /* interior: each source pixel produces one output leaning left and
    * one leaning right */
   for (col = 1; col < w-1; ++col)
   {
      int center     = 3 * src[col] + 2;
      out[col*2 + 0] = RJPEG_DIV4(center + src[col-1]);
      out[col*2 + 1] = RJPEG_DIV4(center + src[col+1]);
   }

   /* right edge (col == w-1 here): blend toward the left neighbour,
    * then replicate */
   out[col*2 + 0] = RJPEG_DIV4(src[w-2]*3 + src[w-1] + 2);
   out[col*2 + 1] = src[w-1];

   return out;
}
2032
/* Combined 2x2 upsample: vertically blend the two source rows
 * (3*near + far), then horizontally interpolate adjacent blended
 * values with 3:1 weights. */
static uint8_t *rjpeg_resample_row_hv_2(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   int i;
   int prev, cur;

   (void)hs;

   if (w == 1)
   {
      /* single sample: vertical blend only, duplicated horizontally */
      out[0] = out[1] = RJPEG_DIV4(3*in_near[0] + in_far[0] + 2);
      return out;
   }

   cur    = 3*in_near[0] + in_far[0];
   out[0] = RJPEG_DIV4(cur + 2);

   for (i = 1; i < w; ++i)
   {
      prev       = cur;
      cur        = 3*in_near[i] + in_far[i];
      out[i*2-1] = RJPEG_DIV16(3*prev + cur + 8);
      out[i*2  ] = RJPEG_DIV16(3*cur + prev + 8);
   }
   out[w*2-1] = RJPEG_DIV4(cur + 2);

   return out;
}
2060
2061 #if defined(__SSE2__) || defined(RJPEG_NEON)
/* SIMD (SSE2 / NEON) variant of rjpeg_resample_row_hv_2: the same 2x2
 * JFIF-centered upsample, processing 8 input pixels per iteration and
 * finishing the row (including the last pixel) with scalar code. */
static uint8_t *rjpeg_resample_row_hv_2_simd(uint8_t *out, uint8_t *in_near,
      uint8_t *in_far, int w, int hs)
{
   /* need to generate 2x2 samples for every one in input */
   int i = 0,t0,t1;

   if (w == 1)
   {
      out[0] = out[1] = RJPEG_DIV4(3*in_near[0] + in_far[0] + 2);
      return out;
   }

   t1 = 3*in_near[0] + in_far[0];
   /* process groups of 8 pixels for as long as we can.
    * note we can't handle the last pixel in a row in this loop
    * because we need to handle the filter boundary conditions.
    */
   for (; i < ((w-1) & ~7); i += 8)
   {
#if defined(__SSE2__)
      /* load and perform the vertical filtering pass
       * this uses 3*x + y = 4*x + (y - x) */
      __m128i zero  = _mm_setzero_si128();
      __m128i farb  = _mm_loadl_epi64((__m128i *) (in_far + i));
      __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
      __m128i farw  = _mm_unpacklo_epi8(farb, zero);
      __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
      __m128i diff  = _mm_sub_epi16(farw, nearw);
      __m128i nears = _mm_slli_epi16(nearw, 2);
      __m128i curr  = _mm_add_epi16(nears, diff); /* current row */

      /* horizontal filter works the same based on shifted vers of current
       * row. "prev" is current row shifted right by 1 pixel; we need to
       * insert the previous pixel value (from t1).
       * "next" is current row shifted left by 1 pixel, with first pixel
       * of next block of 8 pixels added in.
       */
      __m128i prv0 = _mm_slli_si128(curr, 2);
      __m128i nxt0 = _mm_srli_si128(curr, 2);
      __m128i prev = _mm_insert_epi16(prv0, t1, 0);
      __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);

      /* horizontal filter, polyphase implementation since it's convenient:
       * even pixels = 3*cur + prev = cur*4 + (prev - cur)
       * odd  pixels = 3*cur + next = cur*4 + (next - cur)
       * note the shared term. */
      __m128i bias  = _mm_set1_epi16(8);
      __m128i curs  = _mm_slli_epi16(curr, 2);
      __m128i prvd  = _mm_sub_epi16(prev, curr);
      __m128i nxtd  = _mm_sub_epi16(next, curr);
      __m128i curb  = _mm_add_epi16(curs, bias);
      __m128i even  = _mm_add_epi16(prvd, curb);
      __m128i odd   = _mm_add_epi16(nxtd, curb);

      /* interleave even and odd pixels, then undo scaling. */
      __m128i int0  = _mm_unpacklo_epi16(even, odd);
      __m128i int1  = _mm_unpackhi_epi16(even, odd);
      __m128i de0   = _mm_srli_epi16(int0, 4);
      __m128i de1   = _mm_srli_epi16(int1, 4);

      /* pack and write output */
      __m128i outv  = _mm_packus_epi16(de0, de1);
      _mm_storeu_si128((__m128i *) (out + i*2), outv);
#elif defined(RJPEG_NEON)
      /* load and perform the vertical filtering pass
       * this uses 3*x + y = 4*x + (y - x) */
      uint8x8_t farb  = vld1_u8(in_far + i);
      uint8x8_t nearb = vld1_u8(in_near + i);
      int16x8_t diff  = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
      int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
      int16x8_t curr  = vaddq_s16(nears, diff); /* current row */

      /* horizontal filter works the same based on shifted vers of current
       * row. "prev" is current row shifted right by 1 pixel; we need to
       * insert the previous pixel value (from t1).
       * "next" is current row shifted left by 1 pixel, with first pixel
       * of next block of 8 pixels added in. */
      int16x8_t prv0 = vextq_s16(curr, curr, 7);
      int16x8_t nxt0 = vextq_s16(curr, curr, 1);
      int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
      int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);

      /* horizontal filter, polyphase implementation since it's convenient:
       * even pixels = 3*cur + prev = cur*4 + (prev - cur)
       * odd  pixels = 3*cur + next = cur*4 + (next - cur)
       * note the shared term.
       */
      int16x8_t curs = vshlq_n_s16(curr, 2);
      int16x8_t prvd = vsubq_s16(prev, curr);
      int16x8_t nxtd = vsubq_s16(next, curr);
      int16x8_t even = vaddq_s16(curs, prvd);
      int16x8_t odd  = vaddq_s16(curs, nxtd);

      /* undo scaling and round, then store with even/odd phases interleaved */
      uint8x8x2_t o;
      o.val[0] = vqrshrun_n_s16(even, 4);
      o.val[1] = vqrshrun_n_s16(odd,  4);
      vst2_u8(out + i*2, o);
#endif

      /* "previous" value for next iteration */
      t1 = 3*in_near[i+7] + in_far[i+7];
   }

   /* scalar tail: finish the remaining pixels exactly like the
    * scalar implementation does */
   t0 = t1;
   t1 = 3*in_near[i] + in_far[i];
   out[i*2] = RJPEG_DIV16(3*t1 + t0 + 8);

   for (++i; i < w; ++i)
   {
      t0 = t1;
      t1 = 3*in_near[i]+in_far[i];
      out[i*2-1] = RJPEG_DIV16(3*t0 + t1 + 8);
      out[i*2  ] = RJPEG_DIV16(3*t1 + t0 + 8);
   }
   out[w*2-1] = RJPEG_DIV4(t1+2);

   (void)hs;

   return out;
}
2183 #endif
2184
/* Fallback resampler: nearest-neighbor horizontal upsampling.
 * Each low-res input pixel is replicated hs times into out;
 * the second source row (in_far) is unused in this variant. */
static uint8_t *rjpeg_resample_row_generic(uint8_t *out,
      uint8_t *in_near, uint8_t *in_far, int w, int hs)
{
   int i;
   (void)in_far;

   for (i = 0; i < w; ++i)
      memset(out + i * hs, in_near[i], (size_t)hs);
   return out;
}
2197
/* this is a reduced-precision calculation of YCbCr-to-RGB introduced
 * to make sure the code produces the same results in both SIMD and scalar */
#ifndef FLOAT2FIXED
#define FLOAT2FIXED(x) (((int) ((x) * 4096.0f + 0.5f)) << 8)
#endif

/* Scalar YCbCr -> RGB(A) conversion for one row of 'count' pixels.
 * Writes 'step' bytes per pixel into out (out[3] is always written,
 * so the destination must have alpha slack even when step == 3). */
static void rjpeg_YCbCr_to_RGB_row(uint8_t *out, const uint8_t *y,
      const uint8_t *pcb, const uint8_t *pcr, int count, int step)
{
   int i;
   for (i = 0; i < count; ++i)
   {
      int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
      int cr = pcr[i] - 128;
      int cb = pcb[i] - 128;
      int r = y_fixed + cr* FLOAT2FIXED(1.40200f);
      int g = y_fixed + (cr*-FLOAT2FIXED(0.71414f)) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
      int b = y_fixed + cb* FLOAT2FIXED(1.77200f);
      r >>= 20;
      g >>= 20;
      b >>= 20;
      /* Clamp to [0,255].  The unsigned compare also catches negative
       * values, which must saturate to 0 (not 255) to match both the
       * saturating packs used by the SIMD kernels and upstream stb_image. */
      if ((unsigned) r > 255)
         r = (r < 0) ? 0 : 255;
      if ((unsigned) g > 255)
         g = (g < 0) ? 0 : 255;
      if ((unsigned) b > 255)
         b = (b < 0) ? 0 : 255;
      out[0] = (uint8_t)r;
      out[1] = (uint8_t)g;
      out[2] = (uint8_t)b;
      out[3] = 255;
      out += step;
   }
}
2232
#if defined(__SSE2__) || defined(RJPEG_NEON)
/* Local fallback so this section stands alone if the file is ever
 * reorganized; normally FLOAT2FIXED is already defined earlier. */
#ifndef FLOAT2FIXED
#define FLOAT2FIXED(x) (((int) ((x) * 4096.0f + 0.5f)) << 8)
#endif

/* SIMD YCbCr -> RGBA row conversion.  Only step == 4 is vectorized
 * (8 pixels per iteration); leftover pixels and other step values fall
 * through to the scalar loop at the bottom, which uses the same
 * reduced-precision arithmetic as the scalar kernel. */
static void rjpeg_YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y,
      const uint8_t *pcb, const uint8_t *pcr, int count, int step)
{
   int i = 0;

#if defined(__SSE2__)
   /* step == 3 is pretty ugly on the final interleave, and i'm not convinced
    * it's useful in practice (you wouldn't use it for textures, for example).
    * so just accelerate step == 4 case.
    */
   if (step == 4)
   {
      /* this is a fairly straightforward implementation and not super-optimized. */
      __m128i signflip  = _mm_set1_epi8(-0x80);
      __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
      __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
      __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
      __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
      __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
      __m128i xw = _mm_set1_epi16(255); /* alpha channel */

      for (; i+7 < count; i += 8)
      {
         /* load */
         __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
         __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
         __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); /* -128 */
         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); /* -128 */

         /* unpack to short (and left-shift cr, cb by 8) */
         __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);

         /* color transform */
         __m128i yws = _mm_srli_epi16(yw, 4);
         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
         __m128i rws = _mm_add_epi16(cr0, yws);
         __m128i gwt = _mm_add_epi16(cb0, yws);
         __m128i bws = _mm_add_epi16(yws, cb1);
         __m128i gws = _mm_add_epi16(gwt, cr1);

         /* descale */
         __m128i rw = _mm_srai_epi16(rws, 4);
         __m128i bw = _mm_srai_epi16(bws, 4);
         __m128i gw = _mm_srai_epi16(gws, 4);

         /* back to byte, set up for transpose */
         __m128i brb = _mm_packus_epi16(rw, bw);
         __m128i gxb = _mm_packus_epi16(gw, xw);

         /* transpose to interleave channels */
         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
         __m128i o1 = _mm_unpackhi_epi16(t0, t1);

         /* store */
         _mm_storeu_si128((__m128i *) (out + 0), o0);
         _mm_storeu_si128((__m128i *) (out + 16), o1);
         out += 32;
      }
   }
#endif

#ifdef RJPEG_NEON
   /* in this version, step=3 support would be easy to add. but is there demand? */
   if (step == 4)
   {
      /* this is a fairly straightforward implementation and not super-optimized. */
      uint8x8_t signflip = vdup_n_u8(0x80);
      int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
      int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
      int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
      int16x8_t cb_const1 = vdupq_n_s16(   (short) ( 1.77200f*4096.0f+0.5f));

      for (; i+7 < count; i += 8)
      {
         uint8x8x4_t o;

         /* load */
         uint8x8_t y_bytes  = vld1_u8(y + i);
         uint8x8_t cr_bytes = vld1_u8(pcr + i);
         uint8x8_t cb_bytes = vld1_u8(pcb + i);
         int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
         int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));

         /* expand to s16 */
         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
         int16x8_t crw = vshll_n_s8(cr_biased, 7);
         int16x8_t cbw = vshll_n_s8(cb_biased, 7);

         /* color transform */
         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
         int16x8_t rws = vaddq_s16(yws, cr0);
         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
         int16x8_t bws = vaddq_s16(yws, cb1);

         /* undo scaling, round, convert to byte */
         o.val[0] = vqrshrun_n_s16(rws, 4);
         o.val[1] = vqrshrun_n_s16(gws, 4);
         o.val[2] = vqrshrun_n_s16(bws, 4);
         o.val[3] = vdup_n_u8(255);

         /* store, interleaving r/g/b/a */
         vst4_u8(out, o);
         out += 8*4;
      }
   }
#endif

   /* scalar tail: leftover pixels, or all pixels when step != 4 */
   for (; i < count; ++i)
   {
      int y_fixed = (y[i] << 20) + (1<<19); /* rounding */
      int cr = pcr[i] - 128;
      int cb = pcb[i] - 128;
      int r = y_fixed + cr* FLOAT2FIXED(1.40200f);
      int g = y_fixed + cr*-FLOAT2FIXED(0.71414f) + ((cb*-FLOAT2FIXED(0.34414f)) & 0xffff0000);
      int b = y_fixed + cb* FLOAT2FIXED(1.77200f);
      r >>= 20;
      g >>= 20;
      b >>= 20;
      /* Clamp to [0,255].  Negative values must saturate to 0 (not 255)
       * so the tail agrees with the saturating packs in the SIMD paths
       * above (and with upstream stb_image). */
      if ((unsigned) r > 255)
         r = (r < 0) ? 0 : 255;
      if ((unsigned) g > 255)
         g = (g < 0) ? 0 : 255;
      if ((unsigned) b > 255)
         b = (b < 0) ? 0 : 255;
      out[0] = (uint8_t)r;
      out[1] = (uint8_t)g;
      out[2] = (uint8_t)b;
      out[3] = 255;
      out += step;
   }
}
#endif
2377
2378 /* set up the kernels */
/* Select scalar or SIMD implementations of the decode kernels
 * for this decoder instance. */
static void rjpeg_setup_jpeg(rjpeg_jpeg *j)
{
   uint64_t cpu = cpu_features_get();

   (void)cpu; /* unused when no SIMD path is compiled in */

   /* scalar defaults; overridden below when SIMD is available */
   j->idct_block_kernel        = rjpeg_idct_block;
   j->YCbCr_to_RGB_kernel      = rjpeg_YCbCr_to_RGB_row;
   j->resample_row_hv_2_kernel = rjpeg_resample_row_hv_2;

#if defined(__SSE2__)
   if (cpu & RETRO_SIMD_SSE2)
   {
      j->idct_block_kernel        = rjpeg_idct_simd;
      j->YCbCr_to_RGB_kernel      = rjpeg_YCbCr_to_RGB_simd;
      j->resample_row_hv_2_kernel = rjpeg_resample_row_hv_2_simd;
   }
#endif

#ifdef RJPEG_NEON
   /* NEON builds always have NEON; no runtime feature check needed */
   j->idct_block_kernel        = rjpeg_idct_simd;
   j->YCbCr_to_RGB_kernel      = rjpeg_YCbCr_to_RGB_simd;
   j->resample_row_hv_2_kernel = rjpeg_resample_row_hv_2_simd;
#endif
}
2404
2405 /* clean up the temporary component buffers */
/* Release the per-component scratch buffers allocated during decode.
 * free(NULL) is a no-op, so the former NULL guards were redundant;
 * all pointers are reset to NULL so a second cleanup is harmless. */
static void rjpeg_cleanup_jpeg(rjpeg_jpeg *j)
{
   int i;
   for (i = 0; i < j->s->img_n; ++i)
   {
      /* 'data' points into 'raw_data', so only raw_data is freed */
      free(j->img_comp[i].raw_data);
      j->img_comp[i].raw_data  = NULL;
      j->img_comp[i].data      = NULL;

      /* likewise, 'coeff' points into 'raw_coeff' */
      free(j->img_comp[i].raw_coeff);
      j->img_comp[i].raw_coeff = NULL;
      j->img_comp[i].coeff     = NULL;

      free(j->img_comp[i].linebuf);
      j->img_comp[i].linebuf   = NULL;
   }
}
2432
/* Decode the buffered JPEG stream into an interleaved 8-bit pixel buffer.
 *
 * out_x/out_y receive the image dimensions; if comp is non-NULL it
 * receives the ORIGINAL component count of the file (not the number
 * emitted).  req_comp selects the output layout (0 = same as source).
 *
 * Returns a malloc()ed buffer of n * img_x * img_y (+1 slack) bytes
 * which the caller must free(), or NULL on failure.  Component scratch
 * buffers are cleaned up on both success and failure paths. */
static uint8_t *rjpeg_load_jpeg_image(rjpeg_jpeg *z,
      unsigned *out_x, unsigned *out_y, int *comp, int req_comp)
{
   int n, decode_n;
   int k;
   unsigned int i,j;
   rjpeg_resample res_comp[4];
   uint8_t *coutput[4] = {0};
   uint8_t *output = NULL;
   z->s->img_n = 0;

   /* load a jpeg image from whichever source, but leave in YCbCr format */
   if (!rjpeg_decode_jpeg_image(z))
      goto error;

   /* determine actual number of components to generate */
   n = req_comp ? req_comp : z->s->img_n;

   /* grey output from a color image only needs the Y plane decoded */
   if (z->s->img_n == 3 && n < 3)
      decode_n = 1;
   else
      decode_n = z->s->img_n;

   /* resample and color-convert */
   for (k = 0; k < decode_n; ++k)
   {
      rjpeg_resample *r = &res_comp[k];

      /* allocate line buffer big enough for upsampling off the edges
       * with upsample factor of 4 */
      z->img_comp[k].linebuf = (uint8_t *) malloc(z->s->img_x + 3);
      if (!z->img_comp[k].linebuf)
         goto error;

      r->hs       = z->img_h_max / z->img_comp[k].h;
      r->vs       = z->img_v_max / z->img_comp[k].v;
      r->ystep    = r->vs >> 1;
      r->w_lores  = (z->s->img_x + r->hs-1) / r->hs;
      r->ypos     = 0;
      r->line0    = r->line1 = z->img_comp[k].data;
      r->resample = rjpeg_resample_row_generic;

      /* pick the fast path matching this component's sampling factors */
      if (r->hs == 1 && r->vs == 1)
         r->resample = rjpeg_resample_row_1;
      else if (r->hs == 1 && r->vs == 2)
         r->resample = rjpeg_resample_row_v_2;
      else if (r->hs == 2 && r->vs == 1)
         r->resample = rjpeg_resample_row_h_2;
      else if (r->hs == 2 && r->vs == 2)
         r->resample = z->resample_row_hv_2_kernel;
   }

   /* can't error after this so, this is safe */
   output = (uint8_t *) malloc(n * z->s->img_x * z->s->img_y + 1);

   if (!output)
      goto error;

   /* now go ahead and resample */
   for (j = 0; j < z->s->img_y; ++j)
   {
      uint8_t *out = output + n * z->s->img_x * j;
      for (k = 0; k < decode_n; ++k)
      {
         rjpeg_resample *r = &res_comp[k];
         int y_bot         = r->ystep >= (r->vs >> 1);

         coutput[k]        = r->resample(z->img_comp[k].linebuf,
               y_bot ? r->line1 : r->line0,
               y_bot ? r->line0 : r->line1,
               r->w_lores, r->hs);

         if (++r->ystep >= r->vs)
         {
            r->ystep = 0;
            r->line0 = r->line1;
            if (++r->ypos < z->img_comp[k].y)
               r->line1 += z->img_comp[k].w2;
         }
      }

      if (n >= 3)
      {
         uint8_t *y = coutput[0];
         if (y)
         {
            if (z->s->img_n == 3)
               z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
            else
               for (i = 0; i < z->s->img_x; ++i)
               {
                  out[0] = out[1] = out[2] = y[i];
                  out[3] = 255; /* not used if n==3 */
                  out += n;
               }
         }
      }
      else
      {
         uint8_t *y = coutput[0];
         /* guard against a missing luma plane, mirroring the n >= 3 path */
         if (y)
         {
            if (n == 1)
               for (i = 0; i < z->s->img_x; ++i)
                  out[i] = y[i];
            else
               for (i = 0; i < z->s->img_x; ++i)
               {
                  *out++ = y[i];
                  *out++ = 255;
               }
         }
      }
   }

   rjpeg_cleanup_jpeg(z);
   *out_x = z->s->img_x;
   *out_y = z->s->img_y;

   if (comp)
      *comp = z->s->img_n; /* report original components, not output */
   return output;

error:
   rjpeg_cleanup_jpeg(z);
   return NULL;
}
2557
/* Decode the JPEG previously attached with rjpeg_set_buf_ptr() and
 * return a newly malloc()ed ARGB8888 pixel buffer through *buf_data.
 * width/height receive the decoded dimensions.  Returns
 * IMAGE_PROCESS_END on success, IMAGE_PROCESS_ERROR on failure. */
int rjpeg_process_image(rjpeg_t *rjpeg, void **buf_data,
      size_t size, unsigned *width, unsigned *height)
{
   rjpeg_jpeg j;
   rjpeg_context s;
   int comp;
   uint32_t *decoded  = NULL;
   uint32_t *argb     = NULL;
   unsigned idx       = 0;

   if (!rjpeg)
      return IMAGE_PROCESS_ERROR;

   s.img_buffer          = (uint8_t*)rjpeg->buff_data;
   s.img_buffer_original = (uint8_t*)rjpeg->buff_data;
   s.img_buffer_end      = (uint8_t*)rjpeg->buff_data + (int)size;

   j.s = &s;

   rjpeg_setup_jpeg(&j);

   decoded = (uint32_t*)rjpeg_load_jpeg_image(&j, width, height, &comp, 4);

   if (!decoded)
      return IMAGE_PROCESS_ERROR;

   idx  = (*width) * (*height);
   argb = (uint32_t*)malloc(idx * sizeof(uint32_t));

   if (!argb)
   {
      free(decoded);
      return IMAGE_PROCESS_ERROR;
   }

   *buf_data = argb;

   /* Convert RGBA to ARGB: alpha and green keep their positions,
    * red and blue swap. */
   while (idx--)
   {
      uint32_t rgba = decoded[idx];
      argb[idx]     = (rgba & 0xFF00FF00)          /* A and G */
                    | ((rgba & 0x000000FF) << 16)  /* R */
                    | ((rgba & 0x00FF0000) >> 16); /* B */
   }

   free(decoded);

   return IMAGE_PROCESS_END;
}
2610
/* Attach the compressed JPEG data to the decoder instance.  The buffer
 * is borrowed, not copied, so it must remain valid until decoding is
 * done.  Returns false only when rjpeg is NULL. */
bool rjpeg_set_buf_ptr(rjpeg_t *rjpeg, void *data)
{
   if (rjpeg)
   {
      rjpeg->buff_data = (uint8_t*)data;
      return true;
   }
   return false;
}
2620
/* Release an instance created by rjpeg_alloc().  Accepts NULL;
 * free(NULL) is a no-op, so no explicit guard is needed. */
void rjpeg_free(rjpeg_t *rjpeg)
{
   free(rjpeg);
}
2628
rjpeg_alloc(void)2629 rjpeg_t *rjpeg_alloc(void)
2630 {
2631 rjpeg_t *rjpeg = (rjpeg_t*)calloc(1, sizeof(*rjpeg));
2632 if (!rjpeg)
2633 return NULL;
2634 return rjpeg;
2635 }
2636