1 /*
2  * jcsample-neon.c - downsampling (Arm Neon)
3  *
4  * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
5  *
6  * This software is provided 'as-is', without any express or implied
7  * warranty.  In no event will the authors be held liable for any damages
8  * arising from the use of this software.
9  *
10  * Permission is granted to anyone to use this software for any purpose,
11  * including commercial applications, and to alter it and redistribute it
12  * freely, subject to the following restrictions:
13  *
14  * 1. The origin of this software must not be misrepresented; you must not
15  *    claim that you wrote the original software. If you use this software
16  *    in a product, an acknowledgment in the product documentation would be
17  *    appreciated but is not required.
18  * 2. Altered source versions must be plainly marked as such, and must not be
19  *    misrepresented as being the original software.
20  * 3. This notice may not be removed or altered from any source distribution.
21  */
22 
23 #define JPEG_INTERNALS
24 #include "../../jinclude.h"
25 #include "../../jpeglib.h"
26 #include "../../jsimd.h"
27 #include "../../jdct.h"
28 #include "../../jsimddct.h"
29 #include "../jsimd.h"
30 #include "align.h"
31 
32 #include <arm_neon.h>
33 
34 
/* Table of 16 permutation vectors (one per possible pad count) used to fill
 * the tail of the last, partially-populated DCT block in a row.  Row "Pad i"
 * is the byte-index vector passed to vqtbl1q_u8()/vtbl2_u8() when the final
 * 16-pixel load extends i columns past image_width: it keeps the 16 - i valid
 * samples in place and replicates the last valid sample's index into the
 * trailing i lanes, so the right edge is averaged with a copy of itself.
 */
ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 0 */
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 1 */
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 2 */
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 3 */
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 4 */
  0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 5 */
  0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 6 */
  0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 7 */
  0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 8 */
  0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06,   /* Pad 9 */
  0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05,   /* Pad 10 */
  0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04,   /* Pad 11 */
  0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
  0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03,   /* Pad 12 */
  0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
  0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,   /* Pad 13 */
  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
  0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,   /* Pad 14 */
  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,   /* Pad 15 */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
69 
70 
71 /* Downsample pixel values of a single component.
72  * This version handles the common case of 2:1 horizontal and 1:1 vertical,
73  * without smoothing.
74  */
75 
void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
                                JDIMENSION v_samp_factor,
                                JDIMENSION width_in_blocks,
                                JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  /* Select the expansion-mask row matching the number of padding columns in
   * the final 16-pixel load of each row.
   */
  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
  const uint8x16_t expand_mask =
    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
  /* Rounding bias alternates between 0 and 1 across output columns:
   * { 0, 1, 0, 1, 0, 1, 0, 1 }
   */
  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
  unsigned outrow;

  for (outrow = 0; outrow < v_samp_factor; outrow++) {
    JSAMPROW inptr = input_data[outrow];
    JSAMPROW outptr = output_data[outrow];
    unsigned blk;

    /* Every DCT block except the last is fully populated, so no padding is
     * required.
     */
    for (blk = 0; blk < width_in_blocks - 1; blk++) {
      uint8x16_t pixels = vld1q_u8(inptr);
      /* Pairwise-add adjacent pixels, widening to 16-bit, on top of the
       * bias.
       */
      uint16x8_t sum = vpadalq_u8(bias, pixels);
      /* Divide the total by 2, narrow to 8-bit, and store. */
      vst1_u8(outptr, vshrn_n_u16(sum, 1));
      inptr += 2 * DCTSIZE;
      outptr += DCTSIZE;
    }

    /* Final block: replicate the last valid pixel into any padding lanes
     * before averaging.
     */
    uint8x16_t pixels = vld1q_u8(inptr);
#if defined(__aarch64__) || defined(_M_ARM64)
    pixels = vqtbl1q_u8(pixels, expand_mask);
#else
    uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
    pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
                         vtbl2_u8(table, vget_high_u8(expand_mask)));
#endif
    uint16x8_t sum = vpadalq_u8(bias, pixels);
    vst1_u8(outptr, vshrn_n_u16(sum, 1));
  }
}
123 
124 
125 /* Downsample pixel values of a single component.
126  * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
127  * without smoothing.
128  */
129 
void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
                                JDIMENSION v_samp_factor,
                                JDIMENSION width_in_blocks,
                                JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  /* Select the expansion-mask row matching the number of padding columns in
   * the final 16-pixel load of each row.
   */
  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
  const uint8x16_t expand_mask =
    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
  /* Rounding bias alternates between 1 and 2 across output columns:
   * { 1, 2, 1, 2, 1, 2, 1, 2 }
   */
  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
  unsigned outrow;

  for (outrow = 0; outrow < v_samp_factor; outrow++) {
    JSAMPROW inptr0 = input_data[outrow];
    JSAMPROW inptr1 = input_data[outrow + 1];
    JSAMPROW outptr = output_data[outrow];
    unsigned blk;

    /* Every DCT block except the last is fully populated, so no padding is
     * required.
     */
    for (blk = 0; blk < width_in_blocks - 1; blk++) {
      uint8x16_t pixels_r0 = vld1q_u8(inptr0);
      uint8x16_t pixels_r1 = vld1q_u8(inptr1);
      /* Pairwise-add adjacent pixels from both rows, widening to 16-bit,
       * accumulating on top of the bias.
       */
      uint16x8_t sum = vpadalq_u8(bias, pixels_r0);
      sum = vpadalq_u8(sum, pixels_r1);
      /* Divide the 2x2 total by 4, narrow to 8-bit, and store. */
      vst1_u8(outptr, vshrn_n_u16(sum, 2));
      inptr0 += 2 * DCTSIZE;
      inptr1 += 2 * DCTSIZE;
      outptr += DCTSIZE;
    }

    /* Final block: replicate the last valid pixel of each row into any
     * padding lanes before averaging.
     */
    uint8x16_t pixels_r0 = vld1q_u8(inptr0);
    uint8x16_t pixels_r1 = vld1q_u8(inptr1);
#if defined(__aarch64__) || defined(_M_ARM64)
    pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
    pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
#else
    uint8x8x2_t table_r0 =
      { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
    uint8x8x2_t table_r1 =
      { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
    pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
                            vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
    pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
                            vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
#endif
    uint16x8_t sum = vpadalq_u8(bias, pixels_r0);
    sum = vpadalq_u8(sum, pixels_r1);
    vst1_u8(outptr, vshrn_n_u16(sum, 2));
  }
}
193