1 /*
2 * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
3 *
4 * Copyright (C) 2020, Arm Limited. All Rights Reserved.
5 * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
6 *
7 * This software is provided 'as-is', without any express or implied
8 * warranty. In no event will the authors be held liable for any damages
9 * arising from the use of this software.
10 *
11 * Permission is granted to anyone to use this software for any purpose,
12 * including commercial applications, and to alter it and redistribute it
13 * freely, subject to the following restrictions:
14 *
15 * 1. The origin of this software must not be misrepresented; you must not
16 * claim that you wrote the original software. If you use this software
17 * in a product, an acknowledgment in the product documentation would be
18 * appreciated but is not required.
19 * 2. Altered source versions must be plainly marked as such, and must not be
20 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
22 */
23
24 /* This file is included by jdmerge-neon.c. */
25
26
27 /* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
28 * chroma upsampling and YCbCr -> RGB color conversion into a single function.
29 *
30 * As with the standalone functions, YCbCr -> RGB conversion is defined by the
31 * following equations:
32 * R = Y + 1.40200 * (Cr - 128)
33 * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
34 * B = Y + 1.77200 * (Cb - 128)
35 *
36 * Scaled integer constants are used to avoid floating-point arithmetic:
37 * 0.3441467 = 11277 * 2^-15
38 * 0.7141418 = 23401 * 2^-15
39 * 1.4020386 = 22971 * 2^-14
40 * 1.7720337 = 29033 * 2^-14
41 * These constants are defined in jdmerge-neon.c.
42 *
43 * To ensure correct results, rounding is used when descaling.
44 */
45
46 /* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
47 * routines:
48 *
49 * Input memory buffers can be safely overread up to the next multiple of
50 * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
51 * jmemmgr.c.
52 *
53 * The output buffer cannot safely be written beyond output_width, since
54 * output_buf points to a possibly unpadded row in the decompressed image
55 * buffer allocated by the calling program.
56 */
57
58 /* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
59 */
60
/* jsimd_h2v1_merged_upsample_neon()
 *
 * Merged h2v1 (2:1 horizontal, 1:1 vertical) chroma upsampling and
 * YCbCr -> RGB conversion for one output row.
 *
 * output_width      number of pixels in the output row
 * input_buf         Y, Cb, and Cr component planes
 * in_row_group_ctr  index of the row (group) to process
 * output_buf        destination RGB row(s); only row 0 is written here
 */
void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
                                     JSAMPIMAGE input_buf,
                                     JDIMENSION in_row_group_ctr,
                                     JSAMPARRAY output_buf)
{
  JSAMPROW outptr;
  /* Pointers to Y, Cb, and Cr data */
  JSAMPROW inptr0, inptr1, inptr2;

  /* Load the scaled integer conversion constants (defined in jdmerge-neon.c)
   * into one vector so the *_lane_* intrinsics can index them.
   */
  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
  const int16x8_t neg_128 = vdupq_n_s16(-128);

  inptr0 = input_buf[0][in_row_group_ctr];
  inptr1 = input_buf[1][in_row_group_ctr];
  inptr2 = input_buf[2][in_row_group_ctr];
  outptr = output_buf[0];

  /* NOTE(review): output_width is JDIMENSION (unsigned); a value above
   * INT_MAX would overflow this conversion.  JPEG dimensions are bounded to
   * 65535, so this cannot occur in practice.
   */
  int cols_remaining = output_width;
  /* Main loop: 16 output pixels (16 Y samples, 8 Cb/Cr samples) per
   * iteration.
   */
  for (; cols_remaining >= 16; cols_remaining -= 16) {
    /* De-interleave Y component values into two separate vectors, one
     * containing the component values with even-numbered indices and one
     * containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y = vld2_u8(inptr0);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr.  (vaddw_u8 widens the u8 chroma to 16 bits
     * while adding; the reinterpret casts let -128 be used as the addend.)
     */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
     * "odd" Y component values.  This effectively upsamples the chroma
     * components horizontally.
     */
    int16x8_t g_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[0]));
    int16x8_t r_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[0]));
    int16x8_t b_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[0]));
    int16x8_t g_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[1]));
    int16x8_t r_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[1]));
    int16x8_t b_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));

#ifdef RGB_ALPHA
    uint8x16x4_t rgba;
    rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
    rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
    rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
    /* Set alpha channel to opaque (0xFF). */
    rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    vst4q_u8(outptr, rgba);
#else
    uint8x16x3_t rgb;
    rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
    rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
    rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
    /* Store RGB pixel data to memory. */
    vst3q_u8(outptr, rgb);
#endif

    /* Increment pointers. */
    inptr0 += 16;
    inptr1 += 8;
    inptr2 += 8;
    outptr += (RGB_PIXELSIZE * 16);
  }

  /* Tail: handle the last 1..15 columns.  The full-vector loads below may
   * overread the input rows, which is safe (see notes at the top of this
   * file); the stores, however, are bounded to exactly cols_remaining pixels
   * by the fallthrough switch.
   */
  if (cols_remaining > 0) {
    /* De-interleave Y component values into two separate vectors, one
     * containing the component values with even-numbered indices and one
     * containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y = vld2_u8(inptr0);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
     * "odd" Y component values.  This effectively upsamples the chroma
     * components horizontally.
     */
    int16x8_t g_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[0]));
    int16x8_t r_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[0]));
    int16x8_t b_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[0]));
    int16x8_t g_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[1]));
    int16x8_t r_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[1]));
    int16x8_t b_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));

#ifdef RGB_ALPHA
    /* rgba_h holds pixels 8-15, rgba_l holds pixels 0-7. */
    uint8x8x4_t rgba_h;
    rgba_h.val[RGB_RED] = r.val[1];
    rgba_h.val[RGB_GREEN] = g.val[1];
    rgba_h.val[RGB_BLUE] = b.val[1];
    /* Set alpha channel to opaque (0xFF). */
    rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    uint8x8x4_t rgba_l;
    rgba_l.val[RGB_RED] = r.val[0];
    rgba_l.val[RGB_GREEN] = g.val[0];
    rgba_l.val[RGB_BLUE] = b.val[0];
    /* Set alpha channel to opaque (0xFF). */
    rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    /* Store RGBA pixel data to memory.  Each case stores one pixel and
     * deliberately falls through, so exactly cols_remaining pixels are
     * written, highest index first.
     */
    switch (cols_remaining) {
    case 15:
      vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
      /* FALLTHROUGH */
    case 14:
      vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
      /* FALLTHROUGH */
    case 13:
      vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
      /* FALLTHROUGH */
    case 12:
      vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
      /* FALLTHROUGH */
    case 11:
      vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
      /* FALLTHROUGH */
    case 10:
      vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
      /* FALLTHROUGH */
    case 9:
      vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
      /* FALLTHROUGH */
    case 8:
      /* Pixels 0-7 can be stored with one full interleaved store. */
      vst4_u8(outptr, rgba_l);
      break;
    case 7:
      vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
      /* FALLTHROUGH */
    case 6:
      vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
      /* FALLTHROUGH */
    case 5:
      vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
      /* FALLTHROUGH */
    case 4:
      vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
      /* FALLTHROUGH */
    case 3:
      vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
      /* FALLTHROUGH */
    case 2:
      vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
      /* FALLTHROUGH */
    case 1:
      vst4_lane_u8(outptr, rgba_l, 0);
      /* FALLTHROUGH */
    default:
      break;
    }
#else
    /* rgb_h holds pixels 8-15, rgb_l holds pixels 0-7. */
    uint8x8x3_t rgb_h;
    rgb_h.val[RGB_RED] = r.val[1];
    rgb_h.val[RGB_GREEN] = g.val[1];
    rgb_h.val[RGB_BLUE] = b.val[1];
    uint8x8x3_t rgb_l;
    rgb_l.val[RGB_RED] = r.val[0];
    rgb_l.val[RGB_GREEN] = g.val[0];
    rgb_l.val[RGB_BLUE] = b.val[0];
    /* Store RGB pixel data to memory.  Each case stores one pixel and
     * deliberately falls through, so exactly cols_remaining pixels are
     * written, highest index first.
     */
    switch (cols_remaining) {
    case 15:
      vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
      /* FALLTHROUGH */
    case 14:
      vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
      /* FALLTHROUGH */
    case 13:
      vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
      /* FALLTHROUGH */
    case 12:
      vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
      /* FALLTHROUGH */
    case 11:
      vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
      /* FALLTHROUGH */
    case 10:
      vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
      /* FALLTHROUGH */
    case 9:
      vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
      /* FALLTHROUGH */
    case 8:
      /* Pixels 0-7 can be stored with one full interleaved store. */
      vst3_u8(outptr, rgb_l);
      break;
    case 7:
      vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
      /* FALLTHROUGH */
    case 6:
      vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
      /* FALLTHROUGH */
    case 5:
      vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
      /* FALLTHROUGH */
    case 4:
      vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
      /* FALLTHROUGH */
    case 3:
      vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
      /* FALLTHROUGH */
    case 2:
      vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
      /* FALLTHROUGH */
    case 1:
      vst3_lane_u8(outptr, rgb_l, 0);
      /* FALLTHROUGH */
    default:
      break;
    }
#endif
  }
}
309
310
311 /* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
312 *
313 * See comments above for details regarding color conversion and safe memory
314 * access.
315 */
316
/* jsimd_h2v2_merged_upsample_neon()
 *
 * Merged h2v2 (2:1 horizontal, 2:1 vertical) chroma upsampling and
 * YCbCr -> RGB conversion.  One Cb/Cr row is shared between two Y rows, so
 * two output rows are produced per call.
 *
 * output_width      number of pixels in each output row
 * input_buf         Y, Cb, and Cr component planes
 * in_row_group_ctr  index of the row group (two Y rows, one Cb/Cr row)
 * output_buf        destination RGB rows; rows 0 and 1 are written
 */
void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
                                     JSAMPIMAGE input_buf,
                                     JDIMENSION in_row_group_ctr,
                                     JSAMPARRAY output_buf)
{
  JSAMPROW outptr0, outptr1;
  /* Pointers to Y (both rows), Cb, and Cr data */
  JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;

  /* Load the scaled integer conversion constants (defined in jdmerge-neon.c)
   * into one vector so the *_lane_* intrinsics can index them.
   */
  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
  const int16x8_t neg_128 = vdupq_n_s16(-128);

  /* Each row group contains two luma rows but only one chroma row. */
  inptr0_0 = input_buf[0][in_row_group_ctr * 2];
  inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
  inptr1 = input_buf[1][in_row_group_ctr];
  inptr2 = input_buf[2][in_row_group_ctr];
  outptr0 = output_buf[0];
  outptr1 = output_buf[1];

  /* NOTE(review): output_width is JDIMENSION (unsigned); a value above
   * INT_MAX would overflow this conversion.  JPEG dimensions are bounded to
   * 65535, so this cannot occur in practice.
   */
  int cols_remaining = output_width;
  /* Main loop: 16 output pixels per row (2 x 16 Y samples, 8 Cb/Cr samples)
   * per iteration.
   */
  for (; cols_remaining >= 16; cols_remaining -= 16) {
    /* For each row, de-interleave Y component values into two separate
     * vectors, one containing the component values with even-numbered indices
     * and one containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y0 = vld2_u8(inptr0_0);
    uint8x8x2_t y1 = vld2_u8(inptr0_1);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr.  (vaddw_u8 widens the u8 chroma to 16 bits
     * while adding; the reinterpret casts let -128 be used as the addend.)
     */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
     * the "even" and "odd" Y component values.  This effectively upsamples the
     * chroma components both horizontally and vertically.
     */
    int16x8_t g0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[0]));
    int16x8_t r0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[0]));
    int16x8_t b0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[0]));
    int16x8_t g0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[1]));
    int16x8_t r0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[1]));
    int16x8_t b0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[1]));
    int16x8_t g1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[0]));
    int16x8_t r1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[0]));
    int16x8_t b1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[0]));
    int16x8_t g1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[1]));
    int16x8_t r1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[1]));
    int16x8_t b1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));

#ifdef RGB_ALPHA
    uint8x16x4_t rgba0, rgba1;
    rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
    rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
    rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
    rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
    rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
    rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
    /* Set alpha channel to opaque (0xFF). */
    rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    vst4q_u8(outptr0, rgba0);
    vst4q_u8(outptr1, rgba1);
#else
    uint8x16x3_t rgb0, rgb1;
    rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
    rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
    rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
    rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
    rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
    rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
    /* Store RGB pixel data to memory. */
    vst3q_u8(outptr0, rgb0);
    vst3q_u8(outptr1, rgb1);
#endif

    /* Increment pointers. */
    inptr0_0 += 16;
    inptr0_1 += 16;
    inptr1 += 8;
    inptr2 += 8;
    outptr0 += (RGB_PIXELSIZE * 16);
    outptr1 += (RGB_PIXELSIZE * 16);
  }

  /* Tail: handle the last 1..15 columns.  The full-vector loads below may
   * overread the input rows, which is safe (see notes at the top of this
   * file); the stores, however, are bounded to exactly cols_remaining pixels
   * per row by the fallthrough switch.
   */
  if (cols_remaining > 0) {
    /* For each row, de-interleave Y component values into two separate
     * vectors, one containing the component values with even-numbered indices
     * and one containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y0 = vld2_u8(inptr0_0);
    uint8x8x2_t y1 = vld2_u8(inptr0_1);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
     * the "even" and "odd" Y component values.  This effectively upsamples the
     * chroma components both horizontally and vertically.
     */
    int16x8_t g0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[0]));
    int16x8_t r0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[0]));
    int16x8_t b0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[0]));
    int16x8_t g0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[1]));
    int16x8_t r0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[1]));
    int16x8_t b0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[1]));
    int16x8_t g1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[0]));
    int16x8_t r1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[0]));
    int16x8_t b1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[0]));
    int16x8_t g1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[1]));
    int16x8_t r1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[1]));
    int16x8_t b1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));

#ifdef RGB_ALPHA
    /* *_h structures hold pixels 8-15, *_l structures hold pixels 0-7. */
    uint8x8x4_t rgba0_h, rgba1_h;
    rgba0_h.val[RGB_RED] = r0.val[1];
    rgba1_h.val[RGB_RED] = r1.val[1];
    rgba0_h.val[RGB_GREEN] = g0.val[1];
    rgba1_h.val[RGB_GREEN] = g1.val[1];
    rgba0_h.val[RGB_BLUE] = b0.val[1];
    rgba1_h.val[RGB_BLUE] = b1.val[1];
    /* Set alpha channel to opaque (0xFF). */
    rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);

    uint8x8x4_t rgba0_l, rgba1_l;
    rgba0_l.val[RGB_RED] = r0.val[0];
    rgba1_l.val[RGB_RED] = r1.val[0];
    rgba0_l.val[RGB_GREEN] = g0.val[0];
    rgba1_l.val[RGB_GREEN] = g1.val[0];
    rgba0_l.val[RGB_BLUE] = b0.val[0];
    rgba1_l.val[RGB_BLUE] = b1.val[0];
    /* Set alpha channel to opaque (0xFF). */
    rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    /* Store RGBA pixel data to memory.  Each case stores one pixel in each of
     * the two output rows and deliberately falls through, so exactly
     * cols_remaining pixels per row are written, highest index first.
     */
    switch (cols_remaining) {
    case 15:
      vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
      vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
      /* FALLTHROUGH */
    case 14:
      vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
      vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
      /* FALLTHROUGH */
    case 13:
      vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
      vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
      /* FALLTHROUGH */
    case 12:
      vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
      vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
      /* FALLTHROUGH */
    case 11:
      vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
      vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
      /* FALLTHROUGH */
    case 10:
      vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
      vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
      /* FALLTHROUGH */
    case 9:
      vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
      vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
      /* FALLTHROUGH */
    case 8:
      /* Pixels 0-7 of each row can be stored with one full interleaved
       * store.
       */
      vst4_u8(outptr0, rgba0_l);
      vst4_u8(outptr1, rgba1_l);
      break;
    case 7:
      vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
      vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
      /* FALLTHROUGH */
    case 6:
      vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
      vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
      /* FALLTHROUGH */
    case 5:
      vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
      vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
      /* FALLTHROUGH */
    case 4:
      vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
      vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
      /* FALLTHROUGH */
    case 3:
      vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
      vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
      /* FALLTHROUGH */
    case 2:
      vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
      vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
      /* FALLTHROUGH */
    case 1:
      vst4_lane_u8(outptr0, rgba0_l, 0);
      vst4_lane_u8(outptr1, rgba1_l, 0);
      /* FALLTHROUGH */
    default:
      break;
    }
#else
    /* *_h structures hold pixels 8-15, *_l structures hold pixels 0-7. */
    uint8x8x3_t rgb0_h, rgb1_h;
    rgb0_h.val[RGB_RED] = r0.val[1];
    rgb1_h.val[RGB_RED] = r1.val[1];
    rgb0_h.val[RGB_GREEN] = g0.val[1];
    rgb1_h.val[RGB_GREEN] = g1.val[1];
    rgb0_h.val[RGB_BLUE] = b0.val[1];
    rgb1_h.val[RGB_BLUE] = b1.val[1];

    uint8x8x3_t rgb0_l, rgb1_l;
    rgb0_l.val[RGB_RED] = r0.val[0];
    rgb1_l.val[RGB_RED] = r1.val[0];
    rgb0_l.val[RGB_GREEN] = g0.val[0];
    rgb1_l.val[RGB_GREEN] = g1.val[0];
    rgb0_l.val[RGB_BLUE] = b0.val[0];
    rgb1_l.val[RGB_BLUE] = b1.val[0];
    /* Store RGB pixel data to memory.  Each case stores one pixel in each of
     * the two output rows and deliberately falls through, so exactly
     * cols_remaining pixels per row are written, highest index first.
     */
    switch (cols_remaining) {
    case 15:
      vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
      vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
      /* FALLTHROUGH */
    case 14:
      vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
      vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
      /* FALLTHROUGH */
    case 13:
      vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
      vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
      /* FALLTHROUGH */
    case 12:
      vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
      vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
      /* FALLTHROUGH */
    case 11:
      vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
      vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
      /* FALLTHROUGH */
    case 10:
      vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
      vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
      /* FALLTHROUGH */
    case 9:
      vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
      vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
      /* FALLTHROUGH */
    case 8:
      /* Pixels 0-7 of each row can be stored with one full interleaved
       * store.
       */
      vst3_u8(outptr0, rgb0_l);
      vst3_u8(outptr1, rgb1_l);
      break;
    case 7:
      vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
      vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
      /* FALLTHROUGH */
    case 6:
      vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
      vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
      /* FALLTHROUGH */
    case 5:
      vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
      vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
      /* FALLTHROUGH */
    case 4:
      vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
      vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
      /* FALLTHROUGH */
    case 3:
      vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
      vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
      /* FALLTHROUGH */
    case 2:
      vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
      vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
      /* FALLTHROUGH */
    case 1:
      vst3_lane_u8(outptr0, rgb0_l, 0);
      vst3_lane_u8(outptr1, rgb1_l, 0);
      /* FALLTHROUGH */
    default:
      break;
    }
#endif
  }
}
668