// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// RGB -> YUV conversion
//
//   Y       = ( 19595 * r + 38469 * g +  7471 * b + HALF) >> FRAC
//   U - 128 = (-11059 * r - 21709 * g + 32768 * b + HALF) >> FRAC
//   V - 128 = ( 32768 * r - 27439 * g -  5329 * b + HALF) >> FRAC
//
// Author: Skal (pascal.massimino@gmail.com)

#include <string.h>

#define SJPEG_NEED_ASM_HEADERS
#include "sjpegi.h"

namespace sjpeg {

// global fixed-point precision
enum { FRAC = 16, HALF = 1 << FRAC >> 1,
       ROUND_UV = (HALF << 2), ROUND_Y = HALF - (128 << FRAC) };
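
// Illustrative sketch (not part of the build): a scalar reference for the
// fixed-point formulas above. ROUND_Y folds the rounding term HALF together
// with the -128 level shift applied to Y, so RefY() returns Y - 128
// (RefY/RefU are hypothetical helper names):
//
//   static int16_t RefY(uint8_t r, uint8_t g, uint8_t b) {
//     return (int16_t)((19595 * r + 38469 * g + 7471 * b + ROUND_Y) >> FRAC);
//   }
//   static int16_t RefU(uint8_t r, uint8_t g, uint8_t b) {   // U - 128
//     return (int16_t)((-11059 * r - 21709 * g + 32768 * b + HALF) >> FRAC);
//   }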

#if defined(SJPEG_USE_SSE2)

// Load eight 16b-words from *src.
#define LOAD_16(src) _mm_loadu_si128(reinterpret_cast<const __m128i*>(src))
// Store eight 16b-words into *dst
#define STORE_16(V, dst) _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), (V))

// Convert 8 packed RGB samples to r[], g[], b[]
static inline void RGB24PackedToPlanar(const uint8_t* const rgb,
                                       __m128i* const r,
                                       __m128i* const g,
                                       __m128i* const b) {
  const __m128i zero = _mm_setzero_si128();
  // in0: r0 g0 b0 r1 | g1 b1 r2 g2 | b2 r3 g3 b3 | r4 g4 b4 r5
  // in1: b2 r3 g3 b3 | r4 g4 b4 r5 | g5 b5 r6 g6 | b6 r7 g7 b7
  const __m128i in0 = LOAD_16(rgb + 0);
  const __m128i in1 = LOAD_16(rgb + 8);
  // A0: | r2 g2 b2 r3 | g3 b3 r4 g4 | b4 r5 ...
  // A1: ... b2 r3 | g3 b3 r4 g4 | b4 r5 g5 b5 |
  const __m128i A0 = _mm_srli_si128(in0, 6);
  const __m128i A1 = _mm_slli_si128(in1, 6);
  // B0: r0 r2 g0 g2 | b0 b2 r1 r3 | g1 g3 b1 b3 | r2 r4 b2 b4
  // B1: g3 g5 b3 b5 | r4 r6 g4 g6 | b4 b6 r5 r7 | g5 g7 b5 b7
  const __m128i B0 = _mm_unpacklo_epi8(in0, A0);
  const __m128i B1 = _mm_unpackhi_epi8(A1, in1);
  // C0: r1 r3 g1 g3 | b1 b3 r2 r4 | b2 b4 ...
  // C1: ... g3 g5 | b3 b5 r4 r6 | g4 g6 b4 b6
  const __m128i C0 = _mm_srli_si128(B0, 6);
  const __m128i C1 = _mm_slli_si128(B1, 6);
  // D0: r0 r1 r2 r3 | g0 g1 g2 g3 | b0 b1 b2 b3 | r1 r2 r3 r4
  // D1: b3 b4 b5 b6 | r4 r5 r6 r7 | g4 g5 g6 g7 | b4 b5 b6 b7 |
  const __m128i D0 = _mm_unpacklo_epi8(B0, C0);
  const __m128i D1 = _mm_unpackhi_epi8(C1, B1);
  // r4 r5 r6 r7 | g4 g5 g6 g7 | b4 b5 b6 b7 | 0
  const __m128i D2 = _mm_srli_si128(D1, 4);
  // r0 r1 r2 r3 | r4 r5 r6 r7 | g0 g1 g2 g3 | g4 g5 g6 g7
  const __m128i E0 = _mm_unpacklo_epi32(D0, D2);
  // b0 b1 b2 b3 | b4 b5 b6 b7 | r1 r2 r3 r4 | 0
  const __m128i E1 = _mm_unpackhi_epi32(D0, D2);
  // g0 g1 g2 g3 | g4 g5 g6 g7 | 0
  const __m128i E2 = _mm_srli_si128(E0, 8);
  const __m128i F0 = _mm_unpacklo_epi8(E0, zero);   // -> R
  const __m128i F1 = _mm_unpacklo_epi8(E1, zero);   // -> B
  const __m128i F2 = _mm_unpacklo_epi8(E2, zero);   // -> G
  *r = F0;
  *b = F1;
  *g = F2;
}
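
// Illustrative sketch (not part of the build): the shuffle cascade above is a
// branch-free equivalent of this scalar deinterleave, with each channel then
// zero-extended from 8 to 16 bits per sample:
//
//   for (int i = 0; i < 8; ++i) {
//     r16[i] = rgb[3 * i + 0];
//     g16[i] = rgb[3 * i + 1];
//     b16[i] = rgb[3 * i + 2];
//   }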

// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
// It's a macro and not a function because _mm_srai_epi32() requires its
// shift amount (DESCALE_FIX) to be an immediate constant.
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB,     \
                  ROUNDER, DESCALE_FIX, ADD_OR_SUB, OUT) do {       \
  const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG);             \
  const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG);             \
  const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB);             \
  const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB);             \
  const __m128i V2_lo = ADD_OR_SUB(V0_lo, V1_lo);                   \
  const __m128i V2_hi = ADD_OR_SUB(V0_hi, V1_hi);                   \
  const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER);              \
  const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER);              \
  const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX);         \
  const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX);         \
  (OUT) = _mm_packs_epi32(V5_lo, V5_hi);                            \
} while (0)
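
// Illustrative sketch (not part of the build): scalar equivalent of one
// 32-bit lane of TRANSFORM(), where (A, B) and (C, D) are the 16-bit pairs
// packed into MULT_RG and MULT_GB by MK_CST_16 below:
//
//   int32_t v = A * r + B * g;        // _mm_madd_epi16(RG, MULT_RG)
//   v = v +/- (C * g + D * b);        // ADD_OR_SUB of the GB dot product
//   int16_t out = (v + rounder) >> descale_fix;
//
// For Y, the 38469 coefficient exceeds the int16_t range, hence the
// 38469 - 16384 / 16384 split of the g term across the two dot products
// in ConvertRGBToY() below.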

#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))

inline void ConvertRGBToY(const __m128i* const R,
                          const __m128i* const G,
                          const __m128i* const B,
                          int offset,
                          __m128i* const Y) {
  const __m128i kRG_y = MK_CST_16(19595, 38469 - 16384);
  const __m128i kGB_y = MK_CST_16(16384, 7471);
  const __m128i kROUND_Y = _mm_set1_epi32(HALF + (offset << FRAC));
  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kROUND_Y, FRAC,
            _mm_add_epi32, *Y);
}

inline void ConvertRGBToUV(const __m128i* const R,
                           const __m128i* const G,
                           const __m128i* const B,
                           int offset,
                           __m128i* const U, __m128i* const V) {
  // Warning! 32768 overflows int16_t, so we're actually multiplying by
  // -32768 instead of 32768. We compensate by subtracting the result
  // instead of adding, thus restoring the sign.
  const __m128i kRG_u = MK_CST_16(-11059, -21709);
  const __m128i kGB_u = MK_CST_16(0, -32768);
  const __m128i kRG_v = MK_CST_16(-32768, 0);
  const __m128i kGB_v = MK_CST_16(-27439, -5329);
  const __m128i kRound = _mm_set1_epi32((offset << FRAC) + HALF);

  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);

  // _mm_sub_epi32 -> sign restore!
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u, kRound, FRAC,
            _mm_sub_epi32, *U);
  // note! GB and RG are swapped, for sign restoration
  TRANSFORM(GB_lo, GB_hi, RG_lo, RG_hi, kGB_v, kRG_v, kRound, FRAC,
            _mm_sub_epi32, *V);
}
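
// Illustrative sketch (not part of the build): the arithmetic behind the
// sign restoration above, shown for U (V is symmetric, with RG and GB
// swapped so that the 32768 term lands in the subtracted operand):
//
//   V0 = -11059 * r - 21709 * g    // madd with kRG_u (true coefficients)
//   V1 = -32768 * b                // madd with kGB_u (sign flipped)
//   U  = V0 - V1                   // = -11059 * r - 21709 * g + 32768 * b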

// This version takes four accumulated R/G/B samples. Hence, the
// descaling factor is FRAC + 2.
inline void ConvertRGBToUVAccumulated(const __m128i* const R,
                                      const __m128i* const G,
                                      const __m128i* const B,
                                      __m128i* const U, __m128i* const V) {
  // Warning! 32768 overflows int16_t, so we're actually multiplying by
  // -32768 instead of 32768. We compensate by subtracting the result
  // instead of adding, thus restoring the sign.
  const __m128i kRG_u = MK_CST_16(-11059, -21709);
  const __m128i kGB_u = MK_CST_16(0, -32768);
  const __m128i kRG_v = MK_CST_16(-32768, 0);
  const __m128i kGB_v = MK_CST_16(-27439, -5329);
  const __m128i kRound = _mm_set1_epi32(ROUND_UV);

  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);

  // _mm_sub_epi32 -> sign restore!
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
            kRound, FRAC + 2, _mm_sub_epi32, *U);
  // note! GB and RG are swapped, for sign restoration
  TRANSFORM(GB_lo, GB_hi, RG_lo, RG_hi, kGB_v, kRG_v,
            kRound, FRAC + 2, _mm_sub_epi32, *V);
}
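
// Illustrative sketch (not part of the build): each input lane holds the sum
// of a 2x2 box of samples, so the extra >> 2 folds the 4:2:0 averaging into
// the fixed-point descale:
//
//   U = (-11059 * r4 - 21709 * g4 + 32768 * b4 + ROUND_UV) >> (FRAC + 2)
//
// where r4/g4/b4 are the 2x2 sums and ROUND_UV = HALF << 2 scales the
// rounding bias accordingly.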

#undef MK_CST_16
#undef TRANSFORM

// Convert 8 RGB samples to YUV. out[] points to a 3*64 data block.
static inline void ToYUV_8(const __m128i* const r,
                           const __m128i* const g,
                           const __m128i* const b,
                           int16_t* const out) {
  __m128i Y, U, V;
  ConvertRGBToY(r, g, b, -128, &Y);
  ConvertRGBToUV(r, g, b, 0, &U, &V);
  STORE_16(Y, out + 0 * 64);
  STORE_16(U, out + 1 * 64);
  STORE_16(V, out + 2 * 64);
}

static void Get8x8Block_SSE2(const uint8_t* data, int step, int16_t* out) {
  for (int y = 8; y > 0; --y) {
    __m128i r, g, b;
    RGB24PackedToPlanar(data, &r, &g, &b);
    ToYUV_8(&r, &g, &b, out);
    out += 8;
    data += step;
  }
}

// Convert 16x16 RGB samples to YUV420
static inline void ToY_16x16(const __m128i* const r,
                             const __m128i* const g,
                             const __m128i* const b,
                             int16_t* const y_out,
                             __m128i* const R_acc,
                             __m128i* const G_acc,
                             __m128i* const B_acc,
                             bool do_add) {
  __m128i Y;
  ConvertRGBToY(r, g, b, -128, &Y);
  STORE_16(Y, y_out);
  if (do_add) {
    *R_acc = _mm_add_epi16(*R_acc, *r);
    *G_acc = _mm_add_epi16(*G_acc, *g);
    *B_acc = _mm_add_epi16(*B_acc, *b);
  } else {    // just store
    *R_acc = *r;
    *G_acc = *g;
    *B_acc = *b;
  }
}

static inline void ToUV_8x8(const __m128i* const R,
                            const __m128i* const G,
                            const __m128i* const B,
                            int16_t* const uv_out) {
  __m128i U, V;
  ConvertRGBToUVAccumulated(R, G, B, &U, &V);
  STORE_16(U, uv_out + 0 * 64);
  STORE_16(V, uv_out + 1 * 64);
}

static void Condense16To8(const __m128i* const acc1, __m128i* const acc2) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i tmp1 = _mm_madd_epi16(*acc1, one);
  const __m128i tmp2 = _mm_madd_epi16(*acc2, one);
  *acc2 = _mm_packs_epi32(tmp1, tmp2);
}
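
// Illustrative sketch (not part of the build): multiplying by a vector of
// ones turns _mm_madd_epi16() into a horizontal pairwise add, condensing two
// vectors of eight vertical sums into one vector of eight 2x2 box sums:
//
//   tmp1[i] = acc1[2 * i] + acc1[2 * i + 1]    // i = 0..3, 32-bit lanes
//   tmp2[i] = acc2[2 * i] + acc2[2 * i + 1]
//   *acc2 = pack_saturate_16(tmp1, tmp2)       // back to eight 16-bit lanes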

// convert two 16x8 RGB blocks into two blocks of luma, and two blocks of U/V
static void Get16x8_SSE2(const uint8_t* src1, int src_stride,
                         int16_t y[4 * 64], int16_t uv[2 * 64]) {
  for (int i = 4; i > 0; --i, src1 += 2 * src_stride) {
    __m128i r_acc1, r_acc2, g_acc1, g_acc2, b_acc1, b_acc2;
    __m128i r, g, b;
    const uint8_t* const src2 = src1 + src_stride;
    RGB24PackedToPlanar(src1 + 0 * 8, &r, &g, &b);
    ToY_16x16(&r, &g, &b, y + 0 * 64 + 0, &r_acc1, &g_acc1, &b_acc1, false);
    RGB24PackedToPlanar(src1 + 3 * 8, &r, &g, &b);
    ToY_16x16(&r, &g, &b, y + 1 * 64 + 0, &r_acc2, &g_acc2, &b_acc2, false);
    RGB24PackedToPlanar(src2 + 0 * 8, &r, &g, &b);
    ToY_16x16(&r, &g, &b, y + 0 * 64 + 8, &r_acc1, &g_acc1, &b_acc1, true);
    RGB24PackedToPlanar(src2 + 3 * 8, &r, &g, &b);
    ToY_16x16(&r, &g, &b, y + 1 * 64 + 8, &r_acc2, &g_acc2, &b_acc2, true);
    Condense16To8(&r_acc1, &r_acc2);
    Condense16To8(&g_acc1, &g_acc2);
    Condense16To8(&b_acc1, &b_acc2);
    ToUV_8x8(&r_acc2, &g_acc2, &b_acc2, uv);
    y += 2 * 8;
    uv += 8;
  }
}

static void Get16x16Block_SSE2(const uint8_t* data, int step, int16_t* blocks) {
  Get16x8_SSE2(data + 0 * step, step, blocks + 0 * 64, blocks + 4 * 64 + 0 * 8);
  Get16x8_SSE2(data + 8 * step, step, blocks + 2 * 64, blocks + 4 * 64 + 4 * 8);
}

#undef LOAD_16
#undef STORE_16

#endif    // SJPEG_USE_SSE2

///////////////////////////////////////////////////////////////////////////////
// NEON-version for 8x8 and 16x16 blocks

#if defined(SJPEG_USE_NEON)

static const int16_t kCoeff1[4] = { (int16_t)38469, 19595, 7471, 0 };
static const int16_t kCoeff2[4] = { 21709, 11059, 27439, 5329 };

// Convert 8 packed RGB or BGR samples to r[], g[], b[]
static void RGB24PackedToPlanar(const uint8_t* const rgb,
                                int16x8_t* const r,
                                int16x8_t* const g,
                                int16x8_t* const b) {
  const uint8x8x3_t in = vld3_u8(rgb);
  *r = vreinterpretq_s16_u16(vmovl_u8(in.val[0]));
  *g = vreinterpretq_s16_u16(vmovl_u8(in.val[1]));
  *b = vreinterpretq_s16_u16(vmovl_u8(in.val[2]));
}

// s16->s32 widening multiply with large (>= 32768) coeff requires special care:
#define MULT_S32_S16_LARGE(S16, COEFF, LANE)                          \
    vreinterpretq_s32_u32(vmull_lane_u16(vreinterpret_u16_s16(S16),   \
                                         vreinterpret_u16_s16(COEFF), (LANE)))
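
// Illustrative sketch (not part of the build): 38469 does not fit in a signed
// 16-bit lane (kCoeff1[0] stores it as a negative int16_t), but the samples
// are non-negative, so reinterpreting both operands as u16 and doing an
// unsigned widening multiply recovers the intended product:
//
//   (uint16_t)(int16_t)38469 == 38469, and 38469 * g < 2^31 for g <= 255,
//
// so the u32 result can safely be reinterpreted back to s32.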

static inline void ConvertRGBToY(const int16x8_t R,
                                 const int16x8_t G,
                                 const int16x8_t B,
                                 const int16x4_t coeffs,
                                 int16x8_t* const Y) {
  int32x4_t lo = MULT_S32_S16_LARGE(vget_low_s16(G), coeffs, 0);
  int32x4_t hi = MULT_S32_S16_LARGE(vget_high_s16(G), coeffs, 0);
  lo = vmlal_lane_s16(lo, vget_low_s16(R), coeffs, 1);
  hi = vmlal_lane_s16(hi, vget_high_s16(R), coeffs, 1);
  lo = vmlal_lane_s16(lo, vget_low_s16(B), coeffs, 2);
  hi = vmlal_lane_s16(hi, vget_high_s16(B), coeffs, 2);
  const int16x8_t V0 = vcombine_s16(vqmovn_s32(vrshrq_n_s32(lo, FRAC)),
                                    vqmovn_s32(vrshrq_n_s32(hi, FRAC)));
  *Y = vsubq_s16(V0, vdupq_n_s16(128));
}

// Compute ((V0 << 15) - V1 * C1 - V2 * C2 + round) >> SHIFT
#define DOT_PROD_PREAMBLE(V0, V1, V2, COEFF, LANE1, LANE2)       \
  int32x4_t lo, hi;                                              \
  do {                                                           \
    lo = vshll_n_s16(vget_low_s16(V0), 15);                      \
    hi = vshll_n_s16(vget_high_s16(V0), 15);                     \
    lo = vmlsl_lane_s16(lo, vget_low_s16(V1), COEFF, LANE1);     \
    hi = vmlsl_lane_s16(hi, vget_high_s16(V1), COEFF, LANE1);    \
    lo = vmlsl_lane_s16(lo, vget_low_s16(V2), COEFF, LANE2);     \
    hi = vmlsl_lane_s16(hi, vget_high_s16(V2), COEFF, LANE2);    \
  } while (0)

// This version assumes SHIFT <= 16
#define DOT_PROD1(V0, V1, V2, COEFF, LANE1, LANE2, SHIFT, OUT) do {        \
  assert(SHIFT <= 16);                                                     \
  DOT_PROD_PREAMBLE(V0, V1, V2, COEFF, LANE1, LANE2);                      \
  (OUT) = vcombine_s16(vrshrn_n_s32(lo, SHIFT), vrshrn_n_s32(hi, SHIFT));  \
} while (0)

// alternate version for SHIFT > 16
#define DOT_PROD2(V0, V1, V2, COEFF, LANE1, LANE2, SHIFT, OUT) do {  \
  assert(SHIFT > 16);                                                \
  DOT_PROD_PREAMBLE(V0, V1, V2, COEFF, LANE1, LANE2);                \
  (OUT) = vcombine_s16(vqmovn_s32(vrshrq_n_s32(lo, SHIFT)),          \
                       vqmovn_s32(vrshrq_n_s32(hi, SHIFT)));         \
} while (0)
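
// Illustrative sketch (not part of the build): vshll_n_s16(V0, 15) computes
// 32768 * V0 without needing a multiplier >= 2^15, and the two vmlsl steps
// subtract the positive kCoeff2 entries. For U, DOT_PROD1(B, G, R, ...)
// evaluates
//
//   (32768 * b - 21709 * g - 11059 * r + round) >> SHIFT
//
// where the rounding term comes from the vrshrn*/vrshrq* rounding shifts.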

static inline void ConvertRGBToUV(const int16x8_t R,
                                  const int16x8_t G,
                                  const int16x8_t B,
                                  const int16x4_t coeffs,
                                  int16x8_t* const U, int16x8_t* const V) {
  DOT_PROD1(B, G, R, coeffs, 0, 1, FRAC, *U);
  DOT_PROD1(R, G, B, coeffs, 2, 3, FRAC, *V);
}

static inline void ConvertRGBToUVAccumulated(const int16x8_t R,
                                             const int16x8_t G,
                                             const int16x8_t B,
                                             const int16x4_t coeffs,
                                             int16x8_t* const U,
                                             int16x8_t* const V) {
  DOT_PROD2(B, G, R, coeffs, 0, 1, FRAC + 2, *U);
  DOT_PROD2(R, G, B, coeffs, 2, 3, FRAC + 2, *V);
}

// Convert 8 RGB samples to YUV. out[] points to a 3*64 data block.
static void ToYUV_8(const int16x8_t r, const int16x8_t g, const int16x8_t b,
                    const int16x4_t coeffs1, const int16x4_t coeffs2,
                    int16_t* const out) {
  int16x8_t Y, U, V;
  ConvertRGBToY(r, g, b, coeffs1, &Y);
  ConvertRGBToUV(r, g, b, coeffs2, &U, &V);
  vst1q_s16(out + 0 * 64, Y);
  vst1q_s16(out + 1 * 64, U);
  vst1q_s16(out + 2 * 64, V);
}

static void Get8x8Block_NEON(const uint8_t* data, int step, int16_t* out) {
  const int16x4_t kC1 = vld1_s16(kCoeff1);
  const int16x4_t kC2 = vld1_s16(kCoeff2);
  for (int y = 8; y > 0; --y) {
    int16x8_t r, g, b;
    RGB24PackedToPlanar(data, &r, &g, &b);
    ToYUV_8(r, g, b, kC1, kC2, out);
    out += 8;
    data += step;
  }
}

// Convert 16x16 RGB samples to YUV420
static inline void ToY_16x16(const int16x8_t r,
                             const int16x8_t g,
                             const int16x8_t b,
                             int16_t* const y_out,
                             int16x8_t* const R_acc,
                             int16x8_t* const G_acc,
                             int16x8_t* const B_acc,
                             const int16x4_t coeffs,
                             bool do_add) {
  int16x8_t Y;
  ConvertRGBToY(r, g, b, coeffs, &Y);
  vst1q_s16(y_out, Y);
  if (do_add) {
    *R_acc = vaddq_s16(*R_acc, r);
    *G_acc = vaddq_s16(*G_acc, g);
    *B_acc = vaddq_s16(*B_acc, b);
  } else {    // just store
    *R_acc = r;
    *G_acc = g;
    *B_acc = b;
  }
}

static inline void ToUV_8x8(const int16x8_t R,
                            const int16x8_t G,
                            const int16x8_t B,
                            const int16x4_t coeffs,
                            int16_t* const uv_out) {
  int16x8_t U, V;
  ConvertRGBToUVAccumulated(R, G, B, coeffs, &U, &V);
  vst1q_s16(uv_out + 0 * 64, U);
  vst1q_s16(uv_out + 1 * 64, V);
}

static void Condense16To8(const int16x8_t acc1, int16x8_t* const acc2) {
  const int32x4_t lo = vpaddlq_s16(acc1);
  const int32x4_t hi = vpaddlq_s16(*acc2);
  *acc2 = vcombine_s16(vqmovn_s32(lo), vqmovn_s32(hi));    // pack-saturate
}

// convert two 16x8 RGB blocks into two blocks of luma, and two blocks of U/V
static void Get16x8_NEON(const uint8_t* src1, int src_stride,
                         int16_t y[4 * 64], int16_t uv[2 * 64]) {
  const int16x4_t kC1 = vld1_s16(kCoeff1);
  const int16x4_t kC2 = vld1_s16(kCoeff2);
  for (int i = 4; i > 0; --i, src1 += 2 * src_stride) {
    int16x8_t r_acc1, r_acc2, g_acc1, g_acc2, b_acc1, b_acc2;
    int16x8_t r, g, b;
    const uint8_t* const src2 = src1 + src_stride;
    RGB24PackedToPlanar(src1 + 0 * 8, &r, &g, &b);
    ToY_16x16(r, g, b, y + 0 * 64 + 0, &r_acc1, &g_acc1, &b_acc1, kC1, false);
    RGB24PackedToPlanar(src1 + 3 * 8, &r, &g, &b);
    ToY_16x16(r, g, b, y + 1 * 64 + 0, &r_acc2, &g_acc2, &b_acc2, kC1, false);
    RGB24PackedToPlanar(src2 + 0 * 8, &r, &g, &b);
    ToY_16x16(r, g, b, y + 0 * 64 + 8, &r_acc1, &g_acc1, &b_acc1, kC1, true);
    RGB24PackedToPlanar(src2 + 3 * 8, &r, &g, &b);
    ToY_16x16(r, g, b, y + 1 * 64 + 8, &r_acc2, &g_acc2, &b_acc2, kC1, true);
    Condense16To8(r_acc1, &r_acc2);
    Condense16To8(g_acc1, &g_acc2);
    Condense16To8(b_acc1, &b_acc2);
    ToUV_8x8(r_acc2, g_acc2, b_acc2, kC2, uv);
    y += 2 * 8;
    uv += 8;
  }
}

static void Get16x16Block_NEON(const uint8_t* data, int step, int16_t* yuv) {
  int16_t* const uv = yuv + 4 * 64;
  Get16x8_NEON(data + 0 * step, step, yuv + 0 * 64, uv + 0 * 8);
  Get16x8_NEON(data + 8 * step, step, yuv + 2 * 64, uv + 4 * 8);
}

#undef MULT_S32_S16_LARGE
#undef DOT_PROD_PREAMBLE
#undef DOT_PROD1
#undef DOT_PROD2

#endif    // SJPEG_USE_NEON

///////////////////////////////////////////////////////////////////////////////
// C-version

// convert the rgb_in[3] triplet to Y, and accumulate it in rgb_sum[3]
static inline int16_t ToY(const uint8_t* const rgb_in, int* const rgb_sum) {
  const int r = rgb_in[0];
  const int g = rgb_in[1];
  const int b = rgb_in[2];
  rgb_sum[0] += r;
  rgb_sum[1] += g;
  rgb_sum[2] += b;
  const int y = 19595 * r + 38469 * g + 7471 * b + ROUND_Y;
  return static_cast<int16_t>(y >> FRAC);
}

// convert the sum of four rgb triplets to U
static inline int16_t ToU(const int* const rgb) {
  const int u = -11059 * rgb[0] - 21709 * rgb[1] + 32768 * rgb[2] + ROUND_UV;
  return static_cast<int16_t>(u >> (FRAC + 2));
}

// convert the sum of four rgb triplets to V
static inline int16_t ToV(const int* const rgb) {
  const int v = 32768 * rgb[0] - 27439 * rgb[1] - 5329 * rgb[2] + ROUND_UV;
  return static_cast<int16_t>(v >> (FRAC + 2));
}

// for 4:4:4 conversion: convert the rgb[3] triplet to yuv
static inline void ToYUV(const uint8_t* const rgb, int16_t* const out) {
  const int r = rgb[0];
  const int g = rgb[1];
  const int b = rgb[2];
  const int y = 19595 * r + 38469 * g + 7471 * b + ROUND_Y;
  const int u = -11059 * r - 21709 * g + 32768 * b + HALF;
  const int v = 32768 * r - 27439 * g - 5329 * b + HALF;
  out[0 * 64] = static_cast<int16_t>(y >> FRAC);
  out[1 * 64] = static_cast<int16_t>(u >> FRAC);
  out[2 * 64] = static_cast<int16_t>(v >> FRAC);
}

static void Get8x8Block_C(const uint8_t* data, int step, int16_t* out) {
  for (int y = 8; y > 0; --y) {
    for (int x = 0; x < 8; ++x) {
      ToYUV(data + 3 * x, out + x);
    }
    out += 8;
    data += step;
  }
}

void Get16x8Block_C(const uint8_t* src1, int src_stride,
                    int16_t yblock[4 * 64], int16_t uvblock[2 * 64]) {
  for (int y = 8; y > 0; y -= 2) {
    const uint8_t* const src2 = src1 + src_stride;
    for (int x = 0; x < 4; ++x) {
      int rgb[2][3];
      memset(rgb, 0, sizeof(rgb));
      yblock[2 * x    ] = ToY(src1 + 6 * x,     rgb[0]);
      yblock[2 * x + 1] = ToY(src1 + 6 * x + 3, rgb[0]);
      yblock[2 * x + 8] = ToY(src2 + 6 * x,     rgb[0]);
      yblock[2 * x + 9] = ToY(src2 + 6 * x + 3, rgb[0]);
      uvblock[0 * 64 + x] = ToU(rgb[0]);
      uvblock[1 * 64 + x] = ToV(rgb[0]);
      yblock[2 * x + 64]     = ToY(src1 + 3 * 8 + 6 * x,     rgb[1]);
      yblock[2 * x + 1 + 64] = ToY(src1 + 3 * 8 + 6 * x + 3, rgb[1]);
      yblock[2 * x + 8 + 64] = ToY(src2 + 3 * 8 + 6 * x,     rgb[1]);
      yblock[2 * x + 9 + 64] = ToY(src2 + 3 * 8 + 6 * x + 3, rgb[1]);
      uvblock[0 * 64 + x + 4] = ToU(rgb[1]);
      uvblock[1 * 64 + x + 4] = ToV(rgb[1]);
    }
    yblock += 2 * 8;
    uvblock += 8;
    src1 += 2 * src_stride;
  }
}

static void Get16x16Block_C(const uint8_t* rgb, int step, int16_t* yuv) {
  Get16x8Block_C(rgb + 0 * step, step, yuv + 0 * 64, yuv + 4 * 64 + 0 * 8);
  Get16x8Block_C(rgb + 8 * step, step, yuv + 2 * 64, yuv + 4 * 64 + 4 * 8);
}

///////////////////////////////////////////////////////////////////////////////

RGBToYUVBlockFunc GetBlockFunc(bool use_444) {
#if defined(SJPEG_USE_SSE2)
  if (SupportsSSE2()) return use_444 ? Get8x8Block_SSE2
                                     : Get16x16Block_SSE2;
#elif defined(SJPEG_USE_NEON)
  if (SupportsNEON()) return use_444 ? Get8x8Block_NEON
                                     : Get16x16Block_NEON;
#endif
  return use_444 ? Get8x8Block_C : Get16x16Block_C;    // default
}

///////////////////////////////////////////////////////////////////////////////

namespace {

uint8_t clip_8b(int v) {
  return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
}
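
// Illustrative sketch (not part of the build): (v & ~0xff) is zero exactly
// when 0 <= v <= 255, so the common in-range case costs a single
// mask-and-test:
//
//   clip_8b(-3)  == 0      // mask non-zero, v < 0
//   clip_8b(128) == 128    // mask zero: v already fits in a byte
//   clip_8b(300) == 255    // mask non-zero, v >= 0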

int ToY(int r, int g, int b) {
  const int luma = 19595 * r + 38469 * g + 7471 * b;
  return (luma + HALF) >> FRAC;    // no need to clip
}

uint32_t clip_uv(int v) {
  return clip_8b(128 + ((v + HALF) >> FRAC));
}

uint32_t ToU(int r, int g, int b) {
  const int u = -11059 * r - 21709 * g + 32768 * b;
  return clip_uv(u);
}

uint32_t ToV(int r, int g, int b) {
  const int v = 32768 * r - 27439 * g - 5329 * b;
  return clip_uv(v);
}

// (X * 0x0101 >> 16) ~= X / 255
uint32_t Convert(uint32_t v) {
  return (v * (0x0101u * (sjpeg::kRGBSize - 1))) >> 16;
}
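
// Illustrative sketch (not part of the build): 0x0101 = 257 and 257 / 65536
// is just below 1 / 255, so the multiply-shift approximates a division by
// 255, mapping an 8-bit value onto the [0, kRGBSize - 1] index range:
//
//   (1000 * 0x0101) >> 16 == 3    // 1000 / 255 = 3.92... -> 3
//   ( 300 * 0x0101) >> 16 == 1    //  300 / 255 = 1.17... -> 1
//
// (It may undershoot floor(X / 255) by one when X is a multiple of 255.)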

int ConvertToYUVIndex(const uint8_t* const rgb) {
  const int r = rgb[0];
  const int g = rgb[1];
  const int b = rgb[2];
  const uint32_t y = Convert(ToY(r, g, b));
  const uint32_t u = Convert(ToU(r, g, b));
  const uint32_t v = Convert(ToV(r, g, b));
  return (y + u * sjpeg::kRGBSize + v * sjpeg::kRGBSize * sjpeg::kRGBSize);
}

void RowToIndexC(const uint8_t* rgb, int width, uint16_t* dst) {
  for (int i = 0; i < width; ++i, rgb += 3) {
    dst[i] = ConvertToYUVIndex(rgb);
  }
}

#if defined(SJPEG_USE_SSE2)
void RowToIndexSSE2(const uint8_t* rgb, int width, uint16_t* dst) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i mult = _mm_set1_epi16(0x0101u * (sjpeg::kRGBSize - 1));
  const __m128i mult1 = _mm_set1_epi16(sjpeg::kRGBSize);
  const __m128i mult2 = _mm_set1_epi16(sjpeg::kRGBSize * sjpeg::kRGBSize);
  const __m128i k255 = _mm_set1_epi16(255);
  while (width >= 8) {
    __m128i r, g, b;
    __m128i Y, U, V;
    RGB24PackedToPlanar(rgb, &r, &g, &b);
    ConvertRGBToY(&r, &g, &b, 0, &Y);
    ConvertRGBToUV(&r, &g, &b, 128, &U, &V);
    // clamping to [0, 255]
    const __m128i y1 = _mm_min_epi16(_mm_max_epi16(Y, zero), k255);
    const __m128i u1 = _mm_min_epi16(_mm_max_epi16(U, zero), k255);
    const __m128i v1 = _mm_min_epi16(_mm_max_epi16(V, zero), k255);
    // convert to idx
    const __m128i y2 = _mm_mulhi_epi16(y1, mult);
    const __m128i u2 = _mm_mulhi_epi16(u1, mult);
    const __m128i v2 = _mm_mulhi_epi16(v1, mult);
    // store final idx
    const __m128i u3 = _mm_mullo_epi16(u2, mult1);
    const __m128i v3 = _mm_mullo_epi16(v2, mult2);
    const __m128i tmp = _mm_add_epi16(y2, u3);
    const __m128i idx = _mm_add_epi16(tmp, v3);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), idx);

    rgb += 3 * 8;
    dst += 8;
    width -= 8;
  }
  if (width > 0) RowToIndexC(rgb, width, dst);
}
#elif defined(SJPEG_USE_NEON)
void RowToIndexNEON(const uint8_t* rgb, int width, uint16_t* dst) {
  const int16x8_t k128 = vdupq_n_s16(128);
  const uint8x8_t mult = vdup_n_u8(sjpeg::kRGBSize - 1);
  const uint16x8_t mult1 = vdupq_n_u16(sjpeg::kRGBSize);
  const uint16x8_t mult2 = vdupq_n_u16(sjpeg::kRGBSize * sjpeg::kRGBSize);
  const int16x4_t coeffs1 = vld1_s16(kCoeff1);
  const int16x4_t coeffs2 = vld1_s16(kCoeff2);
  while (width >= 8) {
    int16x8_t r, g, b;
    int16x8_t Y, U, V;
    RGB24PackedToPlanar(rgb, &r, &g, &b);
    ConvertRGBToY(r, g, b, coeffs1, &Y);
    ConvertRGBToUV(r, g, b, coeffs2, &U, &V);
    // clamping to [0, 255]
    const uint8x8_t y1 = vqmovun_s16(vaddq_s16(Y, k128));
    const uint8x8_t u1 = vqmovun_s16(vaddq_s16(U, k128));
    const uint8x8_t v1 = vqmovun_s16(vaddq_s16(V, k128));
    // convert to idx
    const uint16x8_t y2 = vmull_u8(y1, mult);
    const uint16x8_t u2 = vmull_u8(u1, mult);
    const uint16x8_t v2 = vmull_u8(v1, mult);
    // divide by 255 using v/255 ~= (v * 0x0101) >> 16
    const uint16x8_t y3 = vshrq_n_u16(vsraq_n_u16(y2, y2, 8), 8);
    const uint16x8_t u3 = vshrq_n_u16(vsraq_n_u16(u2, u2, 8), 8);
    const uint16x8_t v3 = vshrq_n_u16(vsraq_n_u16(v2, v2, 8), 8);
    // store final idx
    const uint16x8_t tmp = vmlaq_u16(y3, u3, mult1);
    const uint16x8_t idx = vmlaq_u16(tmp, v3, mult2);
    vst1q_u16(dst, idx);

    rgb += 3 * 8;
    dst += 8;
    width -= 8;
  }
  if (width > 0) RowToIndexC(rgb, width, dst);
}
#endif    // SJPEG_USE_NEON

}  // namespace


RGBToIndexRowFunc GetRowFunc() {
#if defined(SJPEG_USE_SSE2)
  if (SupportsSSE2()) return RowToIndexSSE2;
#elif defined(SJPEG_USE_NEON)
  if (SupportsNEON()) return RowToIndexNEON;
#endif
  return RowToIndexC;
}

}  // namespace sjpeg