1 // Copyright 2017 Google Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 //  RGB -> YUV conversion
16 //
17 // Y       = ( 19595 * r + 38469 * g +  7471 * b + HALF) >> FRAC
18 // U - 128 = (-11059 * r - 21709 * g + 32768 * b + HALF) >> FRAC
19 // V - 128 = ( 32768 * r - 27439 * g -  5329 * b + HALF) >> FRAC
20 //
21 // Author: Skal (pascal.massimino@gmail.com)
22 
23 #include <string.h>
24 
25 #define SJPEG_NEED_ASM_HEADERS
26 #include "sjpegi.h"
27 
28 namespace sjpeg {
29 
// global fixed-point precision
// Coefficients are scaled by 2^FRAC; HALF is the rounding bias added
// before the final >> FRAC.
enum { FRAC = 16, HALF = 1 << FRAC >> 1,
       // ROUND_UV also accounts for the /4 of four accumulated samples
       // (used with a FRAC + 2 descale); ROUND_Y folds the -128 level
       // shift into the rounding constant.
       ROUND_UV = (HALF << 2), ROUND_Y = HALF - (128 << FRAC) };
33 
34 #if defined(SJPEG_USE_SSE2)
35 
36 // Load eight 16b-words from *src.
37 #define LOAD_16(src) _mm_loadu_si128(reinterpret_cast<const __m128i*>(src))
38 // Store eight 16b-words into *dst
39 #define STORE_16(V, dst) _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), (V))
40 
// Convert 8 packed RGB samples to r[], g[], b[]
// Input is 24 bytes (8 interleaved RGB triplets); output is three registers
// of eight 16-bit values each. The two loads overlap (bytes [0,16) and
// [8,24)) and the triplets are de-interleaved by successive unpack/shift
// steps, widening to 16 bits at the end.
static inline void RGB24PackedToPlanar(const uint8_t* const rgb,
                                       __m128i* const r,
                                       __m128i* const g,
                                       __m128i* const b) {
  const __m128i zero = _mm_setzero_si128();
  // in0: r0 g0 b0 r1 | g1 b1 r2 g2 | b2 r3 g3 b3 | r4 g4 b4 r5
  // in1: b2 r3 g3 b3 | r4 g4 b4 r5 | g5 b5 r6 g6 | b6 r7 g7 b7
  const __m128i in0 = LOAD_16(rgb + 0);
  const __m128i in1 = LOAD_16(rgb + 8);
  // A0: | r2 g2 b2 r3 | g3 b3 r4 g4 | b4 r5 ...
  // A1:                   ... b2 r3 | g3 b3 r4 g4 | b4 r5 g5 b5 |
  const __m128i A0 = _mm_srli_si128(in0, 6);
  const __m128i A1 = _mm_slli_si128(in1, 6);
  // B0: r0 r2 g0 g2 | b0 b2 r1 r3 | g1 g3 b1 b3 | r2 r4 b2 b4
  // B1: g3 g5 b3 b5 | r4 r6 g4 g6 | b4 b6 r5 r7 | g5 g7 b5 b7
  const __m128i B0 = _mm_unpacklo_epi8(in0, A0);
  const __m128i B1 = _mm_unpackhi_epi8(A1, in1);
  // C0: r1 r3 g1 g3 | b1 b3 r2 r4 | b2 b4 ...
  // C1:                 ... g3 g5 | b3 b5 r4 r6 | g4 g6 b4 b6
  const __m128i C0 = _mm_srli_si128(B0, 6);
  const __m128i C1 = _mm_slli_si128(B1, 6);
  // D0: r0 r1 r2 r3 | g0 g1 g2 g3 | b0 b1 b2 b3 | r1 r2 r3 r4
  // D1: b3 b4 b5 b6 | r4 r5 r6 r7 | g4 g5 g6 g7 | b4 b5 b6 b7 |
  const __m128i D0 = _mm_unpacklo_epi8(B0, C0);
  const __m128i D1 = _mm_unpackhi_epi8(C1, B1);
  // r4 r5 r6 r7 | g4 g5 g6 g7 | b4 b5 b6 b7 | 0
  const __m128i D2 = _mm_srli_si128(D1, 4);
  // r0 r1 r2 r3 | r4 r5 r6 r7 | g0 g1 g2 g3 | g4 g5 g6 g7
  const __m128i E0 = _mm_unpacklo_epi32(D0, D2);
  // b0 b1 b2 b3 | b4 b5 b6 b7 | r1 r2 r3 r4 | 0
  const __m128i E1 = _mm_unpackhi_epi32(D0, D2);
  // g0 g1 g2 g3 | g4 g5 g6 g7 | 0
  const __m128i E2 = _mm_srli_si128(E0, 8);
  // widen u8 -> s16 by interleaving with zero
  const __m128i F0 = _mm_unpacklo_epi8(E0, zero);  // -> R
  const __m128i F1 = _mm_unpacklo_epi8(E1, zero);  // -> B
  const __m128i F2 = _mm_unpacklo_epi8(E2, zero);  // -> G
  *r = F0;
  *b = F1;
  *g = F2;
}
82 
// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
// It's a macro and not a function because we need to use immediate values with
// srai_epi32, e.g.
// RG_* / GB_* are interleaved 16-bit pairs (made with _mm_unpack*_epi16) so
// that _mm_madd_epi16 forms the two-term dot products in one step.
// ADD_OR_SUB is _mm_add_epi32 normally, or _mm_sub_epi32 when the caller
// needs to restore the sign of a coefficient that wrapped around int16
// (see the -32768 trick in ConvertRGBToUV*).
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \
                  ROUNDER, DESCALE_FIX, ADD_OR_SUB, OUT) do {   \
  const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG);         \
  const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG);         \
  const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB);         \
  const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB);         \
  const __m128i V2_lo = ADD_OR_SUB(V0_lo, V1_lo);               \
  const __m128i V2_hi = ADD_OR_SUB(V0_hi, V1_hi);               \
  const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER);          \
  const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER);          \
  const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX);     \
  const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX);     \
  (OUT) = _mm_packs_epi32(V5_lo, V5_hi);                        \
} while (0)
100 
101 #define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
102 
// Computes Y = (19595 * r + 38469 * g + 7471 * b + HALF) >> FRAC plus
// 'offset' (e.g. -128 for the level shift; 'offset' is pre-scaled by FRAC
// and folded into the rounder). The green coefficient 38469 does not fit
// in a signed 16-bit madd operand, so it is split as (38469 - 16384) in
// the R|G product plus 16384 in the G|B product.
inline void ConvertRGBToY(const __m128i* const R,
                          const __m128i* const G,
                          const __m128i* const B,
                          int offset,
                          __m128i* const Y) {
  const __m128i kRG_y = MK_CST_16(19595, 38469 - 16384);
  const __m128i kGB_y = MK_CST_16(16384, 7471);
  const __m128i kROUND_Y = _mm_set1_epi32(HALF + (offset << FRAC));
  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kROUND_Y, FRAC,
            _mm_add_epi32, *Y);
}
118 
// Computes U and V for 8 samples; 'offset' (e.g. +128 to bias the result
// into [0, 255]) is pre-scaled by FRAC and folded into the rounder.
inline void ConvertRGBToUV(const __m128i* const R,
                           const __m128i* const G,
                           const __m128i* const B,
                           int offset,
                           __m128i* const U, __m128i* const V) {
  // Warning! 32768 is overflowing int16, so we're actually multiplying
  // by -32768 instead of 32768. We compensate by subtracting the result
  // instead of adding, thus restoring the sign.
  const __m128i kRG_u = MK_CST_16(-11059, -21709);
  const __m128i kGB_u = MK_CST_16(0, -32768);
  const __m128i kRG_v = MK_CST_16(-32768, 0);
  const __m128i kGB_v = MK_CST_16(-27439, -5329);
  const __m128i kRound = _mm_set1_epi32((offset << FRAC) + HALF);

  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);

  // _mm_sub_epi32 -> sign restore!
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u, kRound, FRAC,
            _mm_sub_epi32, *U);
  // note! GB and RG are inverted, for sign-restoration
  TRANSFORM(GB_lo, GB_hi, RG_lo, RG_hi, kGB_v, kRG_v, kRound, FRAC,
            _mm_sub_epi32, *V);
}
145 
// This version takes four accumulated R/G/B samples. Hence, the
// descaling factor is FRAC + 2.
// Used for 4:2:0: each input lane is the sum of a 2x2 pixel block, and
// ROUND_UV (= 4 * HALF) provides the matching rounding bias.
inline void ConvertRGBToUVAccumulated(const __m128i* const R,
                                      const __m128i* const G,
                                      const __m128i* const B,
                                      __m128i* const U, __m128i* const V) {
  // Warning! 32768 is overflowing int16, so we're actually multiplying
  // by -32768 instead of 32768. We compensate by subtracting the result
  // instead of adding, thus restoring the sign.
  const __m128i kRG_u = MK_CST_16(-11059, -21709);
  const __m128i kGB_u = MK_CST_16(0, -32768);
  const __m128i kRG_v = MK_CST_16(-32768, 0);
  const __m128i kGB_v = MK_CST_16(-27439, -5329);
  const __m128i kRound = _mm_set1_epi32(ROUND_UV);

  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);

  // _mm_sub_epi32 -> sign restore!
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
            kRound, FRAC + 2, _mm_sub_epi32, *U);
  // note! GB and RG are inverted, for sign-restoration
  TRANSFORM(GB_lo, GB_hi, RG_lo, RG_hi, kGB_v, kRG_v,
            kRound, FRAC + 2, _mm_sub_epi32, *V);
}
173 
174 #undef MK_CST_16
175 #undef TRANSFORM
176 
177 // Convert 8 RGB samples to YUV. out[] points to a 3*64 data block.
ToYUV_8(const __m128i * const r,const __m128i * const g,const __m128i * const b,int16_t * const out)178 static inline void ToYUV_8(const __m128i* const r,
179                            const __m128i* const g,
180                            const __m128i* const b,
181                            int16_t* const out) {
182   __m128i Y, U, V;
183   ConvertRGBToY(r, g, b, -128, &Y);
184   ConvertRGBToUV(r, g, b, 0, &U, &V);
185   STORE_16(Y, out + 0 * 64);
186   STORE_16(U, out + 1 * 64);
187   STORE_16(V, out + 2 * 64);
188 }
189 
Get8x8Block_SSE2(const uint8_t * data,int step,int16_t * out)190 static void Get8x8Block_SSE2(const uint8_t* data, int step, int16_t* out) {
191   for (int y = 8; y > 0; --y) {
192     __m128i r, g, b;
193     RGB24PackedToPlanar(data, &r, &g, &b);
194     ToYUV_8(&r, &g, &b, out);
195     out += 8;
196     data += step;
197   }
198 }
199 
200 // Convert 16x16 RGB samples to YUV420
ToY_16x16(const __m128i * const r,const __m128i * const g,const __m128i * const b,int16_t * const y_out,__m128i * const R_acc,__m128i * const G_acc,__m128i * const B_acc,bool do_add)201 static inline void ToY_16x16(const __m128i* const r,
202                              const __m128i* const g,
203                              const __m128i* const b,
204                              int16_t* const y_out,
205                              __m128i* const R_acc,
206                              __m128i* const G_acc,
207                              __m128i* const B_acc,
208                              bool do_add) {
209   __m128i Y;
210   ConvertRGBToY(r, g, b, -128, &Y);
211   STORE_16(Y, y_out);
212   if (do_add) {
213     *R_acc = _mm_add_epi16(*R_acc, *r);
214     *G_acc = _mm_add_epi16(*G_acc, *g);
215     *B_acc = _mm_add_epi16(*B_acc, *b);
216   } else {  // just store
217     *R_acc = *r;
218     *G_acc = *g;
219     *B_acc = *b;
220   }
221 }
222 
ToUV_8x8(const __m128i * const R,const __m128i * const G,const __m128i * const B,int16_t * const uv_out)223 static inline void ToUV_8x8(const __m128i* const R,
224                             const __m128i* const G,
225                             const __m128i* const B,
226                             int16_t* const uv_out) {
227   __m128i U, V;
228   ConvertRGBToUVAccumulated(R, G, B, &U, &V);
229   STORE_16(U, uv_out + 0 * 64);
230   STORE_16(V, uv_out + 1 * 64);
231 }
232 
// Horizontally adds neighboring 16-bit lanes of both accumulators
// (madd against 1), then saturate-packs the two 4x32 results into *acc2:
// lanes 0-3 from *acc1, lanes 4-7 from *acc2.
static void Condense16To8(const __m128i* const acc1, __m128i* const acc2) {
  const __m128i k1 = _mm_set1_epi16(1);
  const __m128i pair_sum1 = _mm_madd_epi16(*acc1, k1);
  const __m128i pair_sum2 = _mm_madd_epi16(*acc2, k1);
  *acc2 = _mm_packs_epi32(pair_sum1, pair_sum2);
}
239 
// convert two 16x8 RGB blocks into two blocks of luma, and 2 blocks of U/V
// Each loop iteration consumes two source rows (src1 and src2) and emits
// two rows of luma for the left (y[0..]) and right (y[64..]) 8x8 blocks,
// plus one row of 2x2-subsampled U/V.
static void Get16x8_SSE2(const uint8_t* src1, int src_stride,
                         int16_t y[4 * 64], int16_t uv[2 * 64]) {
  for (int i = 4; i > 0; --i, src1 += 2 * src_stride) {
    __m128i r_acc1, r_acc2, g_acc1, g_acc2, b_acc1, b_acc2;
    __m128i r, g, b;
    const uint8_t* const src2 = src1 + src_stride;
    // left 8 pixels of both rows accumulate into *_acc1 ...
    RGB24PackedToPlanar(src1 + 0 * 8, &r, &g, &b);
    ToY_16x16(&r, &g, &b, y + 0 * 64 + 0, &r_acc1, &g_acc1, &b_acc1, false);
    // ... right 8 pixels (3 * 8 bytes further) into *_acc2
    RGB24PackedToPlanar(src1 + 3 * 8, &r, &g, &b);
    ToY_16x16(&r, &g, &b, y + 1 * 64 + 0, &r_acc2, &g_acc2, &b_acc2, false);
    RGB24PackedToPlanar(src2 + 0 * 8, &r, &g, &b);
    ToY_16x16(&r, &g, &b, y + 0 * 64 + 8, &r_acc1, &g_acc1, &b_acc1, true);
    RGB24PackedToPlanar(src2 + 3 * 8, &r, &g, &b);
    ToY_16x16(&r, &g, &b, y + 1 * 64 + 8, &r_acc2, &g_acc2, &b_acc2, true);
    // horizontal pairwise sums -> full 2x2 sums, packed into the *_acc2's
    Condense16To8(&r_acc1, &r_acc2);
    Condense16To8(&g_acc1, &g_acc2);
    Condense16To8(&b_acc1, &b_acc2);
    ToUV_8x8(&r_acc2, &g_acc2, &b_acc2, uv);
    y += 2 * 8;
    uv += 8;
  }
}
263 
Get16x16Block_SSE2(const uint8_t * data,int step,int16_t * blocks)264 static void Get16x16Block_SSE2(const uint8_t* data, int step, int16_t* blocks) {
265   Get16x8_SSE2(data + 0 * step, step, blocks + 0 * 64, blocks + 4 * 64 + 0 * 8);
266   Get16x8_SSE2(data + 8 * step, step, blocks + 2 * 64, blocks + 4 * 64 + 4 * 8);
267 }
268 
269 #undef LOAD_16
270 #undef STORE_16
271 
272 #endif    // SJPEG_USE_SSE2
273 
274 ///////////////////////////////////////////////////////////////////////////////
275 // NEON-version for 8x8 and 16x16 blocks
276 
277 #if defined(SJPEG_USE_NEON)
278 
// Fixed-point (2^FRAC scale) coefficients, loaded as int16x4 lanes:
// kCoeff1 = { Y.g, Y.r, Y.b, 0 } -- 38469 wraps to a negative int16 and is
// handled by the unsigned multiply in MULT_S32_S16_LARGE.
// kCoeff2 = { |U.g|, |U.r|, |V.g|, |V.b| } magnitudes, applied with
// vmlsl (multiply-subtract) in the DOT_PROD macros.
static const int16_t kCoeff1[4] = { (int16_t)38469, 19595, 7471, 0 };
static const int16_t kCoeff2[4] = { 21709, 11059, 27439, 5329 };
281 
// De-interleaves 8 packed RGB (or BGR) samples into three 16-bit planar
// registers, using the 3-way interleaved NEON load.
static void RGB24PackedToPlanar(const uint8_t* const rgb,
                                int16x8_t* const r,
                                int16x8_t* const g,
                                int16x8_t* const b) {
  const uint8x8x3_t planes = vld3_u8(rgb);
  // zero-extend each 8-bit plane to 16 bits
  const uint16x8_t r16 = vmovl_u8(planes.val[0]);
  const uint16x8_t g16 = vmovl_u8(planes.val[1]);
  const uint16x8_t b16 = vmovl_u8(planes.val[2]);
  *r = vreinterpretq_s16_u16(r16);
  *g = vreinterpretq_s16_u16(g16);
  *b = vreinterpretq_s16_u16(b16);
}
292 
// s16->s32 widening multiply with large (>=32768) coeff requires special care:
// the samples and the coefficient are reinterpreted as unsigned so that a
// coefficient that wrapped to a negative int16 (e.g. (int16_t)38469 in
// kCoeff1) still multiplies by its intended magnitude; the input samples
// are non-negative so the reinterpretation is lossless.
#define MULT_S32_S16_LARGE(S16, COEFF, LANE)                                 \
  vreinterpretq_s32_u32(vmull_lane_u16(vreinterpret_u16_s16(S16),            \
                                       vreinterpret_u16_s16(COEFF), (LANE)))
297 
// Computes Y - 128 for 8 samples: ((19595*r + 38469*g + 7471*b) rounded
// and descaled by FRAC) minus the 128 level shift. The G term uses the
// unsigned widening multiply because its coefficient exceeds int16 range.
static inline void ConvertRGBToY(const int16x8_t R,
                                 const int16x8_t G,
                                 const int16x8_t B,
                                 const int16x4_t coeffs,
                                 int16x8_t* const Y) {
  int32x4_t lo = MULT_S32_S16_LARGE(vget_low_s16(G), coeffs, 0);
  int32x4_t hi = MULT_S32_S16_LARGE(vget_high_s16(G), coeffs, 0);
  lo = vmlal_lane_s16(lo, vget_low_s16(R), coeffs, 1);
  hi = vmlal_lane_s16(hi, vget_high_s16(R), coeffs, 1);
  lo = vmlal_lane_s16(lo, vget_low_s16(B), coeffs, 2);
  hi = vmlal_lane_s16(hi, vget_high_s16(B), coeffs, 2);
  // round + descale by FRAC, then apply the -128 level shift
  const int16x8_t V0 = vcombine_s16(vqmovn_s32(vrshrq_n_s32(lo, FRAC)),
                                    vqmovn_s32(vrshrq_n_s32(hi, FRAC)));
  *Y = vsubq_s16(V0, vdupq_n_s16(128));
}
313 
// Compute ((V0<<15) - V1 * C1 - V2 * C2 + round) >> SHIFT
// The <<15 stands in for the 32768 coefficient of the U/V formulas; the
// other two terms are subtracted via vmlsl using the magnitudes stored in
// kCoeff2 (COEFF lanes LANE1/LANE2). Rounding comes from the vrshr step.
#define DOT_PROD_PREAMBLE(V0, V1, V2, COEFF, LANE1, LANE2)                  \
  int32x4_t lo, hi;                                                         \
  do {                                                                      \
    lo = vshll_n_s16(vget_low_s16(V0), 15);                                 \
    hi = vshll_n_s16(vget_high_s16(V0), 15);                                \
    lo = vmlsl_lane_s16(lo, vget_low_s16(V1), COEFF, LANE1);                \
    hi = vmlsl_lane_s16(hi, vget_high_s16(V1), COEFF, LANE1);               \
    lo = vmlsl_lane_s16(lo, vget_low_s16(V2), COEFF, LANE2);                \
    hi = vmlsl_lane_s16(hi, vget_high_s16(V2), COEFF, LANE2);               \
} while (0)

// This version assumes SHIFT <= 16 (vrshrn_n_s32 narrows directly)
#define DOT_PROD1(V0, V1, V2, COEFF, LANE1, LANE2, SHIFT, OUT) do {         \
  assert(SHIFT <= 16);                                                      \
  DOT_PROD_PREAMBLE(V0, V1, V2, COEFF, LANE1, LANE2);                       \
  (OUT) = vcombine_s16(vrshrn_n_s32(lo, SHIFT), vrshrn_n_s32(hi, SHIFT));   \
} while (0)

// alternate version for SHIFT > 16 (shift in-register, then saturate-narrow)
#define DOT_PROD2(V0, V1, V2, COEFF, LANE1, LANE2, SHIFT, OUT) do {         \
  assert(SHIFT > 16);                                                       \
  DOT_PROD_PREAMBLE(V0, V1, V2, COEFF, LANE1, LANE2);                       \
  (OUT) = vcombine_s16(vqmovn_s32(vrshrq_n_s32(lo, SHIFT)),                 \
                       vqmovn_s32(vrshrq_n_s32(hi, SHIFT)));                \
} while (0)
340 
// Computes U and V (without the +128 bias) for 8 samples:
//   U = (32768*b - 21709*g - 11059*r + HALF) >> FRAC
//   V = (32768*r - 27439*g -  5329*b + HALF) >> FRAC
// The 32768 factor comes from the <<15 in DOT_PROD_PREAMBLE; coeffs holds
// the magnitudes of the negative terms (kCoeff2).
static inline void ConvertRGBToUV(const int16x8_t R,
                                  const int16x8_t G,
                                  const int16x8_t B,
                                  const int16x4_t coeffs,
                                  int16x8_t* const U, int16x8_t* const V) {
  DOT_PROD1(B, G, R, coeffs, 0, 1, FRAC, *U);
  DOT_PROD1(R, G, B, coeffs, 2, 3, FRAC, *V);
}
349 
// Same as ConvertRGBToUV, but the inputs are sums of four (2x2) samples,
// so the descale uses FRAC + 2 (requiring the DOT_PROD2 variant since
// the shift exceeds 16).
static inline void ConvertRGBToUVAccumulated(const int16x8_t R,
                                             const int16x8_t G,
                                             const int16x8_t B,
                                             const int16x4_t coeffs,
                                             int16x8_t* const U,
                                             int16x8_t* const V) {
  DOT_PROD2(B, G, R, coeffs, 0, 1, FRAC + 2, *U);
  DOT_PROD2(R, G, B, coeffs, 2, 3, FRAC + 2, *V);
}
359 
// Converts eight RGB samples to 4:4:4 YUV; Y, U and V each go to their own
// 64-entry sub-block of out[].
static void ToYUV_8(const int16x8_t r, const int16x8_t g, const int16x8_t b,
                    const int16x4_t coeffs1, const int16x4_t coeffs2,
                    int16_t* const out) {
  int16x8_t y_vec, u_vec, v_vec;
  ConvertRGBToUV(r, g, b, coeffs2, &u_vec, &v_vec);
  ConvertRGBToY(r, g, b, coeffs1, &y_vec);
  vst1q_s16(out, y_vec);
  vst1q_s16(out + 64, u_vec);
  vst1q_s16(out + 2 * 64, v_vec);
}
371 
// Converts one 8x8 RGB block (rows 'step' bytes apart) into three
// consecutive 8x8 YUV blocks in out[].
static void Get8x8Block_NEON(const uint8_t* data, int step, int16_t* out) {
  const int16x4_t luma_coeffs = vld1_s16(kCoeff1);
  const int16x4_t chroma_coeffs = vld1_s16(kCoeff2);
  for (int row = 0; row < 8; ++row) {
    int16x8_t red, green, blue;
    RGB24PackedToPlanar(data + row * step, &red, &green, &blue);
    ToYUV_8(red, green, blue, luma_coeffs, chroma_coeffs, out + row * 8);
  }
}
383 
// 4:2:0 helper: emits one 8-wide row of luma and either seeds
// (do_add == false) or sums into (do_add == true) the per-channel
// accumulators later used for the subsampled U/V.
static inline void ToY_16x16(const int16x8_t r,
                             const int16x8_t g,
                             const int16x8_t b,
                             int16_t* const y_out,
                             int16x8_t* const R_acc,
                             int16x8_t* const G_acc,
                             int16x8_t* const B_acc,
                             const int16x4_t coeffs,
                             bool do_add) {
  int16x8_t luma;
  ConvertRGBToY(r, g, b, coeffs, &luma);
  vst1q_s16(y_out, luma);
  if (!do_add) {   // first row of the pair: initialize the accumulators
    *R_acc = r;
    *G_acc = g;
    *B_acc = b;
  } else {         // second row: accumulate
    *R_acc = vaddq_s16(*R_acc, r);
    *G_acc = vaddq_s16(*G_acc, g);
    *B_acc = vaddq_s16(*B_acc, b);
  }
}
407 
// Converts eight accumulated (2x2-summed) RGB values into one row of U
// and one row of V, stored one 64-entry block apart.
static inline void ToUV_8x8(const int16x8_t R,
                            const int16x8_t G,
                            const int16x8_t B,
                            const int16x4_t coeffs,
                            int16_t* const uv_out) {
  int16x8_t u_vec, v_vec;
  ConvertRGBToUVAccumulated(R, G, B, coeffs, &u_vec, &v_vec);
  vst1q_s16(uv_out, u_vec);
  vst1q_s16(uv_out + 64, v_vec);
}
418 
// Pairwise-adds neighboring 16-bit lanes of both accumulators, then
// saturate-narrows and concatenates into *acc2: lanes 0-3 from acc1,
// lanes 4-7 from the old *acc2.
static void Condense16To8(const int16x8_t acc1, int16x8_t* const acc2) {
  const int32x4_t pair_sum1 = vpaddlq_s16(acc1);
  const int32x4_t pair_sum2 = vpaddlq_s16(*acc2);
  *acc2 = vcombine_s16(vqmovn_s32(pair_sum1), vqmovn_s32(pair_sum2));
}
424 
// convert two 16x8 RGB blocks into two blocks of luma, and 2 blocks of U/V
// Mirrors Get16x8_SSE2: each iteration consumes two source rows and emits
// two rows of luma for the left (y[0..]) and right (y[64..]) 8x8 blocks
// plus one row of 2x2-subsampled U/V.
static void Get16x8_NEON(const uint8_t* src1, int src_stride,
                         int16_t y[4 * 64], int16_t uv[2 * 64]) {
  const int16x4_t kC1 = vld1_s16(kCoeff1);
  const int16x4_t kC2 = vld1_s16(kCoeff2);
  for (int i = 4; i > 0; --i, src1 += 2 * src_stride) {
    int16x8_t r_acc1, r_acc2, g_acc1, g_acc2, b_acc1, b_acc2;
    int16x8_t r, g, b;
    const uint8_t* const src2 = src1 + src_stride;
    // left 8 pixels of both rows accumulate into *_acc1 ...
    RGB24PackedToPlanar(src1 + 0 * 8, &r, &g, &b);
    ToY_16x16(r, g, b, y + 0 * 64 + 0, &r_acc1, &g_acc1, &b_acc1, kC1, false);
    // ... right 8 pixels (3 * 8 bytes further) into *_acc2
    RGB24PackedToPlanar(src1 + 3 * 8, &r, &g, &b);
    ToY_16x16(r, g, b, y + 1 * 64 + 0, &r_acc2, &g_acc2, &b_acc2, kC1, false);
    RGB24PackedToPlanar(src2 + 0 * 8, &r, &g, &b);
    ToY_16x16(r, g, b, y + 0 * 64 + 8, &r_acc1, &g_acc1, &b_acc1, kC1, true);
    RGB24PackedToPlanar(src2 + 3 * 8, &r, &g, &b);
    ToY_16x16(r, g, b, y + 1 * 64 + 8, &r_acc2, &g_acc2, &b_acc2, kC1, true);
    // horizontal pairwise sums -> full 2x2 sums, packed into the *_acc2's
    Condense16To8(r_acc1, &r_acc2);
    Condense16To8(g_acc1, &g_acc2);
    Condense16To8(b_acc1, &b_acc2);
    ToUV_8x8(r_acc2, g_acc2, b_acc2, kC2, uv);
    y += 2 * 8;
    uv += 8;
  }
}
450 
Get16x16Block_NEON(const uint8_t * data,int step,int16_t * yuv)451 static void Get16x16Block_NEON(const uint8_t* data, int step, int16_t* yuv) {
452   int16_t* const uv = yuv + 4 * 64;
453   Get16x8_NEON(data + 0 * step, step, yuv + 0 * 64, uv + 0 * 8);
454   Get16x8_NEON(data + 8 * step, step, yuv + 2 * 64, uv + 4 * 8);
455 }
456 
457 #undef MULT_S32_S16_LARGE
458 #undef DOT_PROD_PREAMBLE
459 #undef DOT_PROD1
460 #undef DOT_PROD2
461 
462 #endif    // SJPEG_USE_NEON
463 
464 ///////////////////////////////////////////////////////////////////////////////
465 // C-version
466 
467 // convert rgb_in[3] RGB triplet to Y, and accumulate in *rgb_sum
ToY(const uint8_t * const rgb_in,int * const rgb_sum)468 static inline int16_t ToY(const uint8_t* const rgb_in, int* const rgb_sum) {
469   const int r = rgb_in[0];
470   const int g = rgb_in[1];
471   const int b = rgb_in[2];
472   rgb_sum[0] += r;
473   rgb_sum[1] += g;
474   rgb_sum[2] += b;
475   const int y = 19595 * r + 38469 * g + 7471 * b + ROUND_Y;
476   return static_cast<int16_t>(y >> FRAC);
477 }
478 
479 // convert sum of four rgb triplets to U
ToU(const int * const rgb)480 static inline int16_t ToU(const int* const rgb) {
481   const int u = -11059 * rgb[0] - 21709 * rgb[1] + 32768 * rgb[2] + ROUND_UV;
482   return static_cast<int16_t>(u >> (FRAC + 2));
483 }
484 
485 // convert sum of four rgb triplets to V
ToV(const int * const rgb)486 static inline int16_t ToV(const int* const rgb) {
487   const int v = 32768 * rgb[0] - 27439 * rgb[1] -  5329 * rgb[2] + ROUND_UV;
488   return static_cast<int16_t>(v >> (FRAC + 2));
489 }
490 
491 // for 4:4:4 conversion: convert rgb[3] to yuv
ToYUV(const uint8_t * const rgb,int16_t * const out)492 static inline void ToYUV(const uint8_t* const rgb, int16_t* const out) {
493   const int r = rgb[0];
494   const int g = rgb[1];
495   const int b = rgb[2];
496   const int y =  19595 * r + 38469 * g +  7471 * b + ROUND_Y;
497   const int u = -11059 * r - 21709 * g + 32768 * b + HALF;
498   const int v =  32768 * r - 27439 * g -  5329 * b + HALF;
499   out[0 * 64] = static_cast<int16_t>(y >> FRAC);
500   out[1 * 64] = static_cast<int16_t>(u >> FRAC);
501   out[2 * 64] = static_cast<int16_t>(v >> FRAC);
502 }
503 
Get8x8Block_C(const uint8_t * data,int step,int16_t * out)504 static void Get8x8Block_C(const uint8_t* data, int step, int16_t* out) {
505   for (int y = 8; y > 0; --y) {
506     for (int x = 0; x < 8; ++x) {
507       ToYUV(data + 3 * x, out + x);
508     }
509     out += 8;
510     data += step;
511   }
512 }
513 
// C version of the 16x8 conversion: emits two 8x8 luma blocks
// (yblock[0..63] and yblock[64..127]) plus 4-wide rows of 2x2-subsampled
// U/V. ToY() accumulates the raw RGB of each 2x2 neighborhood into rgb[][],
// from which ToU()/ToV() derive one chroma pair.
void Get16x8Block_C(const uint8_t* src1, int src_stride,
                    int16_t yblock[4 * 64], int16_t uvblock[2 * 64]) {
  for (int y = 8; y > 0; y -= 2) {   // two source rows per iteration
    const uint8_t* const src2 = src1 + src_stride;
    for (int x = 0; x < 4; ++x) {
      int rgb[2][3];   // 2x2 RGB sums for the left ([0]) and right ([1]) block
      memset(rgb, 0, sizeof(rgb));
      // left 8x8 block: one 2x2 pixel group -> 4 luma samples + 1 U/V pair
      yblock[2 * x    ] = ToY(src1 + 6 * x,     rgb[0]);
      yblock[2 * x + 1] = ToY(src1 + 6 * x + 3, rgb[0]);
      yblock[2 * x + 8] = ToY(src2 + 6 * x,     rgb[0]);
      yblock[2 * x + 9] = ToY(src2 + 6 * x + 3, rgb[0]);
      uvblock[0 * 64 + x] = ToU(rgb[0]);
      uvblock[1 * 64 + x] = ToV(rgb[0]);
      // right 8x8 block, 8 pixels (3 * 8 bytes) further along the row
      yblock[2 * x     + 64] = ToY(src1 + 3 * 8 + 6 * x,     rgb[1]);
      yblock[2 * x + 1 + 64] = ToY(src1 + 3 * 8 + 6 * x + 3, rgb[1]);
      yblock[2 * x + 8 + 64] = ToY(src2 + 3 * 8 + 6 * x,     rgb[1]);
      yblock[2 * x + 9 + 64] = ToY(src2 + 3 * 8 + 6 * x + 3, rgb[1]);
      uvblock[0 * 64 + x + 4] = ToU(rgb[1]);
      uvblock[1 * 64 + x + 4] = ToV(rgb[1]);
    }
    yblock += 2 * 8;
    uvblock += 8;
    src1 += 2 * src_stride;
  }
}
539 
Get16x16Block_C(const uint8_t * rgb,int step,int16_t * yuv)540 static void Get16x16Block_C(const uint8_t* rgb, int step, int16_t* yuv) {
541   Get16x8Block_C(rgb + 0 * step, step, yuv + 0 * 64, yuv + 4 * 64 + 0 * 8);
542   Get16x8Block_C(rgb + 8 * step, step, yuv + 2 * 64, yuv + 4 * 64 + 4 * 8);
543 }
544 
545 ///////////////////////////////////////////////////////////////////////////////
546 
// Returns the RGB->YUV block converter: the 8x8 4:4:4 variant when
// use_444 is true, the 16x16 4:2:0 variant otherwise. Prefers the SIMD
// implementation compiled in (when supported at runtime), falling back
// to the plain C code.
RGBToYUVBlockFunc GetBlockFunc(bool use_444) {
#if defined(SJPEG_USE_SSE2)
  if (SupportsSSE2()) return use_444 ? Get8x8Block_SSE2
                                     : Get16x16Block_SSE2;
#elif defined(SJPEG_USE_NEON)
  if (SupportsNEON()) return use_444 ? Get8x8Block_NEON
                                     : Get16x16Block_NEON;
#endif
  return use_444 ? Get8x8Block_C : Get16x16Block_C;  // default
}
557 
558 ///////////////////////////////////////////////////////////////////////////////
559 
560 namespace {
561 
// Clamps v to the [0, 255] byte range.
uint8_t clip_8b(int v) {
  if (v < 0) return 0u;
  if (v > 255) return 255u;
  return static_cast<uint8_t>(v);
}
565 
ToY(int r,int g,int b)566 int ToY(int r, int g, int b) {
567   const int luma = 19595 * r + 38469 * g + 7471 * b;
568   return (luma + HALF) >> FRAC;  // no need to clip
569 }
570 
clip_uv(int v)571 uint32_t clip_uv(int v) {
572   return clip_8b(128 + ((v + HALF) >> FRAC));
573 }
574 
ToU(int r,int g,int b)575 uint32_t ToU(int r, int g, int b) {
576   const int u = -11059 * r - 21709 * g + 32768 * b;
577   return clip_uv(u);
578 }
579 
ToV(int r,int g,int b)580 uint32_t ToV(int r, int g, int b) {
581   const int v = 32768 * r - 27439 * g - 5329 * b;
582   return clip_uv(v);
583 }
584 
585 // (X * 0x0101 >> 16) ~= X / 255
Convert(uint32_t v)586 uint32_t Convert(uint32_t v) {
587   return (v * (0x0101u * (sjpeg::kRGBSize - 1))) >> 16;
588 }
589 
ConvertToYUVIndex(const uint8_t * const rgb)590 int ConvertToYUVIndex(const uint8_t* const rgb) {
591   const int r = rgb[0];
592   const int g = rgb[1];
593   const int b = rgb[2];
594   const uint32_t y = Convert(ToY(r, g, b));
595   const uint32_t u = Convert(ToU(r, g, b));
596   const uint32_t v = Convert(ToV(r, g, b));
597   return (y + u * sjpeg::kRGBSize + v * sjpeg::kRGBSize * sjpeg::kRGBSize);
598 }
599 
RowToIndexC(const uint8_t * rgb,int width,uint16_t * dst)600 void RowToIndexC(const uint8_t* rgb, int width, uint16_t* dst) {
601   for (int i = 0; i < width; ++i, rgb += 3) {
602     dst[i] = ConvertToYUVIndex(rgb);
603   }
604 }
605 
606 #if defined(SJPEG_USE_SSE2)
// SSE2 version of RowToIndexC: converts 8 pixels per iteration to YUV
// cube indices (y + u * kRGBSize + v * kRGBSize^2) and hands the <8
// leftover pixels to the scalar version.
void RowToIndexSSE2(const uint8_t* rgb, int width, uint16_t* dst) {
  const __m128i zero = _mm_setzero_si128();
  // mulhi against 0x0101 * (kRGBSize - 1) maps [0,255] -> [0, kRGBSize-1],
  // since (v * 0x0101) >> 16 ~= v / 255.
  const __m128i mult = _mm_set1_epi16(0x0101u * (sjpeg::kRGBSize - 1));
  const __m128i mult1 = _mm_set1_epi16(sjpeg::kRGBSize);
  const __m128i mult2 = _mm_set1_epi16(sjpeg::kRGBSize * sjpeg::kRGBSize);
  const __m128i k255 = _mm_set1_epi16(255);
  while (width >= 8) {
    __m128i r, g, b;
    __m128i Y, U, V;
    RGB24PackedToPlanar(rgb, &r, &g, &b);
    // offset 0: no luma level shift here; +128 biases U/V into [0, 255]
    ConvertRGBToY(&r, &g, &b, 0, &Y);
    ConvertRGBToUV(&r, &g, &b, 128, &U, &V);
    // clamping to [0, 255]
    const __m128i y1 = _mm_min_epi16(_mm_max_epi16(Y, zero), k255);
    const __m128i u1 = _mm_min_epi16(_mm_max_epi16(U, zero), k255);
    const __m128i v1 = _mm_min_epi16(_mm_max_epi16(V, zero), k255);
    // convert to idx
    const __m128i y2 = _mm_mulhi_epi16(y1, mult);
    const __m128i u2 = _mm_mulhi_epi16(u1, mult);
    const __m128i v2 = _mm_mulhi_epi16(v1, mult);
    // store final idx
    const __m128i u3 = _mm_mullo_epi16(u2, mult1);
    const __m128i v3 = _mm_mullo_epi16(v2, mult2);
    const __m128i tmp = _mm_add_epi16(y2, u3);
    const __m128i idx = _mm_add_epi16(tmp, v3);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), idx);

    rgb += 3 * 8;
    dst += 8;
    width -= 8;
  }
  if (width > 0) RowToIndexC(rgb, width, dst);
}
640 #elif defined(SJPEG_USE_NEON)
// NEON version of RowToIndexC: converts 8 pixels per iteration to YUV
// cube indices (y + u * kRGBSize + v * kRGBSize^2) and hands the <8
// leftover pixels to the scalar version.
void RowToIndexNEON(const uint8_t* rgb, int width, uint16_t* dst) {
  const int16x8_t k128 = vdupq_n_s16(128);
  const uint8x8_t mult = vdup_n_u8(sjpeg::kRGBSize - 1);
  const uint16x8_t mult1 = vdupq_n_u16(sjpeg::kRGBSize);
  const uint16x8_t mult2 = vdupq_n_u16(sjpeg::kRGBSize * sjpeg::kRGBSize);
  const int16x4_t coeffs1 = vld1_s16(kCoeff1);
  const int16x4_t coeffs2 = vld1_s16(kCoeff2);
  while (width >= 8) {
    int16x8_t r, g, b;
    int16x8_t Y, U, V;
    RGB24PackedToPlanar(rgb, &r, &g, &b);
    ConvertRGBToY(r, g, b, coeffs1, &Y);
    ConvertRGBToUV(r, g, b, coeffs2, &U, &V);
    // clamping to [0, 255]: undo the -128 bias (Y) / add the +128 bias
    // (U/V), then saturate-narrow to u8
    const uint8x8_t y1 = vqmovun_s16(vaddq_s16(Y, k128));
    const uint8x8_t u1 = vqmovun_s16(vaddq_s16(U, k128));
    const uint8x8_t v1 = vqmovun_s16(vaddq_s16(V, k128));
    // convert to idx: scale [0,255] by (kRGBSize - 1) ...
    const uint16x8_t y2 = vmull_u8(y1, mult);
    const uint16x8_t u2 = vmull_u8(u1, mult);
    const uint16x8_t v2 = vmull_u8(v1, mult);
    // divide by 255 using v/255 = (v * 0x0101) >> 16
    const uint16x8_t y3 = vshrq_n_u16(vsraq_n_u16(y2, y2, 8), 8);
    const uint16x8_t u3 = vshrq_n_u16(vsraq_n_u16(u2, u2, 8), 8);
    const uint16x8_t v3 = vshrq_n_u16(vsraq_n_u16(v2, v2, 8), 8);
    // store final idx
    const uint16x8_t tmp = vmlaq_u16(y3, u3, mult1);
    const uint16x8_t idx = vmlaq_u16(tmp, v3, mult2);
    vst1q_u16(dst, idx);

    rgb += 3 * 8;
    dst += 8;
    width -= 8;
  }
  if (width > 0) RowToIndexC(rgb, width, dst);
}
677 #endif    // SJPEG_USE_NEON
678 
679 }  // namespace
680 
681 
// Returns the row-to-YUV-index converter, preferring the SIMD
// implementation compiled in (when supported at runtime) over the
// scalar C version.
RGBToIndexRowFunc GetRowFunc() {
#if defined(SJPEG_USE_SSE2)
  if (SupportsSSE2()) return RowToIndexSSE2;
#elif defined(SJPEG_USE_NEON)
  if (SupportsNEON()) return RowToIndexNEON;
#endif
  return RowToIndexC;
}
690 
691 }   // namespace sjpeg
692