1 // Copyright 2017 Google Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 //  Enhanced RGB->YUV conversion functions
16 //
17 // Author: Skal (pascal.massimino@gmail.com)
18 
19 #include <math.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <memory>
23 #include <vector>
24 using std::vector;
25 
26 #define SJPEG_NEED_ASM_HEADERS
27 #include "sjpegi.h"
28 
29 namespace sjpeg {
30 
31 // We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some
32 // banding sometimes. Better use extra precision.
33 #define SFIX 2                // fixed-point precision of RGB and Y/W
34 #define SHALF (1 << SFIX >> 1)
35 #define MAX_Y_T ((256 << SFIX) - 1)
36 typedef int16_t fixed_t;      // signed type with extra SFIX precision for UV
37 typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W
38 
clip_y(int y)39 static fixed_y_t clip_y(int y) {
40   return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T;
41 }
42 
43 ////////////////////////////////////////////////////////////////////////////////
44 // Helper functions for Y/U/V fixed-point calculations.
45 
46 // The following functions convert r/g/b values in SFIX fixed-point precision
47 // to 8b values, clipped:
48 #define YUV_FIX 16
49 #define TFIX (YUV_FIX + SFIX)
50 #define TROUNDER (1 << TFIX >> 1)
51 
clip_8b(int v)52 static uint8_t clip_8b(int v) {
53   return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
54 }
55 
ConvertRGBToY(int r,int g,int b)56 static uint8_t ConvertRGBToY(int r, int g, int b) {
57   const int luma = 19595 * r + 38469 * g + 7471 * b + TROUNDER;
58   return clip_8b(luma >> TFIX);
59 }
60 
ConvertRGBToU(int r,int g,int b)61 static uint8_t ConvertRGBToU(int r, int g, int b) {
62   const int u =  -11058 * r - 21709 * g + 32768 * b + TROUNDER;
63   return clip_8b(128 + (u >> TFIX));
64 }
65 
ConvertRGBToV(int r,int g,int b)66 static uint8_t ConvertRGBToV(int r, int g, int b) {
67   const int v = +32768 * r - 27439 * g - 5328 * b + TROUNDER;
68   return clip_8b(128 + (v >> TFIX));
69 }
70 
71 // convert to luma using 16b precision:
ConvertRowToY(const uint8_t * row,int w,uint8_t * const dst)72 static void ConvertRowToY(const uint8_t* row, int w, uint8_t* const dst) {
73   for (int i = 0; i < w; i += 1, row += 3) {
74     const int r = row[0], g = row[1], b = row[2];
75     const int y = 19595 * r + 38469 * g + 7471 * b;
76     dst[i] = (y + (1 << YUV_FIX >> 1)) >> YUV_FIX;
77   }
78 }
79 
ConvertRowToUV(const uint8_t * row1,const uint8_t * row2,int w,uint8_t * u,uint8_t * v)80 static void ConvertRowToUV(const uint8_t* row1, const uint8_t* row2,
81                            int w, uint8_t* u, uint8_t* v) {
82   for (int i = 0; i < (w & ~1); i += 2, row1 += 6, row2 += 6) {
83     const int r = row1[0] + row1[3] + row2[0] + row2[3];
84     const int g = row1[1] + row1[4] + row2[1] + row2[4];
85     const int b = row1[2] + row1[5] + row2[2] + row2[5];
86     *u++ = ConvertRGBToU(r, g, b);
87     *v++ = ConvertRGBToV(r, g, b);
88   }
89   if (w & 1) {
90     const int r = 2 * (row1[0] + row2[0]);
91     const int g = 2 * (row1[1] + row2[1]);
92     const int b = 2 * (row1[2] + row2[2]);
93     *u++ = ConvertRGBToU(r, g, b);
94     *v++ = ConvertRGBToV(r, g, b);
95   }
96 }
97 
98 #undef TFIX
99 #undef ROUNDER
100 
101 ////////////////////////////////////////////////////////////////////////////////
102 // Sharp RGB->YUV conversion
103 
104 static const int kNumIterations = 4;
105 static const int kMinDimensionIterativeConversion = 4;
106 
107 // size of the interpolation table for linear-to-gamma
108 #define GAMMA_TABLE_SIZE 32
109 static uint32_t kLinearToGammaTab[GAMMA_TABLE_SIZE + 2];
110 #define GAMMA_TO_LINEAR_BITS 14
111 static uint32_t kGammaToLinearTab[MAX_Y_T + 1];   // size scales with Y_FIX
112 
InitGammaTablesF(void)113 static void InitGammaTablesF(void) {
114   static bool done = false;
115   assert(2 * GAMMA_TO_LINEAR_BITS < 32);  // we use uint32_t intermediate values
116   if (!done) {
117     int v;
118     const double norm = 1. / MAX_Y_T;
119     const double scale = 1. / GAMMA_TABLE_SIZE;
120     const double a = 0.099;
121     const double thresh = 0.018;
122     const double gamma = 1. / 0.45;
123     const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
124     for (v = 0; v <= MAX_Y_T; ++v) {
125       const double g = norm * v;
126       double value;
127       if (g <= thresh * 4.5) {
128         value = g / 4.5;
129       } else {
130         const double a_rec = 1. / (1. + a);
131         value = pow(a_rec * (g + a), gamma);
132       }
133       kGammaToLinearTab[v] = static_cast<uint32_t>(value * final_scale + .5);
134     }
135     for (v = 0; v <= GAMMA_TABLE_SIZE; ++v) {
136       const double g = scale * v;
137       double value;
138       if (g <= thresh) {
139         value = 4.5 * g;
140       } else {
141         value = (1. + a) * pow(g, 1. / gamma) - a;
142       }
143       // we already incorporate the 1/2 rounding constant here
144       kLinearToGammaTab[v] =
145           static_cast<uint32_t>(MAX_Y_T * value)
146             + (1 << GAMMA_TO_LINEAR_BITS >> 1);
147     }
148     // to prevent small rounding errors to cause read-overflow:
149     kLinearToGammaTab[GAMMA_TABLE_SIZE + 1] =
150         kLinearToGammaTab[GAMMA_TABLE_SIZE];
151     done = true;
152   }
153 }
154 
155 // return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
GammaToLinear(int v)156 static uint32_t GammaToLinear(int v) { return kGammaToLinearTab[v]; }
157 
LinearToGamma(uint32_t value)158 static uint32_t LinearToGamma(uint32_t value) {
159   // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
160   const uint32_t v = value * GAMMA_TABLE_SIZE;
161   const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;
162   // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
163   const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS);  // fractional part
164   // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
165   const uint32_t v0 = kLinearToGammaTab[tab_pos + 0];
166   const uint32_t v1 = kLinearToGammaTab[tab_pos + 1];
167   // Final interpolation. Note that rounding is already included.
168   const uint32_t v2 = (v1 - v0) * x;    // note: v1 >= v0.
169   const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
170   return result;
171 }
172 
173 //------------------------------------------------------------------------------
174 
SharpUpdateY_C(const uint16_t * ref,const uint16_t * src,uint16_t * dst,int len)175 static uint64_t SharpUpdateY_C(const uint16_t* ref, const uint16_t* src,
176                                uint16_t* dst, int len) {
177   uint64_t diff = 0;
178   for (int i = 0; i < len; ++i) {
179     const int diff_y = ref[i] - src[i];
180     const int new_y = static_cast<int>(dst[i]) + diff_y;
181     dst[i] = clip_y(new_y);
182     diff += (uint64_t)abs(diff_y);
183   }
184   return diff;
185 }
186 
SharpUpdateRGB_C(const int16_t * ref,const int16_t * src,int16_t * dst,int len)187 static void SharpUpdateRGB_C(const int16_t* ref, const int16_t* src,
188                              int16_t* dst, int len) {
189   for (int i = 0; i < len; ++i) {
190     const int diff_uv = ref[i] - src[i];
191     dst[i] += diff_uv;
192   }
193 }
194 
SharpFilterRow_C(const int16_t * A,const int16_t * B,int len,const uint16_t * best_y,uint16_t * out)195 static void SharpFilterRow_C(const int16_t* A, const int16_t* B, int len,
196                              const uint16_t* best_y, uint16_t* out) {
197   for (int i = 0; i < len; ++i, ++A, ++B) {
198     const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
199     const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
200     out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
201     out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
202   }
203 }
204 
205 #if defined(SJPEG_USE_SSE2)
206 
207 #define LOAD_16(P) (_mm_loadu_si128(reinterpret_cast<const __m128i*>(P)))
208 #define STORE_16(P, V) (_mm_storeu_si128(reinterpret_cast<__m128i*>(P), (V)))
209 
SharpUpdateY_SSE2(const uint16_t * ref,const uint16_t * src,uint16_t * dst,int len)210 static uint64_t SharpUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
211                                   uint16_t* dst, int len) {
212   uint64_t diff = 0;
213   uint32_t tmp[4];
214   int i;
215   const __m128i zero = _mm_setzero_si128();
216   const __m128i max = _mm_set1_epi16(MAX_Y_T);
217   const __m128i one = _mm_set1_epi16(1);
218   __m128i sum = zero;
219 
220   for (i = 0; i + 8 <= len; i += 8) {
221     const __m128i A = LOAD_16(ref + i);
222     const __m128i B = LOAD_16(src + i);
223     const __m128i C = LOAD_16(dst + i);
224     const __m128i D = _mm_sub_epi16(A, B);       // diff_y
225     const __m128i E = _mm_cmpgt_epi16(zero, D);  // sign (-1 or 0)
226     const __m128i F = _mm_add_epi16(C, D);       // new_y
227     const __m128i G = _mm_or_si128(E, one);      // -1 or 1
228     const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
229     const __m128i I = _mm_madd_epi16(D, G);      // sum(abs(...))
230     STORE_16(dst + i, H);
231     sum = _mm_add_epi32(sum, I);
232   }
233   STORE_16(tmp, sum);
234   diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
235   for (; i < len; ++i) {
236     const int diff_y = ref[i] - src[i];
237     const int new_y = static_cast<int>(dst[i]) + diff_y;
238     dst[i] = clip_y(new_y);
239     diff += (uint64_t)abs(diff_y);
240   }
241   return diff;
242 }
243 
SharpUpdateRGB_SSE2(const int16_t * ref,const int16_t * src,int16_t * dst,int len)244 static void SharpUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
245                                 int16_t* dst, int len) {
246   int i = 0;
247   for (i = 0; i + 8 <= len; i += 8) {
248     const __m128i A = LOAD_16(ref + i);
249     const __m128i B = LOAD_16(src + i);
250     const __m128i C = LOAD_16(dst + i);
251     const __m128i D = _mm_sub_epi16(A, B);   // diff_uv
252     const __m128i E = _mm_add_epi16(C, D);   // new_uv
253     STORE_16(dst + i, E);
254   }
255   for (; i < len; ++i) {
256     const int diff_uv = ref[i] - src[i];
257     dst[i] += diff_uv;
258   }
259 }
260 
SharpFilterRow_SSE2(const int16_t * A,const int16_t * B,int len,const uint16_t * best_y,uint16_t * out)261 static void SharpFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
262                                 const uint16_t* best_y, uint16_t* out) {
263   int i;
264   const __m128i kCst8 = _mm_set1_epi16(8);
265   const __m128i max = _mm_set1_epi16(MAX_Y_T);
266   const __m128i zero = _mm_setzero_si128();
267   for (i = 0; i + 8 <= len; i += 8) {
268     const __m128i a0 = LOAD_16(A + i + 0);
269     const __m128i a1 = LOAD_16(A + i + 1);
270     const __m128i b0 = LOAD_16(B + i + 0);
271     const __m128i b1 = LOAD_16(B + i + 1);
272     const __m128i a0b1 = _mm_add_epi16(a0, b1);
273     const __m128i a1b0 = _mm_add_epi16(a1, b0);
274     const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0);  // A0+A1+B0+B1
275     const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
276     const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1);    // 2*(A0+B1)
277     const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0);    // 2*(A1+B0)
278     const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
279     const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
280     const __m128i d0 = _mm_add_epi16(c1, a0);
281     const __m128i d1 = _mm_add_epi16(c0, a1);
282     const __m128i e0 = _mm_srai_epi16(d0, 1);
283     const __m128i e1 = _mm_srai_epi16(d1, 1);
284     const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
285     const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
286     const __m128i g0 = LOAD_16(best_y + 2 * i + 0);
287     const __m128i g1 = LOAD_16(best_y + 2 * i + 8);
288     const __m128i h0 = _mm_add_epi16(g0, f0);
289     const __m128i h1 = _mm_add_epi16(g1, f1);
290     const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
291     const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
292     STORE_16(out + 2 * i + 0, i0);
293     STORE_16(out + 2 * i + 8, i1);
294   }
295   for (; i < len; ++i) {
296     //   (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
297     // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
298     // We reuse the common sub-expressions.
299     const int a0b1 = A[i + 0] + B[i + 1];
300     const int a1b0 = A[i + 1] + B[i + 0];
301     const int a0a1b0b1 = a0b1 + a1b0 + 8;
302     const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
303     const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
304     out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
305     out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
306   }
307 }
308 #undef STORE_16
309 #undef LOAD_16
310 
311 #elif defined(SJPEG_USE_NEON)
312 
SharpUpdateY_NEON(const uint16_t * ref,const uint16_t * src,uint16_t * dst,int len)313 static uint64_t SharpUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
314                                   uint16_t* dst, int len) {
315   int i;
316   const int16x8_t zero = vdupq_n_s16(0);
317   const int16x8_t max = vdupq_n_s16(MAX_Y_T);
318   uint64x2_t sum = vdupq_n_u64(0);
319 
320   for (i = 0; i + 8 <= len; i += 8) {
321     const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
322     const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
323     const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
324     const int16x8_t D = vsubq_s16(A, B);       // diff_y
325     const int16x8_t F = vaddq_s16(C, D);       // new_y
326     const uint16x8_t H =
327         vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
328     const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
329     vst1q_u16(dst + i, H);
330     sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
331   }
332   uint64_t diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
333   for (; i < len; ++i) {
334     const int diff_y = ref[i] - src[i];
335     const int new_y = static_cast<int>(dst[i]) + diff_y;
336     dst[i] = clip_y(new_y);
337     diff += static_cast<uint64_t>(abs(diff_y));
338   }
339   return diff;
340 }
341 
SharpUpdateRGB_NEON(const int16_t * ref,const int16_t * src,int16_t * dst,int len)342 static void SharpUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
343                                 int16_t* dst, int len) {
344   int i;
345   for (i = 0; i + 8 <= len; i += 8) {
346     const int16x8_t A = vld1q_s16(ref + i);
347     const int16x8_t B = vld1q_s16(src + i);
348     const int16x8_t C = vld1q_s16(dst + i);
349     const int16x8_t D = vsubq_s16(A, B);   // diff_uv
350     const int16x8_t E = vaddq_s16(C, D);   // new_uv
351     vst1q_s16(dst + i, E);
352   }
353   for (; i < len; ++i) {
354     const int diff_uv = ref[i] - src[i];
355     dst[i] += diff_uv;
356   }
357 }
358 
SharpFilterRow_NEON(const int16_t * A,const int16_t * B,int len,const uint16_t * best_y,uint16_t * out)359 static void SharpFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
360                                 const uint16_t* best_y, uint16_t* out) {
361   int i;
362   const int16x8_t max = vdupq_n_s16(MAX_Y_T);
363   const int16x8_t zero = vdupq_n_s16(0);
364   for (i = 0; i + 8 <= len; i += 8) {
365     const int16x8_t a0 = vld1q_s16(A + i + 0);
366     const int16x8_t a1 = vld1q_s16(A + i + 1);
367     const int16x8_t b0 = vld1q_s16(B + i + 0);
368     const int16x8_t b1 = vld1q_s16(B + i + 1);
369     const int16x8_t a0b1 = vaddq_s16(a0, b1);
370     const int16x8_t a1b0 = vaddq_s16(a1, b0);
371     const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
372     const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
373     const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
374     const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
375     const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
376     const int16x8_t d0 = vaddq_s16(c1, a0);
377     const int16x8_t d1 = vaddq_s16(c0, a1);
378     const int16x8_t e0 = vrshrq_n_s16(d0, 1);
379     const int16x8_t e1 = vrshrq_n_s16(d1, 1);
380     const int16x8x2_t f = vzipq_s16(e0, e1);
381     const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
382     const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
383     const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
384     const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
385     const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
386     const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
387     vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
388     vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
389   }
390   for (; i < len; ++i) {
391     const int a0b1 = A[i + 0] + B[i + 1];
392     const int a1b0 = A[i + 1] + B[i + 0];
393     const int a0a1b0b1 = a0b1 + a1b0 + 8;
394     const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
395     const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
396     out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
397     out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
398   }
399 }
400 
401 #endif    // SJPEG_USE_NEON
402 
403 static uint64_t (*kSharpUpdateY)(const uint16_t* src, const uint16_t* ref,
404                                  uint16_t* dst, int len);
405 static void (*kSharpUpdateRGB)(const int16_t* src, const int16_t* ref,
406                                int16_t* dst, int len);
407 static void (*kSharpFilterRow)(const int16_t* A, const int16_t* B,
408                                int len, const uint16_t* best_y, uint16_t* out);
409 
InitFunctionPointers()410 static void InitFunctionPointers() {
411   static bool done = false;
412   if (!done) {
413     kSharpUpdateY = SharpUpdateY_C;
414     kSharpUpdateRGB = SharpUpdateRGB_C;
415     kSharpFilterRow = SharpFilterRow_C;
416 #if defined(SJPEG_USE_SSE2)
417     if (sjpeg::SupportsSSE2()) {
418       kSharpUpdateY = SharpUpdateY_SSE2;
419       kSharpUpdateRGB = SharpUpdateRGB_SSE2;
420       kSharpFilterRow = SharpFilterRow_SSE2;
421     }
422 #endif
423 #if defined(SJPEG_USE_NEON)
424     if (sjpeg::SupportsNEON()) {
425       kSharpUpdateY = SharpUpdateY_NEON;
426       kSharpUpdateRGB = SharpUpdateRGB_NEON;
427       kSharpFilterRow = SharpFilterRow_NEON;
428     }
429 #endif
430     done = true;
431   }
432 }
433 
434 //------------------------------------------------------------------------------
435 
RGBToGray(uint32_t r,uint32_t g,uint32_t b)436 static uint32_t RGBToGray(uint32_t r, uint32_t g, uint32_t b) {
437   const uint32_t luma = 13933 * r + 46871 * g + 4732 * b + (1u << YUV_FIX >> 1);
438   return (luma >> YUV_FIX);
439 }
440 
ScaleDown(int a,int b,int c,int d)441 static uint32_t ScaleDown(int a, int b, int c, int d) {
442   const uint32_t A = GammaToLinear(a);
443   const uint32_t B = GammaToLinear(b);
444   const uint32_t C = GammaToLinear(c);
445   const uint32_t D = GammaToLinear(d);
446   return LinearToGamma((A + B + C + D + 2) >> 2);
447 }
448 
UpdateChroma(const fixed_y_t * src1,const fixed_y_t * src2,fixed_t * dst,size_t uv_w)449 static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
450                          fixed_t* dst, size_t uv_w) {
451   for (size_t i = 0; i < uv_w; ++i) {
452     const uint32_t r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
453                                  src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
454     const uint32_t g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
455                                  src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
456     const uint32_t b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
457                                  src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
458     const int W = RGBToGray(r, g, b);
459     dst[0 * uv_w] = (fixed_t)(r - W);
460     dst[1 * uv_w] = (fixed_t)(g - W);
461     dst[2 * uv_w] = (fixed_t)(b - W);
462     dst  += 1;
463     src1 += 2;
464     src2 += 2;
465   }
466 }
467 
UpdateW(const fixed_y_t * src,fixed_y_t * dst,int w)468 static void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
469   for (int i = 0; i < w; ++i) {
470     const uint32_t R = GammaToLinear(src[0 * w + i]);
471     const uint32_t G = GammaToLinear(src[1 * w + i]);
472     const uint32_t B = GammaToLinear(src[2 * w + i]);
473     const uint32_t Y = RGBToGray(R, G, B);
474     dst[i] = (fixed_y_t)LinearToGamma(Y);
475   }
476 }
477 
StoreGray(const fixed_y_t * const rgb,fixed_y_t * const y,int w)478 static void StoreGray(const fixed_y_t* const rgb, fixed_y_t* const y, int w) {
479   for (int i = 0; i < w; ++i) {
480     y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
481   }
482 }
483 
484 //------------------------------------------------------------------------------
485 
Filter2(int A,int B,int W0)486 static fixed_y_t Filter2(int A, int B, int W0) {
487   const int v0 = (A * 3 + B + 2) >> 2;
488   return clip_y(v0 + W0);
489 }
490 
491 //------------------------------------------------------------------------------
492 
UpLift(uint8_t a)493 static fixed_y_t UpLift(uint8_t a) {  // 8bit -> SFIX
494   return ((fixed_y_t)a << SFIX) | SHALF;
495 }
496 
ImportOneRow(const uint8_t * const rgb,int pic_width,fixed_y_t * const dst)497 static void ImportOneRow(const uint8_t* const rgb, int pic_width,
498                          fixed_y_t* const dst) {
499   const int w = (pic_width + 1) & ~1;
500   for (int i = 0; i < pic_width; ++i) {
501     const int off = i * 3;
502     dst[i + 0 * w] = UpLift(rgb[off + 0]);
503     dst[i + 1 * w] = UpLift(rgb[off + 1]);
504     dst[i + 2 * w] = UpLift(rgb[off + 2]);
505   }
506   if (pic_width & 1) {  // replicate rightmost pixel
507     dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
508     dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
509     dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
510   }
511 }
512 
InterpolateTwoRows(const fixed_y_t * const best_y,const fixed_t * prev_uv,const fixed_t * cur_uv,const fixed_t * next_uv,int w,fixed_y_t * out1,fixed_y_t * out2)513 static void InterpolateTwoRows(const fixed_y_t* const best_y,
514                                const fixed_t* prev_uv,
515                                const fixed_t* cur_uv,
516                                const fixed_t* next_uv,
517                                int w,
518                                fixed_y_t* out1, fixed_y_t* out2) {
519   const int uv_w = w >> 1;
520   const int len = (w - 1) >> 1;   // length to filter
521   for (int k = 3; k > 0; --k) {  // process each R/G/B segments in turn
522     // special boundary case for i==0
523     out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
524     out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);
525 
526     kSharpFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1);
527     kSharpFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1);
528 
529     // special boundary case for i == w - 1 when w is even
530     if (!(w & 1)) {
531       out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
532                             best_y[w - 1 + 0]);
533       out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
534                             best_y[w - 1 + w]);
535     }
536     out1 += w;
537     out2 += w;
538     prev_uv += uv_w;
539     cur_uv  += uv_w;
540     next_uv += uv_w;
541   }
542 }
543 
ConvertWRGBToYUV(const fixed_y_t * best_y,const fixed_t * best_uv,int width,int height,uint8_t * y_plane,uint8_t * u_plane,uint8_t * v_plane)544 static void ConvertWRGBToYUV(const fixed_y_t* best_y,
545                              const fixed_t* best_uv,
546                              int width, int height,
547                              uint8_t* y_plane,
548                              uint8_t* u_plane, uint8_t* v_plane) {
549   const int w = (width + 1) & ~1;
550   const int h = (height + 1) & ~1;
551   const int uv_w = w >> 1;
552   const int uv_h = h >> 1;
553   for (int j = 0; j < height; ++j) {
554     const int off = (j >> 1) * 3 * uv_w;
555     for (int i = 0; i < width; ++i) {
556       const int W = best_y[i + j * w];
557       const int r = best_uv[off + (i >> 1) + 0 * uv_w] + W;
558       const int g = best_uv[off + (i >> 1) + 1 * uv_w] + W;
559       const int b = best_uv[off + (i >> 1) + 2 * uv_w] + W;
560       y_plane[i] = ConvertRGBToY(r, g, b);
561     }
562     y_plane += width;
563   }
564   for (int j = 0; j < uv_h; ++j) {
565     for (int i = 0; i < uv_w; ++i) {
566       const int off = i + j * 3 * uv_w;
567       const int r = best_uv[off + 0 * uv_w];
568       const int g = best_uv[off + 1 * uv_w];
569       const int b = best_uv[off + 2 * uv_w];
570       u_plane[i] = ConvertRGBToU(r, g, b);
571       v_plane[i] = ConvertRGBToV(r, g, b);
572     }
573     u_plane += uv_w;
574     v_plane += uv_w;
575   }
576 }
577 
578 //------------------------------------------------------------------------------
579 // Main function
580 
PreprocessARGB(const uint8_t * const rgb,int width,int height,size_t stride,uint8_t * y_plane,uint8_t * u_plane,uint8_t * v_plane)581 static void PreprocessARGB(const uint8_t* const rgb,
582                            int width, int height, size_t stride,
583                            uint8_t* y_plane,
584                            uint8_t* u_plane, uint8_t* v_plane) {
585   // we expand the right/bottom border if needed
586   const int w = (width + 1) & ~1;
587   const int h = (height + 1) & ~1;
588   const int uv_w = w >> 1;
589   const int uv_h = h >> 1;
590   uint64_t prev_diff_y_sum = ~0;
591 
592   InitGammaTablesF();
593   InitFunctionPointers();
594 
595   // TODO(skal): allocate one big memory chunk instead.
596   vector<fixed_y_t> tmp_buffer(w * 3 * 2);
597   vector<fixed_y_t> best_y(w * h);
598   vector<fixed_y_t> target_y(w * h);
599   vector<fixed_y_t> best_rgb_y(w * 2);
600   vector<fixed_t> best_uv(uv_w * 3 * uv_h);
601   vector<fixed_t> target_uv(uv_w * 3 * uv_h);
602   vector<fixed_t> best_rgb_uv(uv_w * 3 * 1);
603   const uint64_t diff_y_threshold = static_cast<uint64_t>(3.0 * w * h);
604 
605   assert(width >= kMinDimensionIterativeConversion);
606   assert(height >= kMinDimensionIterativeConversion);
607 
608   // Import RGB samples to W/RGB representation.
609   for (int j = 0; j < height; j += 2) {
610     const int is_last_row = (j == height - 1);
611     fixed_y_t* const src1 = &tmp_buffer[0 * w];
612     fixed_y_t* const src2 = &tmp_buffer[3 * w];
613     const int rgb_off = j * stride;
614     const int y_off = j * w;
615     const int uv_off = (j >> 1) * 3 * uv_w;
616 
617     // prepare two rows of input
618     ImportOneRow(rgb + rgb_off, width, src1);
619     if (!is_last_row) {
620       ImportOneRow(rgb + rgb_off + stride, width, src2);
621     } else {
622       memcpy(src2, src1, 3 * w * sizeof(*src2));
623     }
624     StoreGray(src1, &best_y[y_off + 0], w);
625     StoreGray(src2, &best_y[y_off + w], w);
626     UpdateW(src1, &target_y[y_off + 0], w);
627     UpdateW(src2, &target_y[y_off + w], w);
628     UpdateChroma(src1, src2, &target_uv[uv_off], uv_w);
629     memcpy(&best_uv[uv_off], &target_uv[uv_off], 3 * uv_w * sizeof(best_uv[0]));
630   }
631 
632   // Iterate and resolve clipping conflicts.
633   for (int iter = 0; iter < kNumIterations; ++iter) {
634     const fixed_t* cur_uv = &best_uv[0];
635     const fixed_t* prev_uv = &best_uv[0];
636     uint64_t diff_y_sum = 0;
637 
638     for (int j = 0; j < h; j += 2) {
639       const int uv_off = (j >> 1) * 3 * uv_w;
640       fixed_y_t* const src1 = &tmp_buffer[0 * w];
641       fixed_y_t* const src2 = &tmp_buffer[3 * w];
642       const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
643       InterpolateTwoRows(&best_y[j * w], prev_uv, cur_uv, next_uv,
644                          w, src1, src2);
645       prev_uv = cur_uv;
646       cur_uv = next_uv;
647 
648       UpdateW(src1, &best_rgb_y[0 * w], w);
649       UpdateW(src2, &best_rgb_y[1 * w], w);
650       UpdateChroma(src1, src2, &best_rgb_uv[0], uv_w);
651 
652       // update two rows of Y and one row of RGB
653       diff_y_sum += kSharpUpdateY(&target_y[j * w],
654                                   &best_rgb_y[0], &best_y[j * w], 2 * w);
655       kSharpUpdateRGB(&target_uv[uv_off],
656                       &best_rgb_uv[0], &best_uv[uv_off], 3 * uv_w);
657     }
658     // test exit condition
659     if (iter > 0) {
660       if (diff_y_sum < diff_y_threshold) break;
661       if (diff_y_sum > prev_diff_y_sum) break;
662     }
663     prev_diff_y_sum = diff_y_sum;
664   }
665   // final reconstruction
666   ConvertWRGBToYUV(&best_y[0], &best_uv[0], width, height,
667                    y_plane, u_plane, v_plane);
668 }
669 
670 }  // namespace sjpeg
671 
672 ////////////////////////////////////////////////////////////////////////////////
673 // Entry point
674 
ApplySharpYUVConversion(const uint8_t * const rgb,int W,int H,int stride,uint8_t * y_plane,uint8_t * u_plane,uint8_t * v_plane)675 void sjpeg::ApplySharpYUVConversion(const uint8_t* const rgb,
676                                     int W, int H, int stride,
677                                     uint8_t* y_plane,
678                                     uint8_t* u_plane, uint8_t* v_plane) {
679   if (W <= kMinDimensionIterativeConversion ||
680       H <= kMinDimensionIterativeConversion) {
681     const int uv_w = (W + 1) >> 1;
682     for (int y = 0; y < H; y += 2) {
683       const uint8_t* const rgb1 = rgb + y * stride;
684       const uint8_t* const rgb2 = (y < H - 1) ? rgb1 + stride : rgb1;
685       ConvertRowToY(rgb1, W, &y_plane[y * W]);
686       if (y < H - 1) {
687         ConvertRowToY(rgb2, W, &y_plane[(y + 1) * W]);
688       }
689       ConvertRowToUV(rgb1, rgb2, W,
690                      &u_plane[(y >> 1) * uv_w],
691                      &v_plane[(y >> 1) * uv_w]);
692     }
693   } else {
694     PreprocessARGB(rgb, W, H, stride, y_plane, u_plane, v_plane);
695   }
696 }
697 
698 ////////////////////////////////////////////////////////////////////////////////
699