1 // Copyright 2017 Google Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Enhanced RGB->YUV conversion functions
16 //
17 // Author: Skal (pascal.massimino@gmail.com)
18
19 #include <math.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <memory>
23 #include <vector>
24 using std::vector;
25
26 #define SJPEG_NEED_ASM_HEADERS
27 #include "sjpegi.h"
28
29 namespace sjpeg {
30
31 // We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some
32 // banding sometimes. Better use extra precision.
33 #define SFIX 2 // fixed-point precision of RGB and Y/W
34 #define SHALF (1 << SFIX >> 1)
35 #define MAX_Y_T ((256 << SFIX) - 1)
36 typedef int16_t fixed_t; // signed type with extra SFIX precision for UV
37 typedef uint16_t fixed_y_t; // unsigned type with extra SFIX precision for W
38
clip_y(int y)39 static fixed_y_t clip_y(int y) {
40 return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T;
41 }
42
43 ////////////////////////////////////////////////////////////////////////////////
44 // Helper functions for Y/U/V fixed-point calculations.
45
46 // The following functions convert r/g/b values in SFIX fixed-point precision
47 // to 8b values, clipped:
48 #define YUV_FIX 16
49 #define TFIX (YUV_FIX + SFIX)
50 #define TROUNDER (1 << TFIX >> 1)
51
clip_8b(int v)52 static uint8_t clip_8b(int v) {
53 return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
54 }
55
ConvertRGBToY(int r,int g,int b)56 static uint8_t ConvertRGBToY(int r, int g, int b) {
57 const int luma = 19595 * r + 38469 * g + 7471 * b + TROUNDER;
58 return clip_8b(luma >> TFIX);
59 }
60
ConvertRGBToU(int r,int g,int b)61 static uint8_t ConvertRGBToU(int r, int g, int b) {
62 const int u = -11058 * r - 21709 * g + 32768 * b + TROUNDER;
63 return clip_8b(128 + (u >> TFIX));
64 }
65
ConvertRGBToV(int r,int g,int b)66 static uint8_t ConvertRGBToV(int r, int g, int b) {
67 const int v = +32768 * r - 27439 * g - 5328 * b + TROUNDER;
68 return clip_8b(128 + (v >> TFIX));
69 }
70
71 // convert to luma using 16b precision:
ConvertRowToY(const uint8_t * row,int w,uint8_t * const dst)72 static void ConvertRowToY(const uint8_t* row, int w, uint8_t* const dst) {
73 for (int i = 0; i < w; i += 1, row += 3) {
74 const int r = row[0], g = row[1], b = row[2];
75 const int y = 19595 * r + 38469 * g + 7471 * b;
76 dst[i] = (y + (1 << YUV_FIX >> 1)) >> YUV_FIX;
77 }
78 }
79
ConvertRowToUV(const uint8_t * row1,const uint8_t * row2,int w,uint8_t * u,uint8_t * v)80 static void ConvertRowToUV(const uint8_t* row1, const uint8_t* row2,
81 int w, uint8_t* u, uint8_t* v) {
82 for (int i = 0; i < (w & ~1); i += 2, row1 += 6, row2 += 6) {
83 const int r = row1[0] + row1[3] + row2[0] + row2[3];
84 const int g = row1[1] + row1[4] + row2[1] + row2[4];
85 const int b = row1[2] + row1[5] + row2[2] + row2[5];
86 *u++ = ConvertRGBToU(r, g, b);
87 *v++ = ConvertRGBToV(r, g, b);
88 }
89 if (w & 1) {
90 const int r = 2 * (row1[0] + row2[0]);
91 const int g = 2 * (row1[1] + row2[1]);
92 const int b = 2 * (row1[2] + row2[2]);
93 *u++ = ConvertRGBToU(r, g, b);
94 *v++ = ConvertRGBToV(r, g, b);
95 }
96 }
97
98 #undef TFIX
99 #undef ROUNDER
100
101 ////////////////////////////////////////////////////////////////////////////////
102 // Sharp RGB->YUV conversion
103
104 static const int kNumIterations = 4;
105 static const int kMinDimensionIterativeConversion = 4;
106
107 // size of the interpolation table for linear-to-gamma
108 #define GAMMA_TABLE_SIZE 32
109 static uint32_t kLinearToGammaTab[GAMMA_TABLE_SIZE + 2];
110 #define GAMMA_TO_LINEAR_BITS 14
111 static uint32_t kGammaToLinearTab[MAX_Y_T + 1]; // size scales with Y_FIX
112
InitGammaTablesF(void)113 static void InitGammaTablesF(void) {
114 static bool done = false;
115 assert(2 * GAMMA_TO_LINEAR_BITS < 32); // we use uint32_t intermediate values
116 if (!done) {
117 int v;
118 const double norm = 1. / MAX_Y_T;
119 const double scale = 1. / GAMMA_TABLE_SIZE;
120 const double a = 0.099;
121 const double thresh = 0.018;
122 const double gamma = 1. / 0.45;
123 const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
124 for (v = 0; v <= MAX_Y_T; ++v) {
125 const double g = norm * v;
126 double value;
127 if (g <= thresh * 4.5) {
128 value = g / 4.5;
129 } else {
130 const double a_rec = 1. / (1. + a);
131 value = pow(a_rec * (g + a), gamma);
132 }
133 kGammaToLinearTab[v] = static_cast<uint32_t>(value * final_scale + .5);
134 }
135 for (v = 0; v <= GAMMA_TABLE_SIZE; ++v) {
136 const double g = scale * v;
137 double value;
138 if (g <= thresh) {
139 value = 4.5 * g;
140 } else {
141 value = (1. + a) * pow(g, 1. / gamma) - a;
142 }
143 // we already incorporate the 1/2 rounding constant here
144 kLinearToGammaTab[v] =
145 static_cast<uint32_t>(MAX_Y_T * value)
146 + (1 << GAMMA_TO_LINEAR_BITS >> 1);
147 }
148 // to prevent small rounding errors to cause read-overflow:
149 kLinearToGammaTab[GAMMA_TABLE_SIZE + 1] =
150 kLinearToGammaTab[GAMMA_TABLE_SIZE];
151 done = true;
152 }
153 }
154
155 // return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
GammaToLinear(int v)156 static uint32_t GammaToLinear(int v) { return kGammaToLinearTab[v]; }
157
LinearToGamma(uint32_t value)158 static uint32_t LinearToGamma(uint32_t value) {
159 // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
160 const uint32_t v = value * GAMMA_TABLE_SIZE;
161 const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;
162 // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
163 const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS); // fractional part
164 // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
165 const uint32_t v0 = kLinearToGammaTab[tab_pos + 0];
166 const uint32_t v1 = kLinearToGammaTab[tab_pos + 1];
167 // Final interpolation. Note that rounding is already included.
168 const uint32_t v2 = (v1 - v0) * x; // note: v1 >= v0.
169 const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
170 return result;
171 }
172
173 //------------------------------------------------------------------------------
174
SharpUpdateY_C(const uint16_t * ref,const uint16_t * src,uint16_t * dst,int len)175 static uint64_t SharpUpdateY_C(const uint16_t* ref, const uint16_t* src,
176 uint16_t* dst, int len) {
177 uint64_t diff = 0;
178 for (int i = 0; i < len; ++i) {
179 const int diff_y = ref[i] - src[i];
180 const int new_y = static_cast<int>(dst[i]) + diff_y;
181 dst[i] = clip_y(new_y);
182 diff += (uint64_t)abs(diff_y);
183 }
184 return diff;
185 }
186
SharpUpdateRGB_C(const int16_t * ref,const int16_t * src,int16_t * dst,int len)187 static void SharpUpdateRGB_C(const int16_t* ref, const int16_t* src,
188 int16_t* dst, int len) {
189 for (int i = 0; i < len; ++i) {
190 const int diff_uv = ref[i] - src[i];
191 dst[i] += diff_uv;
192 }
193 }
194
SharpFilterRow_C(const int16_t * A,const int16_t * B,int len,const uint16_t * best_y,uint16_t * out)195 static void SharpFilterRow_C(const int16_t* A, const int16_t* B, int len,
196 const uint16_t* best_y, uint16_t* out) {
197 for (int i = 0; i < len; ++i, ++A, ++B) {
198 const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
199 const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
200 out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
201 out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
202 }
203 }
204
205 #if defined(SJPEG_USE_SSE2)
206
207 #define LOAD_16(P) (_mm_loadu_si128(reinterpret_cast<const __m128i*>(P)))
208 #define STORE_16(P, V) (_mm_storeu_si128(reinterpret_cast<__m128i*>(P), (V)))
209
SharpUpdateY_SSE2(const uint16_t * ref,const uint16_t * src,uint16_t * dst,int len)210 static uint64_t SharpUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
211 uint16_t* dst, int len) {
212 uint64_t diff = 0;
213 uint32_t tmp[4];
214 int i;
215 const __m128i zero = _mm_setzero_si128();
216 const __m128i max = _mm_set1_epi16(MAX_Y_T);
217 const __m128i one = _mm_set1_epi16(1);
218 __m128i sum = zero;
219
220 for (i = 0; i + 8 <= len; i += 8) {
221 const __m128i A = LOAD_16(ref + i);
222 const __m128i B = LOAD_16(src + i);
223 const __m128i C = LOAD_16(dst + i);
224 const __m128i D = _mm_sub_epi16(A, B); // diff_y
225 const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)
226 const __m128i F = _mm_add_epi16(C, D); // new_y
227 const __m128i G = _mm_or_si128(E, one); // -1 or 1
228 const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
229 const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))
230 STORE_16(dst + i, H);
231 sum = _mm_add_epi32(sum, I);
232 }
233 STORE_16(tmp, sum);
234 diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
235 for (; i < len; ++i) {
236 const int diff_y = ref[i] - src[i];
237 const int new_y = static_cast<int>(dst[i]) + diff_y;
238 dst[i] = clip_y(new_y);
239 diff += (uint64_t)abs(diff_y);
240 }
241 return diff;
242 }
243
SharpUpdateRGB_SSE2(const int16_t * ref,const int16_t * src,int16_t * dst,int len)244 static void SharpUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
245 int16_t* dst, int len) {
246 int i = 0;
247 for (i = 0; i + 8 <= len; i += 8) {
248 const __m128i A = LOAD_16(ref + i);
249 const __m128i B = LOAD_16(src + i);
250 const __m128i C = LOAD_16(dst + i);
251 const __m128i D = _mm_sub_epi16(A, B); // diff_uv
252 const __m128i E = _mm_add_epi16(C, D); // new_uv
253 STORE_16(dst + i, E);
254 }
255 for (; i < len; ++i) {
256 const int diff_uv = ref[i] - src[i];
257 dst[i] += diff_uv;
258 }
259 }
260
SharpFilterRow_SSE2(const int16_t * A,const int16_t * B,int len,const uint16_t * best_y,uint16_t * out)261 static void SharpFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
262 const uint16_t* best_y, uint16_t* out) {
263 int i;
264 const __m128i kCst8 = _mm_set1_epi16(8);
265 const __m128i max = _mm_set1_epi16(MAX_Y_T);
266 const __m128i zero = _mm_setzero_si128();
267 for (i = 0; i + 8 <= len; i += 8) {
268 const __m128i a0 = LOAD_16(A + i + 0);
269 const __m128i a1 = LOAD_16(A + i + 1);
270 const __m128i b0 = LOAD_16(B + i + 0);
271 const __m128i b1 = LOAD_16(B + i + 1);
272 const __m128i a0b1 = _mm_add_epi16(a0, b1);
273 const __m128i a1b0 = _mm_add_epi16(a1, b0);
274 const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1
275 const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
276 const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1)
277 const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0)
278 const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
279 const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
280 const __m128i d0 = _mm_add_epi16(c1, a0);
281 const __m128i d1 = _mm_add_epi16(c0, a1);
282 const __m128i e0 = _mm_srai_epi16(d0, 1);
283 const __m128i e1 = _mm_srai_epi16(d1, 1);
284 const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
285 const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
286 const __m128i g0 = LOAD_16(best_y + 2 * i + 0);
287 const __m128i g1 = LOAD_16(best_y + 2 * i + 8);
288 const __m128i h0 = _mm_add_epi16(g0, f0);
289 const __m128i h1 = _mm_add_epi16(g1, f1);
290 const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
291 const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
292 STORE_16(out + 2 * i + 0, i0);
293 STORE_16(out + 2 * i + 8, i1);
294 }
295 for (; i < len; ++i) {
296 // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
297 // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
298 // We reuse the common sub-expressions.
299 const int a0b1 = A[i + 0] + B[i + 1];
300 const int a1b0 = A[i + 1] + B[i + 0];
301 const int a0a1b0b1 = a0b1 + a1b0 + 8;
302 const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
303 const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
304 out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
305 out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
306 }
307 }
308 #undef STORE_16
309 #undef LOAD_16
310
311 #elif defined(SJPEG_USE_NEON)
312
SharpUpdateY_NEON(const uint16_t * ref,const uint16_t * src,uint16_t * dst,int len)313 static uint64_t SharpUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
314 uint16_t* dst, int len) {
315 int i;
316 const int16x8_t zero = vdupq_n_s16(0);
317 const int16x8_t max = vdupq_n_s16(MAX_Y_T);
318 uint64x2_t sum = vdupq_n_u64(0);
319
320 for (i = 0; i + 8 <= len; i += 8) {
321 const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
322 const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
323 const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
324 const int16x8_t D = vsubq_s16(A, B); // diff_y
325 const int16x8_t F = vaddq_s16(C, D); // new_y
326 const uint16x8_t H =
327 vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
328 const int16x8_t I = vabsq_s16(D); // abs(diff_y)
329 vst1q_u16(dst + i, H);
330 sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
331 }
332 uint64_t diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
333 for (; i < len; ++i) {
334 const int diff_y = ref[i] - src[i];
335 const int new_y = static_cast<int>(dst[i]) + diff_y;
336 dst[i] = clip_y(new_y);
337 diff += static_cast<uint64_t>(abs(diff_y));
338 }
339 return diff;
340 }
341
SharpUpdateRGB_NEON(const int16_t * ref,const int16_t * src,int16_t * dst,int len)342 static void SharpUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
343 int16_t* dst, int len) {
344 int i;
345 for (i = 0; i + 8 <= len; i += 8) {
346 const int16x8_t A = vld1q_s16(ref + i);
347 const int16x8_t B = vld1q_s16(src + i);
348 const int16x8_t C = vld1q_s16(dst + i);
349 const int16x8_t D = vsubq_s16(A, B); // diff_uv
350 const int16x8_t E = vaddq_s16(C, D); // new_uv
351 vst1q_s16(dst + i, E);
352 }
353 for (; i < len; ++i) {
354 const int diff_uv = ref[i] - src[i];
355 dst[i] += diff_uv;
356 }
357 }
358
SharpFilterRow_NEON(const int16_t * A,const int16_t * B,int len,const uint16_t * best_y,uint16_t * out)359 static void SharpFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
360 const uint16_t* best_y, uint16_t* out) {
361 int i;
362 const int16x8_t max = vdupq_n_s16(MAX_Y_T);
363 const int16x8_t zero = vdupq_n_s16(0);
364 for (i = 0; i + 8 <= len; i += 8) {
365 const int16x8_t a0 = vld1q_s16(A + i + 0);
366 const int16x8_t a1 = vld1q_s16(A + i + 1);
367 const int16x8_t b0 = vld1q_s16(B + i + 0);
368 const int16x8_t b1 = vld1q_s16(B + i + 1);
369 const int16x8_t a0b1 = vaddq_s16(a0, b1);
370 const int16x8_t a1b0 = vaddq_s16(a1, b0);
371 const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1
372 const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1)
373 const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0)
374 const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
375 const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
376 const int16x8_t d0 = vaddq_s16(c1, a0);
377 const int16x8_t d1 = vaddq_s16(c0, a1);
378 const int16x8_t e0 = vrshrq_n_s16(d0, 1);
379 const int16x8_t e1 = vrshrq_n_s16(d1, 1);
380 const int16x8x2_t f = vzipq_s16(e0, e1);
381 const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
382 const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
383 const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
384 const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
385 const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
386 const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
387 vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
388 vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
389 }
390 for (; i < len; ++i) {
391 const int a0b1 = A[i + 0] + B[i + 1];
392 const int a1b0 = A[i + 1] + B[i + 0];
393 const int a0a1b0b1 = a0b1 + a1b0 + 8;
394 const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
395 const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
396 out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
397 out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
398 }
399 }
400
401 #endif // SJPEG_USE_NEON
402
403 static uint64_t (*kSharpUpdateY)(const uint16_t* src, const uint16_t* ref,
404 uint16_t* dst, int len);
405 static void (*kSharpUpdateRGB)(const int16_t* src, const int16_t* ref,
406 int16_t* dst, int len);
407 static void (*kSharpFilterRow)(const int16_t* A, const int16_t* B,
408 int len, const uint16_t* best_y, uint16_t* out);
409
InitFunctionPointers()410 static void InitFunctionPointers() {
411 static bool done = false;
412 if (!done) {
413 kSharpUpdateY = SharpUpdateY_C;
414 kSharpUpdateRGB = SharpUpdateRGB_C;
415 kSharpFilterRow = SharpFilterRow_C;
416 #if defined(SJPEG_USE_SSE2)
417 if (sjpeg::SupportsSSE2()) {
418 kSharpUpdateY = SharpUpdateY_SSE2;
419 kSharpUpdateRGB = SharpUpdateRGB_SSE2;
420 kSharpFilterRow = SharpFilterRow_SSE2;
421 }
422 #endif
423 #if defined(SJPEG_USE_NEON)
424 if (sjpeg::SupportsNEON()) {
425 kSharpUpdateY = SharpUpdateY_NEON;
426 kSharpUpdateRGB = SharpUpdateRGB_NEON;
427 kSharpFilterRow = SharpFilterRow_NEON;
428 }
429 #endif
430 done = true;
431 }
432 }
433
434 //------------------------------------------------------------------------------
435
RGBToGray(uint32_t r,uint32_t g,uint32_t b)436 static uint32_t RGBToGray(uint32_t r, uint32_t g, uint32_t b) {
437 const uint32_t luma = 13933 * r + 46871 * g + 4732 * b + (1u << YUV_FIX >> 1);
438 return (luma >> YUV_FIX);
439 }
440
ScaleDown(int a,int b,int c,int d)441 static uint32_t ScaleDown(int a, int b, int c, int d) {
442 const uint32_t A = GammaToLinear(a);
443 const uint32_t B = GammaToLinear(b);
444 const uint32_t C = GammaToLinear(c);
445 const uint32_t D = GammaToLinear(d);
446 return LinearToGamma((A + B + C + D + 2) >> 2);
447 }
448
UpdateChroma(const fixed_y_t * src1,const fixed_y_t * src2,fixed_t * dst,size_t uv_w)449 static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
450 fixed_t* dst, size_t uv_w) {
451 for (size_t i = 0; i < uv_w; ++i) {
452 const uint32_t r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
453 src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
454 const uint32_t g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
455 src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
456 const uint32_t b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
457 src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
458 const int W = RGBToGray(r, g, b);
459 dst[0 * uv_w] = (fixed_t)(r - W);
460 dst[1 * uv_w] = (fixed_t)(g - W);
461 dst[2 * uv_w] = (fixed_t)(b - W);
462 dst += 1;
463 src1 += 2;
464 src2 += 2;
465 }
466 }
467
UpdateW(const fixed_y_t * src,fixed_y_t * dst,int w)468 static void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
469 for (int i = 0; i < w; ++i) {
470 const uint32_t R = GammaToLinear(src[0 * w + i]);
471 const uint32_t G = GammaToLinear(src[1 * w + i]);
472 const uint32_t B = GammaToLinear(src[2 * w + i]);
473 const uint32_t Y = RGBToGray(R, G, B);
474 dst[i] = (fixed_y_t)LinearToGamma(Y);
475 }
476 }
477
StoreGray(const fixed_y_t * const rgb,fixed_y_t * const y,int w)478 static void StoreGray(const fixed_y_t* const rgb, fixed_y_t* const y, int w) {
479 for (int i = 0; i < w; ++i) {
480 y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
481 }
482 }
483
484 //------------------------------------------------------------------------------
485
Filter2(int A,int B,int W0)486 static fixed_y_t Filter2(int A, int B, int W0) {
487 const int v0 = (A * 3 + B + 2) >> 2;
488 return clip_y(v0 + W0);
489 }
490
491 //------------------------------------------------------------------------------
492
UpLift(uint8_t a)493 static fixed_y_t UpLift(uint8_t a) { // 8bit -> SFIX
494 return ((fixed_y_t)a << SFIX) | SHALF;
495 }
496
ImportOneRow(const uint8_t * const rgb,int pic_width,fixed_y_t * const dst)497 static void ImportOneRow(const uint8_t* const rgb, int pic_width,
498 fixed_y_t* const dst) {
499 const int w = (pic_width + 1) & ~1;
500 for (int i = 0; i < pic_width; ++i) {
501 const int off = i * 3;
502 dst[i + 0 * w] = UpLift(rgb[off + 0]);
503 dst[i + 1 * w] = UpLift(rgb[off + 1]);
504 dst[i + 2 * w] = UpLift(rgb[off + 2]);
505 }
506 if (pic_width & 1) { // replicate rightmost pixel
507 dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
508 dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
509 dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
510 }
511 }
512
InterpolateTwoRows(const fixed_y_t * const best_y,const fixed_t * prev_uv,const fixed_t * cur_uv,const fixed_t * next_uv,int w,fixed_y_t * out1,fixed_y_t * out2)513 static void InterpolateTwoRows(const fixed_y_t* const best_y,
514 const fixed_t* prev_uv,
515 const fixed_t* cur_uv,
516 const fixed_t* next_uv,
517 int w,
518 fixed_y_t* out1, fixed_y_t* out2) {
519 const int uv_w = w >> 1;
520 const int len = (w - 1) >> 1; // length to filter
521 for (int k = 3; k > 0; --k) { // process each R/G/B segments in turn
522 // special boundary case for i==0
523 out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
524 out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);
525
526 kSharpFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1);
527 kSharpFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1);
528
529 // special boundary case for i == w - 1 when w is even
530 if (!(w & 1)) {
531 out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
532 best_y[w - 1 + 0]);
533 out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
534 best_y[w - 1 + w]);
535 }
536 out1 += w;
537 out2 += w;
538 prev_uv += uv_w;
539 cur_uv += uv_w;
540 next_uv += uv_w;
541 }
542 }
543
ConvertWRGBToYUV(const fixed_y_t * best_y,const fixed_t * best_uv,int width,int height,uint8_t * y_plane,uint8_t * u_plane,uint8_t * v_plane)544 static void ConvertWRGBToYUV(const fixed_y_t* best_y,
545 const fixed_t* best_uv,
546 int width, int height,
547 uint8_t* y_plane,
548 uint8_t* u_plane, uint8_t* v_plane) {
549 const int w = (width + 1) & ~1;
550 const int h = (height + 1) & ~1;
551 const int uv_w = w >> 1;
552 const int uv_h = h >> 1;
553 for (int j = 0; j < height; ++j) {
554 const int off = (j >> 1) * 3 * uv_w;
555 for (int i = 0; i < width; ++i) {
556 const int W = best_y[i + j * w];
557 const int r = best_uv[off + (i >> 1) + 0 * uv_w] + W;
558 const int g = best_uv[off + (i >> 1) + 1 * uv_w] + W;
559 const int b = best_uv[off + (i >> 1) + 2 * uv_w] + W;
560 y_plane[i] = ConvertRGBToY(r, g, b);
561 }
562 y_plane += width;
563 }
564 for (int j = 0; j < uv_h; ++j) {
565 for (int i = 0; i < uv_w; ++i) {
566 const int off = i + j * 3 * uv_w;
567 const int r = best_uv[off + 0 * uv_w];
568 const int g = best_uv[off + 1 * uv_w];
569 const int b = best_uv[off + 2 * uv_w];
570 u_plane[i] = ConvertRGBToU(r, g, b);
571 v_plane[i] = ConvertRGBToV(r, g, b);
572 }
573 u_plane += uv_w;
574 v_plane += uv_w;
575 }
576 }
577
578 //------------------------------------------------------------------------------
579 // Main function
580
PreprocessARGB(const uint8_t * const rgb,int width,int height,size_t stride,uint8_t * y_plane,uint8_t * u_plane,uint8_t * v_plane)581 static void PreprocessARGB(const uint8_t* const rgb,
582 int width, int height, size_t stride,
583 uint8_t* y_plane,
584 uint8_t* u_plane, uint8_t* v_plane) {
585 // we expand the right/bottom border if needed
586 const int w = (width + 1) & ~1;
587 const int h = (height + 1) & ~1;
588 const int uv_w = w >> 1;
589 const int uv_h = h >> 1;
590 uint64_t prev_diff_y_sum = ~0;
591
592 InitGammaTablesF();
593 InitFunctionPointers();
594
595 // TODO(skal): allocate one big memory chunk instead.
596 vector<fixed_y_t> tmp_buffer(w * 3 * 2);
597 vector<fixed_y_t> best_y(w * h);
598 vector<fixed_y_t> target_y(w * h);
599 vector<fixed_y_t> best_rgb_y(w * 2);
600 vector<fixed_t> best_uv(uv_w * 3 * uv_h);
601 vector<fixed_t> target_uv(uv_w * 3 * uv_h);
602 vector<fixed_t> best_rgb_uv(uv_w * 3 * 1);
603 const uint64_t diff_y_threshold = static_cast<uint64_t>(3.0 * w * h);
604
605 assert(width >= kMinDimensionIterativeConversion);
606 assert(height >= kMinDimensionIterativeConversion);
607
608 // Import RGB samples to W/RGB representation.
609 for (int j = 0; j < height; j += 2) {
610 const int is_last_row = (j == height - 1);
611 fixed_y_t* const src1 = &tmp_buffer[0 * w];
612 fixed_y_t* const src2 = &tmp_buffer[3 * w];
613 const int rgb_off = j * stride;
614 const int y_off = j * w;
615 const int uv_off = (j >> 1) * 3 * uv_w;
616
617 // prepare two rows of input
618 ImportOneRow(rgb + rgb_off, width, src1);
619 if (!is_last_row) {
620 ImportOneRow(rgb + rgb_off + stride, width, src2);
621 } else {
622 memcpy(src2, src1, 3 * w * sizeof(*src2));
623 }
624 StoreGray(src1, &best_y[y_off + 0], w);
625 StoreGray(src2, &best_y[y_off + w], w);
626 UpdateW(src1, &target_y[y_off + 0], w);
627 UpdateW(src2, &target_y[y_off + w], w);
628 UpdateChroma(src1, src2, &target_uv[uv_off], uv_w);
629 memcpy(&best_uv[uv_off], &target_uv[uv_off], 3 * uv_w * sizeof(best_uv[0]));
630 }
631
632 // Iterate and resolve clipping conflicts.
633 for (int iter = 0; iter < kNumIterations; ++iter) {
634 const fixed_t* cur_uv = &best_uv[0];
635 const fixed_t* prev_uv = &best_uv[0];
636 uint64_t diff_y_sum = 0;
637
638 for (int j = 0; j < h; j += 2) {
639 const int uv_off = (j >> 1) * 3 * uv_w;
640 fixed_y_t* const src1 = &tmp_buffer[0 * w];
641 fixed_y_t* const src2 = &tmp_buffer[3 * w];
642 const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
643 InterpolateTwoRows(&best_y[j * w], prev_uv, cur_uv, next_uv,
644 w, src1, src2);
645 prev_uv = cur_uv;
646 cur_uv = next_uv;
647
648 UpdateW(src1, &best_rgb_y[0 * w], w);
649 UpdateW(src2, &best_rgb_y[1 * w], w);
650 UpdateChroma(src1, src2, &best_rgb_uv[0], uv_w);
651
652 // update two rows of Y and one row of RGB
653 diff_y_sum += kSharpUpdateY(&target_y[j * w],
654 &best_rgb_y[0], &best_y[j * w], 2 * w);
655 kSharpUpdateRGB(&target_uv[uv_off],
656 &best_rgb_uv[0], &best_uv[uv_off], 3 * uv_w);
657 }
658 // test exit condition
659 if (iter > 0) {
660 if (diff_y_sum < diff_y_threshold) break;
661 if (diff_y_sum > prev_diff_y_sum) break;
662 }
663 prev_diff_y_sum = diff_y_sum;
664 }
665 // final reconstruction
666 ConvertWRGBToYUV(&best_y[0], &best_uv[0], width, height,
667 y_plane, u_plane, v_plane);
668 }
669
670 } // namespace sjpeg
671
672 ////////////////////////////////////////////////////////////////////////////////
673 // Entry point
674
ApplySharpYUVConversion(const uint8_t * const rgb,int W,int H,int stride,uint8_t * y_plane,uint8_t * u_plane,uint8_t * v_plane)675 void sjpeg::ApplySharpYUVConversion(const uint8_t* const rgb,
676 int W, int H, int stride,
677 uint8_t* y_plane,
678 uint8_t* u_plane, uint8_t* v_plane) {
679 if (W <= kMinDimensionIterativeConversion ||
680 H <= kMinDimensionIterativeConversion) {
681 const int uv_w = (W + 1) >> 1;
682 for (int y = 0; y < H; y += 2) {
683 const uint8_t* const rgb1 = rgb + y * stride;
684 const uint8_t* const rgb2 = (y < H - 1) ? rgb1 + stride : rgb1;
685 ConvertRowToY(rgb1, W, &y_plane[y * W]);
686 if (y < H - 1) {
687 ConvertRowToY(rgb2, W, &y_plane[(y + 1) * W]);
688 }
689 ConvertRowToUV(rgb1, rgb2, W,
690 &u_plane[(y >> 1) * uv_w],
691 &v_plane[(y >> 1) * uv_w]);
692 }
693 } else {
694 PreprocessARGB(rgb, W, H, stride, y_plane, u_plane, v_plane);
695 }
696 }
697
698 ////////////////////////////////////////////////////////////////////////////////
699