/*****************************************************************************
 * This file is part of Kvazaar HEVC encoder.
 *
 * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice, this
 *   list of conditions and the following disclaimer in the documentation and/or
 *   other materials provided with the distribution.
 *
 * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ****************************************************************************/

#include "strategies/generic/picture-generic.h"

#include <stdlib.h>

#include "strategies/strategies-picture.h"
#include "strategyselector.h"
// Function to clip int16_t to pixel. (0-255 or 0-1023)
// Assumes PIXEL_MAX to be 2^n-1
kvz_pixel kvz_fast_clip_16bit_to_pixel(int16_t value)
{
  // Ensure that compiler generates arithmetic shift from ">>"
#if defined(_MSC_VER) || defined(__GNUC__) || defined(__clang__)

  if (value & ~PIXEL_MAX) {
    int16_t temp = (-value) >> 15;
#if KVZ_BIT_DEPTH == 10
    temp &= PIXEL_MAX;
#endif
    return temp;
  }
  else {
    return value;
  }
#else
  return CLIP(PIXEL_MIN, PIXEL_MAX, value);
#endif
}
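
/* Worked example of the branchless clip above (illustration only): a value
 * is out of range exactly when it has bits set outside PIXEL_MAX. At 8-bit
 * depth (PIXEL_MAX == 255), value == 300 takes the branch and
 * (-300) >> 15 == -1 (all bits set), which truncates to 255 in the 8-bit
 * kvz_pixel return type, while value == -3 gives (3) >> 15 == 0. At 10-bit
 * depth the extra "temp &= PIXEL_MAX" mask yields 1023 for overflows
 * instead of relying on truncation. */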

// Function to clip int32_t to pixel. (0-255 or 0-1023)
// Assumes PIXEL_MAX to be 2^n-1
kvz_pixel kvz_fast_clip_32bit_to_pixel(int32_t value)
{
  // Ensure that compiler generates arithmetic shift from ">>"
#if defined(_MSC_VER) || defined(__GNUC__) || defined(__clang__)

  if (value & ~PIXEL_MAX) {
    int32_t temp = (-value) >> 31;
#if KVZ_BIT_DEPTH == 10
    temp &= PIXEL_MAX;
#endif
    return temp;
  }
  else {
    return value;
  }
#else
  return CLIP(PIXEL_MIN, PIXEL_MAX, value);
#endif
}

/**
 * \brief Calculate Sum of Absolute Differences (SAD)
 *
 * Calculate Sum of Absolute Differences (SAD) between two rectangular regions
 * located in arbitrary points in the picture.
 *
 * \param data1    Starting point of the first picture.
 * \param data2    Starting point of the second picture.
 * \param width    Width of the region for which SAD is calculated.
 * \param height   Height of the region for which SAD is calculated.
 * \param stride1  Stride of the first pixel array.
 * \param stride2  Stride of the second pixel array.
 *
 * \returns Sum of Absolute Differences
 */
static unsigned reg_sad_generic(const kvz_pixel * const data1, const kvz_pixel * const data2,
                         const int width, const int height, const unsigned stride1, const unsigned stride2)
{
  int y, x;
  unsigned sad = 0;

  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
    }
  }

  return sad;
}
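
/* Usage sketch (hypothetical buffer names, for illustration only): once
 * registered below, this implementation runs behind the kvz_reg_sad strategy
 * pointer. The SAD of an 8x8 region at offset (x, y) in two frames sharing
 * the stride frame_width would be
 *
 *   kvz_reg_sad(&frame1[y * frame_width + x],
 *               &frame2[y * frame_width + x],
 *               8, 8, frame_width, frame_width);
 */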

/**
 * \brief  Transform differences between two 4x4 blocks.
 * From HM 13.0
 */
static int32_t hadamard_4x4_generic(int32_t diff[4*4])
{
  int32_t m[4 * 4];
  m[0] = diff[0] + diff[12];
  m[1] = diff[1] + diff[13];
  m[2] = diff[2] + diff[14];
  m[3] = diff[3] + diff[15];
  m[4] = diff[4] + diff[8];
  m[5] = diff[5] + diff[9];
  m[6] = diff[6] + diff[10];
  m[7] = diff[7] + diff[11];
  m[8] = diff[4] - diff[8];
  m[9] = diff[5] - diff[9];
  m[10] = diff[6] - diff[10];
  m[11] = diff[7] - diff[11];
  m[12] = diff[0] - diff[12];
  m[13] = diff[1] - diff[13];
  m[14] = diff[2] - diff[14];
  m[15] = diff[3] - diff[15];

  int32_t d[4 * 4];
  d[0] = m[0] + m[4];
  d[1] = m[1] + m[5];
  d[2] = m[2] + m[6];
  d[3] = m[3] + m[7];
  d[4] = m[8] + m[12];
  d[5] = m[9] + m[13];
  d[6] = m[10] + m[14];
  d[7] = m[11] + m[15];
  d[8] = m[0] - m[4];
  d[9] = m[1] - m[5];
  d[10] = m[2] - m[6];
  d[11] = m[3] - m[7];
  d[12] = m[12] - m[8];
  d[13] = m[13] - m[9];
  d[14] = m[14] - m[10];
  d[15] = m[15] - m[11];

  m[0] = d[0] + d[3];
  m[1] = d[1] + d[2];
  m[2] = d[1] - d[2];
  m[3] = d[0] - d[3];
  m[4] = d[4] + d[7];
  m[5] = d[5] + d[6];
  m[6] = d[5] - d[6];
  m[7] = d[4] - d[7];
  m[8] = d[8] + d[11];
  m[9] = d[9] + d[10];
  m[10] = d[9] - d[10];
  m[11] = d[8] - d[11];
  m[12] = d[12] + d[15];
  m[13] = d[13] + d[14];
  m[14] = d[13] - d[14];
  m[15] = d[12] - d[15];

  d[0] = m[0] + m[1];
  d[1] = m[0] - m[1];
  d[2] = m[2] + m[3];
  d[3] = m[3] - m[2];
  d[4] = m[4] + m[5];
  d[5] = m[4] - m[5];
  d[6] = m[6] + m[7];
  d[7] = m[7] - m[6];
  d[8] = m[8] + m[9];
  d[9] = m[8] - m[9];
  d[10] = m[10] + m[11];
  d[11] = m[11] - m[10];
  d[12] = m[12] + m[13];
  d[13] = m[12] - m[13];
  d[14] = m[14] + m[15];
  d[15] = m[15] - m[14];

  int32_t satd = 0;
  for (int i = 0; i < 16; i++) {
    satd += abs(d[i]);
  }
  satd = ((satd + 1) >> 1);

  return satd;
}
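
/* The four butterfly stages above amount to the 2-D transform H * D * H for
 * the 4x4 difference block D, where H is the (symmetric, unnormalized) 4x4
 * Hadamard matrix
 *
 *       [ 1  1  1  1 ]
 *   H = [ 1  1 -1 -1 ]
 *       [ 1 -1 -1  1 ]
 *       [ 1 -1  1 -1 ]
 *
 * up to a permutation of the output coefficients, which does not affect the
 * absolute sum. The sum is then halved with rounding, (satd + 1) >> 1, the
 * scaling HM 13.0 uses to keep SATD on a scale comparable with SAD. */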

/**
 * \brief  Calculate SATD between two 4x4 blocks.
 */
static unsigned satd_4x4_generic(const kvz_pixel *piOrg, const kvz_pixel *piCur)
{
  int32_t diff[4 * 4];
  for (int i = 0; i < 4 * 4; i++) {
    diff[i] = piOrg[i] - piCur[i];
  }
  return hadamard_4x4_generic(diff);
}

/**
 * \brief  Calculate SATD between two 4x4 blocks inside bigger arrays.
 */
unsigned kvz_satd_4x4_subblock_generic(const kvz_pixel * buf1,
                                       const int32_t     stride1,
                                       const kvz_pixel * buf2,
                                       const int32_t     stride2)
{
  int32_t diff[4 * 4];
  for (int y = 0; y < 4; y++) {
    for (int x = 0; x < 4; x++) {
      diff[x + y * 4] = buf1[x + y * stride1] - buf2[x + y * stride2];
    }
  }
  return hadamard_4x4_generic(diff);
}

void kvz_satd_4x4_subblock_quad_generic(const kvz_pixel *preds[4],
                                       const int stride,
                                       const kvz_pixel *orig,
                                       const int orig_stride,
                                       unsigned costs[4])
{
  int32_t diff[4][4 * 4];
  for (int y = 0; y < 4; y++) {
    for (int x = 0; x < 4; x++) {
      diff[0][x + y * 4] = orig[x + y * orig_stride] - preds[0][x + y * stride];
      diff[1][x + y * 4] = orig[x + y * orig_stride] - preds[1][x + y * stride];
      diff[2][x + y * 4] = orig[x + y * orig_stride] - preds[2][x + y * stride];
      diff[3][x + y * 4] = orig[x + y * orig_stride] - preds[3][x + y * stride];
    }
  }

  costs[0] = hadamard_4x4_generic(diff[0]);
  costs[1] = hadamard_4x4_generic(diff[1]);
  costs[2] = hadamard_4x4_generic(diff[2]);
  costs[3] = hadamard_4x4_generic(diff[3]);
}

/**
 * \brief  Calculate SATD between two 8x8 blocks inside bigger arrays.
 */
static unsigned satd_8x8_subblock_generic(const kvz_pixel * piOrg, const int32_t iStrideOrg,
  const kvz_pixel * piCur, const int32_t iStrideCur)
{
  int32_t k, i, j, jj, sad = 0;
  int32_t diff[64], m1[8][8], m2[8][8], m3[8][8];

  for (k = 0; k < 64; k += 8) {
    diff[k + 0] = piOrg[0] - piCur[0];
    diff[k + 1] = piOrg[1] - piCur[1];
    diff[k + 2] = piOrg[2] - piCur[2];
    diff[k + 3] = piOrg[3] - piCur[3];
    diff[k + 4] = piOrg[4] - piCur[4];
    diff[k + 5] = piOrg[5] - piCur[5];
    diff[k + 6] = piOrg[6] - piCur[6];
    diff[k + 7] = piOrg[7] - piCur[7];

    piCur += iStrideCur;
    piOrg += iStrideOrg;
  }

  // horizontal
  for (j = 0; j < 8; ++j) {
    jj = j << 3;
    m2[j][0] = diff[jj] + diff[jj + 4];
    m2[j][1] = diff[jj + 1] + diff[jj + 5];
    m2[j][2] = diff[jj + 2] + diff[jj + 6];
    m2[j][3] = diff[jj + 3] + diff[jj + 7];
    m2[j][4] = diff[jj] - diff[jj + 4];
    m2[j][5] = diff[jj + 1] - diff[jj + 5];
    m2[j][6] = diff[jj + 2] - diff[jj + 6];
    m2[j][7] = diff[jj + 3] - diff[jj + 7];

    m1[j][0] = m2[j][0] + m2[j][2];
    m1[j][1] = m2[j][1] + m2[j][3];
    m1[j][2] = m2[j][0] - m2[j][2];
    m1[j][3] = m2[j][1] - m2[j][3];
    m1[j][4] = m2[j][4] + m2[j][6];
    m1[j][5] = m2[j][5] + m2[j][7];
    m1[j][6] = m2[j][4] - m2[j][6];
    m1[j][7] = m2[j][5] - m2[j][7];

    m2[j][0] = m1[j][0] + m1[j][1];
    m2[j][1] = m1[j][0] - m1[j][1];
    m2[j][2] = m1[j][2] + m1[j][3];
    m2[j][3] = m1[j][2] - m1[j][3];
    m2[j][4] = m1[j][4] + m1[j][5];
    m2[j][5] = m1[j][4] - m1[j][5];
    m2[j][6] = m1[j][6] + m1[j][7];
    m2[j][7] = m1[j][6] - m1[j][7];
  }

  // vertical
  for (i = 0; i < 8; ++i) {
    m3[0][i] = m2[0][i] + m2[4][i];
    m3[1][i] = m2[1][i] + m2[5][i];
    m3[2][i] = m2[2][i] + m2[6][i];
    m3[3][i] = m2[3][i] + m2[7][i];
    m3[4][i] = m2[0][i] - m2[4][i];
    m3[5][i] = m2[1][i] - m2[5][i];
    m3[6][i] = m2[2][i] - m2[6][i];
    m3[7][i] = m2[3][i] - m2[7][i];

    m1[0][i] = m3[0][i] + m3[2][i];
    m1[1][i] = m3[1][i] + m3[3][i];
    m1[2][i] = m3[0][i] - m3[2][i];
    m1[3][i] = m3[1][i] - m3[3][i];
    m1[4][i] = m3[4][i] + m3[6][i];
    m1[5][i] = m3[5][i] + m3[7][i];
    m1[6][i] = m3[4][i] - m3[6][i];
    m1[7][i] = m3[5][i] - m3[7][i];

    m2[0][i] = m1[0][i] + m1[1][i];
    m2[1][i] = m1[0][i] - m1[1][i];
    m2[2][i] = m1[2][i] + m1[3][i];
    m2[3][i] = m1[2][i] - m1[3][i];
    m2[4][i] = m1[4][i] + m1[5][i];
    m2[5][i] = m1[4][i] - m1[5][i];
    m2[6][i] = m1[6][i] + m1[7][i];
    m2[7][i] = m1[6][i] - m1[7][i];
  }

  // Sum the absolute coefficients, indexing the 8x8 array in flattened order.
  for (i = 0; i < 64; ++i) {
    sad += abs(m2[i >> 3][i & 7]);
  }

  sad = (sad + 2) >> 2;

  return sad;
}
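
/* As with the 4x4 transform, the absolute coefficient sum is rescaled, here
 * with (sad + 2) >> 2, i.e. divided with rounding by half the transform
 * size (8 / 2 == 4). This follows HM and keeps the 8x8 SATD on a scale
 * comparable with the 4x4 variant and with plain SAD. */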

static void satd_8x8_subblock_quad_generic(const kvz_pixel **preds,
                                       const int stride,
                                       const kvz_pixel *orig,
                                       const int orig_stride,
                                       unsigned *costs)
{
  costs[0] = satd_8x8_subblock_generic(orig, orig_stride, preds[0], stride);
  costs[1] = satd_8x8_subblock_generic(orig, orig_stride, preds[1], stride);
  costs[2] = satd_8x8_subblock_generic(orig, orig_stride, preds[2], stride);
  costs[3] = satd_8x8_subblock_generic(orig, orig_stride, preds[3], stride);
}

// These macros define satd_NxN for N = 8, 16, 32, 64
SATD_NxN(generic,  8)
SATD_NxN(generic, 16)
SATD_NxN(generic, 32)
SATD_NxN(generic, 64)
SATD_ANY_SIZE(generic)


// Declare these functions to make sure the signature of the macro matches.
static cost_pixel_nxn_multi_func satd_4x4_dual_generic;
static cost_pixel_nxn_multi_func satd_8x8_dual_generic;
static cost_pixel_nxn_multi_func satd_16x16_dual_generic;
static cost_pixel_nxn_multi_func satd_32x32_dual_generic;
static cost_pixel_nxn_multi_func satd_64x64_dual_generic;

#define SATD_DUAL_NXN(n, pixel_type) \
static void satd_ ## n ## x ## n ## _dual_generic( \
  const pred_buffer preds, const pixel_type * const orig, unsigned num_modes, unsigned *costs_out) \
{ \
  unsigned x, y; \
  unsigned sum = 0; \
  for (y = 0; y < (n); y += 8) { \
    unsigned row = y * (n); \
    for (x = 0; x < (n); x += 8) { \
      sum += satd_8x8_subblock_generic(&preds[0][row + x], (n), &orig[row + x], (n)); \
    } \
  } \
  costs_out[0] = sum >> (KVZ_BIT_DEPTH - 8); \
  \
  sum = 0; \
  for (y = 0; y < (n); y += 8) { \
    unsigned row = y * (n); \
    for (x = 0; x < (n); x += 8) { \
      sum += satd_8x8_subblock_generic(&preds[1][row + x], (n), &orig[row + x], (n)); \
    } \
  } \
  costs_out[1] = sum >> (KVZ_BIT_DEPTH - 8); \
}
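
/* As an illustration, SATD_DUAL_NXN(8, kvz_pixel) below expands to
 * satd_8x8_dual_generic(), which tiles the contiguous 8x8 block (stride n)
 * with 8x8 SATD sub-blocks for each of the two predictions in preds and
 * writes one bit-depth-normalized cost per prediction to costs_out[0..1]. */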

static void satd_4x4_dual_generic(const pred_buffer preds, const kvz_pixel * const orig, unsigned num_modes, unsigned *costs_out)
{
  costs_out[0] = satd_4x4_generic(orig, preds[0]);
  costs_out[1] = satd_4x4_generic(orig, preds[1]);
}

SATD_DUAL_NXN(8, kvz_pixel)
SATD_DUAL_NXN(16, kvz_pixel)
SATD_DUAL_NXN(32, kvz_pixel)
SATD_DUAL_NXN(64, kvz_pixel)

#define SATD_ANY_SIZE_MULTI_GENERIC(suffix, num_parallel_blocks) \
  static cost_pixel_any_size_multi_func satd_any_size_## suffix; \
  static void satd_any_size_ ## suffix ( \
      int width, int height, \
      const kvz_pixel **preds, \
      const int stride, \
      const kvz_pixel *orig, \
      const int orig_stride, \
      unsigned num_modes, \
      unsigned *costs_out, \
      int8_t *valid) \
  { \
    unsigned sums[num_parallel_blocks] = { 0 }; \
    const kvz_pixel *pred_ptrs[4] = { preds[0], preds[1], preds[2], preds[3] }; \
    const kvz_pixel *orig_ptr = orig; \
    costs_out[0] = 0; costs_out[1] = 0; costs_out[2] = 0; costs_out[3] = 0; \
    if (width % 8 != 0) { \
      /* Process the first column using 4x4 blocks. */ \
      for (int y = 0; y < height; y += 4) { \
        kvz_satd_4x4_subblock_ ## suffix(preds, stride, orig, orig_stride, sums); \
      } \
      orig_ptr += 4; \
      for (int blk = 0; blk < num_parallel_blocks; ++blk) { \
        pred_ptrs[blk] += 4; \
      } \
      width -= 4; \
    } \
    if (height % 8 != 0) { \
      /* Process the first row using 4x4 blocks. */ \
      for (int x = 0; x < width; x += 4) { \
        kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \
      } \
      orig_ptr += 4 * orig_stride; \
      for (int blk = 0; blk < num_parallel_blocks; ++blk) { \
        pred_ptrs[blk] += 4 * stride; \
      } \
      height -= 4; \
    } \
    /* The rest can now be processed with 8x8 blocks. */ \
    for (int y = 0; y < height; y += 8) { \
      orig_ptr = &orig[y * orig_stride]; \
      pred_ptrs[0] = &preds[0][y * stride]; \
      pred_ptrs[1] = &preds[1][y * stride]; \
      pred_ptrs[2] = &preds[2][y * stride]; \
      pred_ptrs[3] = &preds[3][y * stride]; \
      for (int x = 0; x < width; x += 8) { \
        satd_8x8_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \
        orig_ptr += 8; \
        pred_ptrs[0] += 8; \
        pred_ptrs[1] += 8; \
        pred_ptrs[2] += 8; \
        pred_ptrs[3] += 8; \
        costs_out[0] += sums[0]; \
        costs_out[1] += sums[1]; \
        costs_out[2] += sums[2]; \
        costs_out[3] += sums[3]; \
      } \
    } \
    for (int i = 0; i < num_parallel_blocks; ++i) { \
      costs_out[i] = costs_out[i] >> (KVZ_BIT_DEPTH - 8); \
    } \
    return; \
  }

SATD_ANY_SIZE_MULTI_GENERIC(quad_generic, 4)
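
/* A note on the edge handling above (an observation, not upstream
 * documentation): the macro relies on width and height being multiples of 4,
 * which holds for all HEVC block sizes. When a dimension is not a multiple
 * of 8, one 4-pixel-wide column (or 4-pixel-tall row) is costed with 4x4
 * SATD sub-blocks first, and the remaining area is covered with 8x8
 * sub-blocks. */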

// Function macro for defining SAD calculating functions
// for fixed size blocks.
#define SAD_NXN(n, pixel_type) \
static unsigned sad_ ##  n ## x ## n ## _generic( \
  const pixel_type * const block1, const pixel_type * const block2) \
{ \
  unsigned i; \
  unsigned sum = 0; \
  for (i = 0; i < (n) * (n); ++i) { \
    sum += abs(block1[i] - block2[i]); \
  } \
  return sum >> (KVZ_BIT_DEPTH - 8); \
}
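
/* As an illustration, SAD_NXN(4, kvz_pixel) below expands to
 *
 *   static unsigned sad_4x4_generic(const kvz_pixel * const block1,
 *                                   const kvz_pixel * const block2)
 *
 * which sums |block1[i] - block2[i]| over the 16 contiguous pixels and
 * normalizes the total back to an 8-bit scale with >> (KVZ_BIT_DEPTH - 8). */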

// Declare these functions to make sure the signature of the macro matches.
static cost_pixel_nxn_func sad_4x4_generic;
static cost_pixel_nxn_func sad_8x8_generic;
static cost_pixel_nxn_func sad_16x16_generic;
static cost_pixel_nxn_func sad_32x32_generic;
static cost_pixel_nxn_func sad_64x64_generic;

// These macros define sad_NxN functions for N = 4, 8, 16, 32, 64
// with function signatures matching cost_pixel_nxn_func.
// They are used through get_pixel_sad_func.
SAD_NXN(4, kvz_pixel)
SAD_NXN(8, kvz_pixel)
SAD_NXN(16, kvz_pixel)
SAD_NXN(32, kvz_pixel)
SAD_NXN(64, kvz_pixel)

// Declare these functions to make sure the signature of the macro matches.
static cost_pixel_nxn_multi_func sad_4x4_dual_generic;
static cost_pixel_nxn_multi_func sad_8x8_dual_generic;
static cost_pixel_nxn_multi_func sad_16x16_dual_generic;
static cost_pixel_nxn_multi_func sad_32x32_dual_generic;
static cost_pixel_nxn_multi_func sad_64x64_dual_generic;

// Function macro for defining SAD calculating functions
// for fixed size blocks.
#define SAD_DUAL_NXN(n, pixel_type) \
static void sad_ ##  n ## x ## n ## _dual_generic( \
  const pred_buffer preds, const pixel_type * const orig, unsigned num_modes, unsigned *costs_out) \
{ \
  unsigned i; \
  unsigned sum = 0; \
  for (i = 0; i < (n) * (n); ++i) { \
    sum += abs(preds[0][i] - orig[i]); \
  } \
  costs_out[0] = sum >> (KVZ_BIT_DEPTH - 8); \
  \
  sum = 0; \
  for (i = 0; i < (n) * (n); ++i) { \
    sum += abs(preds[1][i] - orig[i]); \
  } \
  costs_out[1] = sum >> (KVZ_BIT_DEPTH - 8); \
}

SAD_DUAL_NXN(4, kvz_pixel)
SAD_DUAL_NXN(8, kvz_pixel)
SAD_DUAL_NXN(16, kvz_pixel)
SAD_DUAL_NXN(32, kvz_pixel)
SAD_DUAL_NXN(64, kvz_pixel)

static unsigned pixels_calc_ssd_generic(const kvz_pixel *const ref, const kvz_pixel *const rec,
                 const int ref_stride, const int rec_stride,
                 const int width)
{
  int ssd = 0;
  int y, x;

  for (y = 0; y < width; ++y) {
    for (x = 0; x < width; ++x) {
      int diff = ref[x + y * ref_stride] - rec[x + y * rec_stride];
      ssd += diff * diff;
    }
  }

  return ssd >> (2 * (KVZ_BIT_DEPTH - 8));
}
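
/* The final shift compensates for bit depth: each squared difference grows
 * by a factor of 2^(2 * (KVZ_BIT_DEPTH - 8)) relative to 8-bit content, so
 * at 10-bit depth the accumulated SSD is scaled back with >> 4 to stay
 * comparable across depths. */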

static void inter_recon_bipred_generic(const int hi_prec_luma_rec0,
  const int hi_prec_luma_rec1,
  const int hi_prec_chroma_rec0,
  const int hi_prec_chroma_rec1,
  int32_t height,
  int32_t width,
  int32_t ypos,
  int32_t xpos,
  const hi_prec_buf_t *high_precision_rec0,
  const hi_prec_buf_t *high_precision_rec1,
  lcu_t *lcu,
  kvz_pixel *temp_lcu_y,
  kvz_pixel *temp_lcu_u,
  kvz_pixel *temp_lcu_v,
  bool predict_luma,
  bool predict_chroma)
{
  int shift = 15 - KVZ_BIT_DEPTH;
  int offset = 1 << (shift - 1);

  int y_in_lcu;
  int x_in_lcu;

  // After reconstruction, merge the predictors by taking an average of each pixel
  for (int temp_y = 0; temp_y < height; ++temp_y) {
    for (int temp_x = 0; temp_x < width; ++temp_x) {
      y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
      x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));

      if (predict_luma) {
        int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
        int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));

        lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);
      }

      if (predict_chroma && (temp_x < width >> 1 && temp_y < height >> 1)) {
        y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
        x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));

        int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
        int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
        lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift);

        int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
        int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
        lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift);
      }
    }
  }
}
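
/* A worked example of the averaging above, assuming KVZ_BIT_DEPTH == 8:
 * shift == 7 and offset == 64. A low-precision sample p is first brought to
 * 14-bit precision with p << 6, so for two low-precision inputs the result
 * is ((p0 << 6) + (p1 << 6) + 64) >> 7 == (p0 + p1 + 1) >> 1, i.e. the
 * rounded average. High-precision inputs are already at 14-bit precision
 * and skip the scaling. */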

static optimized_sad_func_ptr_t get_optimized_sad_generic(int32_t width)
{
  return NULL;
}

/**
 * \brief Calculate SAD for a block whose reference lies vertically outside
 *        the frame, so a single reference row is repeated for every row.
 *
 * \param pic_data      Starting point of the picture block.
 * \param ref_data      Pointer to the reference row that is repeated.
 * \param block_width   Width of the region for which SAD is calculated.
 * \param block_height  Height of the region for which SAD is calculated.
 * \param pic_stride    Stride of the picture pixel array.
 *
 * \returns Sum of Absolute Differences
 */
static uint32_t ver_sad_generic(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
                                int block_width, int block_height, unsigned pic_stride)
{
  int x, y;
  unsigned sad = 0;

  for (y = 0; y < block_height; ++y) {
    for (x = 0; x < block_width; ++x) {
      sad += abs(pic_data[y * pic_stride + x] - ref_data[x]);
    }
  }

  return sad;
}

/**
 * \brief Calculate SAD for a block whose reference lies horizontally outside
 *        the frame, so one reference pixel per row is repeated across the row.
 *
 * \param pic_data      Starting point of the picture block.
 * \param ref_data      Pointer to the reference column that is repeated.
 * \param block_width   Width of the region for which SAD is calculated.
 * \param block_height  Height of the region for which SAD is calculated.
 * \param pic_stride    Stride of the picture pixel array.
 * \param ref_stride    Stride of the reference pixel array.
 *
 * \returns Sum of Absolute Differences
 */
static unsigned hor_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
                        int block_width, int block_height, unsigned pic_stride, unsigned ref_stride)
{
  int x, y;
  unsigned sad = 0;

  for (y = 0; y < block_height; ++y) {
    for (x = 0; x < block_width; ++x) {
      sad += abs(pic_data[y * pic_stride + x] - ref_data[y * ref_stride]);
    }
  }

  return sad;
}


static uint32_t hor_sad_generic(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
                                int32_t width, int32_t height, uint32_t pic_stride,
                                uint32_t ref_stride, uint32_t left, uint32_t right)
{
  uint32_t result = 0;
  if (left) {
    result += hor_sad    (pic_data, ref_data + left, left,
                          height, pic_stride, ref_stride);

    result += kvz_reg_sad(pic_data + left, ref_data + left, width - left,
                          height, pic_stride, ref_stride);
  } else if (right) {
    result += kvz_reg_sad(pic_data, ref_data, width - right,
                          height, pic_stride, ref_stride);

    result += hor_sad    (pic_data + width - right,
                          ref_data + width - right - 1,
                          right, height, pic_stride, ref_stride);
  } else {
    result += kvz_reg_sad(pic_data, ref_data, width,
                          height, pic_stride, ref_stride);
  }
  return result;
}
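
/* The three-way split above handles reference blocks that extend past the
 * frame edge: the `left` leftmost (or `right` rightmost) reference columns
 * do not exist, so hor_sad() costs those picture columns against the nearest
 * valid reference column (one replicated pixel per row), while the in-frame
 * part is costed with the ordinary kvz_reg_sad(). */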

// Calculate pixel value variance. Takes in arrays of kvz_pixel
static double pixel_var_generic(const kvz_pixel *arr, const uint32_t len)
{
  double var = 0;
  double arr_mean = 0;

  // Calculate array mean
  int i = 0;
  double sum = 0;

  for (; i < len; ++i) {
    sum += arr[i];
  }
  arr_mean = sum / (double)len;

  // Calculate array variance
  for (i = 0; i < len; ++i) {
    double tmp = (double)arr[i] - arr_mean;
    var += tmp * tmp;
  }

  var /= len;

  return var;
}
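
/* Note: this computes the population variance, var = (1/len) * sum of
 * (arr[i] - mean)^2, dividing by len rather than len - 1, since the block
 * is treated as the whole population rather than a sample. */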

int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth)
{
  bool success = true;

  success &= kvz_strategyselector_register(opaque, "reg_sad", "generic", 0, &reg_sad_generic);

  success &= kvz_strategyselector_register(opaque, "sad_4x4", "generic", 0, &sad_4x4_generic);
  success &= kvz_strategyselector_register(opaque, "sad_8x8", "generic", 0, &sad_8x8_generic);
  success &= kvz_strategyselector_register(opaque, "sad_16x16", "generic", 0, &sad_16x16_generic);
  success &= kvz_strategyselector_register(opaque, "sad_32x32", "generic", 0, &sad_32x32_generic);
  success &= kvz_strategyselector_register(opaque, "sad_64x64", "generic", 0, &sad_64x64_generic);

  success &= kvz_strategyselector_register(opaque, "satd_4x4", "generic", 0, &satd_4x4_generic);
  success &= kvz_strategyselector_register(opaque, "satd_8x8", "generic", 0, &satd_8x8_generic);
  success &= kvz_strategyselector_register(opaque, "satd_16x16", "generic", 0, &satd_16x16_generic);
  success &= kvz_strategyselector_register(opaque, "satd_32x32", "generic", 0, &satd_32x32_generic);
  success &= kvz_strategyselector_register(opaque, "satd_64x64", "generic", 0, &satd_64x64_generic);

  success &= kvz_strategyselector_register(opaque, "sad_4x4_dual", "generic", 0, &sad_4x4_dual_generic);
  success &= kvz_strategyselector_register(opaque, "sad_8x8_dual", "generic", 0, &sad_8x8_dual_generic);
  success &= kvz_strategyselector_register(opaque, "sad_16x16_dual", "generic", 0, &sad_16x16_dual_generic);
  success &= kvz_strategyselector_register(opaque, "sad_32x32_dual", "generic", 0, &sad_32x32_dual_generic);
  success &= kvz_strategyselector_register(opaque, "sad_64x64_dual", "generic", 0, &sad_64x64_dual_generic);

  success &= kvz_strategyselector_register(opaque, "satd_4x4_dual", "generic", 0, &satd_4x4_dual_generic);
  success &= kvz_strategyselector_register(opaque, "satd_8x8_dual", "generic", 0, &satd_8x8_dual_generic);
  success &= kvz_strategyselector_register(opaque, "satd_16x16_dual", "generic", 0, &satd_16x16_dual_generic);
  success &= kvz_strategyselector_register(opaque, "satd_32x32_dual", "generic", 0, &satd_32x32_dual_generic);
  success &= kvz_strategyselector_register(opaque, "satd_64x64_dual", "generic", 0, &satd_64x64_dual_generic);
  success &= kvz_strategyselector_register(opaque, "satd_any_size", "generic", 0, &satd_any_size_generic);
  success &= kvz_strategyselector_register(opaque, "satd_any_size_quad", "generic", 0, &satd_any_size_quad_generic);

  success &= kvz_strategyselector_register(opaque, "pixels_calc_ssd", "generic", 0, &pixels_calc_ssd_generic);
  success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "generic", 0, &inter_recon_bipred_generic);

  success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "generic", 0, &get_optimized_sad_generic);
  success &= kvz_strategyselector_register(opaque, "ver_sad", "generic", 0, &ver_sad_generic);
  success &= kvz_strategyselector_register(opaque, "hor_sad", "generic", 0, &hor_sad_generic);

  success &= kvz_strategyselector_register(opaque, "pixel_var", "generic", 0, &pixel_var_generic);

  return success;
}