1 /*****************************************************************************
2  * This file is part of Kvazaar HEVC encoder.
3  *
4  * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without modification,
8  * are permitted provided that the following conditions are met:
9  *
10  * * Redistributions of source code must retain the above copyright notice, this
11  *   list of conditions and the following disclaimer.
12  *
13  * * Redistributions in binary form must reproduce the above copyright notice, this
14  *   list of conditions and the following disclaimer in the documentation and/or
15  *   other materials provided with the distribution.
16  *
17  * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
18  *   contributors may be used to endorse or promote products derived from
19  *   this software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
25  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26  * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
28  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
30  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  ****************************************************************************/
32 
33 #include "strategies/generic/ipol-generic.h"
34 
35 #include <stdio.h>
36 #include <string.h>
37 
38 #include "encoder.h"
39 #include "strategies/generic/picture-generic.h"
40 #include "strategies/strategies-ipol.h"
41 #include "strategyselector.h"
42 
43 extern int8_t kvz_g_luma_filter[4][8];
44 extern int8_t kvz_g_chroma_filter[8][4];
45 
/**
 * \brief Apply an 8-tap filter to eight consecutive pixels.
 *
 * \param filter  Array of eight filter coefficients.
 * \param data    Pointer to the first of eight consecutive pixels.
 * \return        Sum of filter[i] * data[i] over the eight taps, without
 *                any rounding or shifting.
 */
int32_t kvz_eight_tap_filter_hor_generic(int8_t *filter, kvz_pixel *data)
{
  int32_t sum = 0;
  int tap = 0;

  while (tap < 8) {
    sum += filter[tap] * data[tap];
    ++tap;
  }

  return sum;
}
56 
/**
 * \brief Apply an 8-tap filter to eight consecutive 16-bit samples.
 *
 * \param filter  Array of eight filter coefficients.
 * \param data    Pointer to the first of eight consecutive 16-bit samples.
 * \return        Sum of filter[i] * data[i] over the eight taps, without
 *                any rounding or shifting.
 */
int32_t kvz_eight_tap_filter_hor_16bit_generic(int8_t *filter, int16_t *data)
{
  int32_t sum = 0;
  int tap = 0;

  while (tap < 8) {
    sum += filter[tap] * data[tap];
    ++tap;
  }

  return sum;
}
67 
/**
 * \brief Apply an 8-tap filter to eight pixels spaced stride elements apart.
 *
 * \param filter  Array of eight filter coefficients.
 * \param data    Pointer to the first pixel; tap i reads data[stride * i].
 * \param stride  Distance between consecutive taps, in elements.
 * \return        Sum of filter[i] * data[stride * i] over the eight taps,
 *                without any rounding or shifting.
 */
int32_t kvz_eight_tap_filter_ver_generic(int8_t *filter, kvz_pixel *data, int16_t stride)
{
  int32_t sum = 0;
  int offset = 0;

  for (int tap = 0; tap < 8; ++tap) {
    sum += filter[tap] * data[offset];
    // Step to the sample read by the next tap.
    offset += stride;
  }

  return sum;
}
78 
/**
 * \brief Apply an 8-tap filter to eight 16-bit samples spaced stride
 *        elements apart.
 *
 * \param filter  Array of eight filter coefficients.
 * \param data    Pointer to the first sample; tap i reads data[stride * i].
 * \param stride  Distance between consecutive taps, in elements.
 * \return        Sum of filter[i] * data[stride * i] over the eight taps,
 *                without any rounding or shifting.
 */
int32_t kvz_eight_tap_filter_ver_16bit_generic(int8_t *filter, int16_t *data, int16_t stride)
{
  int32_t sum = 0;
  int offset = 0;

  for (int tap = 0; tap < 8; ++tap) {
    sum += filter[tap] * data[offset];
    // Step to the sample read by the next tap.
    offset += stride;
  }

  return sum;
}
89 
/**
 * \brief Apply a 4-tap filter to four consecutive pixels.
 *
 * \param filter  Array of four filter coefficients.
 * \param data    Pointer to the first of four consecutive pixels.
 * \return        Sum of filter[i] * data[i] over the four taps, without
 *                any rounding or shifting.
 */
int32_t kvz_four_tap_filter_hor_generic(int8_t *filter, kvz_pixel *data)
{
  int32_t sum = 0;
  int tap = 0;

  while (tap < 4) {
    sum += filter[tap] * data[tap];
    ++tap;
  }

  return sum;
}
100 
/**
 * \brief Apply a 4-tap filter to four consecutive 16-bit samples.
 *
 * \param filter  Array of four filter coefficients.
 * \param data    Pointer to the first of four consecutive 16-bit samples.
 * \return        Sum of filter[i] * data[i] over the four taps, without
 *                any rounding or shifting.
 */
int32_t kvz_four_tap_filter_hor_16bit_generic(int8_t *filter, int16_t *data)
{
  int32_t sum = 0;
  int tap = 0;

  while (tap < 4) {
    sum += filter[tap] * data[tap];
    ++tap;
  }

  return sum;
}
111 
/**
 * \brief Apply a 4-tap filter to four pixels spaced stride elements apart.
 *
 * \param filter  Array of four filter coefficients.
 * \param data    Pointer to the first pixel; tap i reads data[stride * i].
 * \param stride  Distance between consecutive taps, in elements.
 * \return        Sum of filter[i] * data[stride * i] over the four taps,
 *                without any rounding or shifting.
 */
int32_t kvz_four_tap_filter_ver_generic(int8_t *filter, kvz_pixel *data, int16_t stride)
{
  int32_t sum = 0;
  int offset = 0;

  for (int tap = 0; tap < 4; ++tap) {
    sum += filter[tap] * data[offset];
    // Step to the sample read by the next tap.
    offset += stride;
  }

  return sum;
}
122 
/**
 * \brief Apply a 4-tap filter to four 16-bit samples spaced stride
 *        elements apart.
 *
 * \param filter  Array of four filter coefficients.
 * \param data    Pointer to the first sample; tap i reads data[stride * i].
 * \param stride  Distance between consecutive taps, in elements.
 * \return        Sum of filter[i] * data[stride * i] over the four taps,
 *                without any rounding or shifting.
 */
int32_t kvz_four_tap_filter_ver_16bit_generic(int8_t *filter, int16_t *data, int16_t stride)
{
  int32_t sum = 0;
  int offset = 0;

  for (int tap = 0; tap < 4; ++tap) {
    sum += filter[tap] * data[offset];
    // Step to the sample read by the next tap.
    offset += stride;
  }

  return sum;
}
133 
/**
 * \brief Interpolate a luma block at the fractional position given by mv
 *        and write the result as pixels.
 *
 * The block is filtered horizontally into a 16-bit intermediate buffer and
 * then vertically. The source is read from KVZ_LUMA_FILTER_OFFSET rows and
 * columns before src up to the extent needed by the 8-tap filters.
 *
 * \param encoder     Not used by this implementation.
 * \param src         Pointer to the top-left sample of the source block.
 * \param src_stride  Stride of src in samples.
 * \param width       Width of the block.
 * \param height      Height of the block.
 * \param dst         Destination pixel buffer.
 * \param dst_stride  Stride of dst in pixels.
 * \param hor_flag    Not used by this implementation.
 * \param ver_flag    Not used by this implementation.
 * \param mv          Motion vector; the two lowest bits of each component
 *                    select the horizontal and vertical filters.
 */
void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
{
  //TODO: horizontal and vertical only filtering

  // Shifts applied after the horizontal and the vertical pass
  const int16_t shift1 = KVZ_BIT_DEPTH - 8;
  const int32_t shift2 = 6;

  // Weighted prediction offset and shift
  const int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH;
  const int32_t wp_offset1 = 1 << (wp_shift1 - 1);

  // The fractional part of each mv component picks the filter
  int8_t *const hor_filter = kvz_g_luma_filter[mv[0] & 3];
  int8_t *const ver_filter = kvz_g_luma_filter[mv[1] & 3];

  // Horizontally filtered rows, including the extra rows the vertical
  // filter reads.
  int16_t hor_filtered[KVZ_EXT_BLOCK_W_LUMA][LCU_WIDTH];
  const int16_t hor_stride = LCU_WIDTH;

  // Horizontal pass
  const int32_t ext_rows = height + KVZ_EXT_PADDING_LUMA;
  for (int32_t row = 0; row < ext_rows; ++row) {
    const int ypos = row - KVZ_LUMA_FILTER_OFFSET;
    for (int32_t col = 0; col < width; ++col) {
      const int xpos = col - KVZ_LUMA_FILTER_OFFSET;
      const int32_t acc = kvz_eight_tap_filter_hor_generic(hor_filter, &src[src_stride * ypos + xpos]);
      hor_filtered[row][col] = acc >> shift1;
    }
  }

  // Vertical pass, followed by rounding and clipping to pixel range
  for (int32_t row = 0; row < height; ++row) {
    for (int32_t col = 0; col < width; ++col) {
      const int32_t acc = kvz_eight_tap_filter_ver_16bit_generic(ver_filter, &hor_filtered[row][col], hor_stride) >> shift2;
      dst[row * dst_stride + col] = kvz_fast_clip_32bit_to_pixel((acc + wp_offset1) >> wp_shift1);
    }
  }
}
170 
/**
 * \brief Interpolate a luma block at the fractional position given by mv
 *        and write the result as 16-bit intermediate samples.
 *
 * Works like the pixel-output variant, but the vertically filtered value is
 * stored after the shift by 6 without the weighted prediction rounding and
 * without clipping.
 *
 * \param encoder     Not used by this implementation.
 * \param src         Pointer to the top-left sample of the source block.
 * \param src_stride  Stride of src in samples.
 * \param width       Width of the block.
 * \param height      Height of the block.
 * \param dst         Destination buffer of 16-bit samples.
 * \param dst_stride  Stride of dst in samples.
 * \param hor_flag    Not used by this implementation.
 * \param ver_flag    Not used by this implementation.
 * \param mv          Motion vector; the two lowest bits of each component
 *                    select the horizontal and vertical filters.
 */
void kvz_sample_quarterpel_luma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
{
  //TODO: horizontal and vertical only filtering

  // Shifts applied after the horizontal and the vertical pass
  const int16_t shift1 = KVZ_BIT_DEPTH - 8;
  const int32_t shift2 = 6;

  // The fractional part of each mv component picks the filter
  int8_t *const hor_filter = kvz_g_luma_filter[mv[0] & 3];
  int8_t *const ver_filter = kvz_g_luma_filter[mv[1] & 3];

  // Horizontally filtered rows, including the extra rows the vertical
  // filter reads.
  int16_t hor_filtered[KVZ_EXT_BLOCK_W_LUMA][LCU_WIDTH];
  const int16_t hor_stride = LCU_WIDTH;

  // Horizontal pass
  const int32_t ext_rows = height + KVZ_EXT_PADDING_LUMA;
  for (int32_t row = 0; row < ext_rows; ++row) {
    const int ypos = row - KVZ_LUMA_FILTER_OFFSET;
    for (int32_t col = 0; col < width; ++col) {
      const int xpos = col - KVZ_LUMA_FILTER_OFFSET;
      const int32_t acc = kvz_eight_tap_filter_hor_generic(hor_filter, &src[src_stride * ypos + xpos]);
      hor_filtered[row][col] = acc >> shift1;
    }
  }

  // Vertical pass; the result stays in the 16-bit intermediate domain
  for (int32_t row = 0; row < height; ++row) {
    for (int32_t col = 0; col < width; ++col) {
      const int32_t acc = kvz_eight_tap_filter_ver_16bit_generic(ver_filter, &hor_filtered[row][col], hor_stride);
      dst[row * dst_stride + col] = acc >> shift2;
    }
  }
}
203 
/**
 * \brief Filter the four half-pel luma blocks that lie directly left,
 *        right, above and below the integer position.
 *
 * The horizontal step stores integer (filter 0) and half-pel (filter 2)
 * rows in hor_intermediate[0] and hor_intermediate[1], and the matching
 * extra left columns in hor_first_cols[0] and hor_first_cols[2]. The
 * vertical step then writes the result blocks:
 *   filtered[0] left, filtered[1] right, filtered[2] top, filtered[3] bottom.
 *
 * \param encoder           Not used by this implementation.
 * \param src               Pointer into the source picture; samples around
 *                          it are read as required by the 8-tap filters.
 * \param src_stride        Stride of src in samples.
 * \param width             Width of the block.
 * \param height            Height of the block.
 * \param filtered          Output blocks, each with stride LCU_WIDTH.
 * \param hor_intermediate  Output buffers for horizontally filtered rows,
 *                          each with stride LCU_WIDTH.
 * \param fme_level         When greater than 1, the top row of the half-pel
 *                          intermediates is also computed.
 * \param hor_first_cols    Output buffers for the extra first columns.
 * \param hpel_off_x        Not used by this implementation.
 * \param hpel_off_y        Not used by this implementation.
 */
void kvz_filter_hpel_blocks_hor_ver_luma_generic(const encoder_control_t * encoder,
  kvz_pixel *src,
  int16_t src_stride,
  int width,
  int height,
  kvz_pixel filtered[4][LCU_LUMA_SIZE],
  int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
  int8_t fme_level,
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t hpel_off_x, int8_t hpel_off_y)
{
  // Interpolation filter shift for values filtered straight from pixels
  const int16_t shift1 = KVZ_BIT_DEPTH - 8;

  // Weighted prediction offset and shift
  const int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH;
  const int32_t wp_offset1 = 1 << (wp_shift1 - 1);

  int8_t *const fir_int = kvz_g_luma_filter[0];
  int8_t *const fir_half = kvz_g_luma_filter[2];

  const int16_t dst_stride = LCU_WIDTH;
  const int16_t hor_stride = LCU_WIDTH;
  const int32_t first_row_offset = (KVZ_LUMA_FILTER_OFFSET + 1) * hor_stride;

  int16_t *const int_first_col = hor_first_cols[0];
  int16_t *const half_first_col = hor_first_cols[2];

  // Number of rows in the extended block handled by the horizontal step
  const int ext_rows = height + KVZ_EXT_PADDING_LUMA + 1;

  // Horizontally filtered samples from the top row are
  // not needed unless samples for diagonal positions are filtered later.
  const int first_y = fme_level > 1 ? 0 : 1;

  // HORIZONTAL STEP
  // Integer pixels: the extra first column goes to contiguous memory,
  // the rest of each row to the intermediate buffer.
  for (int row = 0; row < ext_rows; ++row) {
    const int ypos = row - KVZ_LUMA_FILTER_OFFSET;

    int_first_col[row] = kvz_eight_tap_filter_hor_generic(fir_int, &src[src_stride * ypos + (0 - KVZ_LUMA_FILTER_OFFSET)]) >> shift1;

    for (int col = 0; col < width; ++col) {
      const int xpos = col - KVZ_LUMA_FILTER_OFFSET + 1;
      hor_intermediate[0][row * hor_stride + col] = kvz_eight_tap_filter_hor_generic(fir_int, &src[src_stride * ypos + xpos]) >> shift1;
    }
  }

  // Half pixels, starting from first_y
  for (int row = first_y; row < ext_rows; ++row) {
    const int ypos = row - KVZ_LUMA_FILTER_OFFSET;

    half_first_col[row] = kvz_eight_tap_filter_hor_generic(fir_half, &src[src_stride * ypos + (0 - KVZ_LUMA_FILTER_OFFSET)]) >> shift1;

    for (int col = 0; col < width; ++col) {
      const int xpos = col - KVZ_LUMA_FILTER_OFFSET + 1;
      hor_intermediate[1][row * hor_stride + col] = kvz_eight_tap_filter_hor_generic(fir_half, &src[src_stride * ypos + xpos]) >> shift1;
    }
  }

  // VERTICAL STEP

  // Right
  // Only horizontal filter: round and clip the half-pel intermediates.
  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < width; ++col) {
      const int16_t half = hor_intermediate[1][first_row_offset + row * hor_stride + col];
      filtered[1][row * dst_stride + col] = kvz_fast_clip_16bit_to_pixel((half + wp_offset1) >> wp_shift1);
    }
  }

  // Left
  // First column comes from the extra column, the rest is the right block
  // shifted by one sample.
  for (int row = 0; row < height; ++row) {
    const int16_t edge = half_first_col[row + KVZ_LUMA_FILTER_OFFSET + 1];
    filtered[0][row * dst_stride] = kvz_fast_clip_16bit_to_pixel((edge + wp_offset1) >> wp_shift1);
    for (int col = 1; col < width; ++col) {
      filtered[0][row * dst_stride + col] = filtered[1][row * dst_stride + col - 1];
    }
  }

  // Top
  // Only vertical filter, applied directly to the source pixels.
  for (int row = 0; row < height; ++row) {
    const int ypos = row - KVZ_LUMA_FILTER_OFFSET;
    for (int col = 0; col < width; ++col) {
      const int16_t raw = kvz_eight_tap_filter_ver_generic(fir_half, &src[src_stride * ypos + col + 1], src_stride) >> shift1;
      const int16_t pix = kvz_fast_clip_16bit_to_pixel((raw + wp_offset1) >> wp_shift1);
      filtered[2][row * dst_stride + col] = pix;
    }
  }

  // Bottom
  // Rows that overlap the top block are copied from it one row down.
  int last_row;
  for (last_row = 0; last_row < height - 1; ++last_row) {
    for (int col = 0; col < width; ++col) {
      filtered[3][last_row * dst_stride + col] = filtered[2][(last_row + 1) * dst_stride + col];
    }
  }

  // last_row is now the row the copy loop stopped at; filter it from
  // the source pixels one row further down.
  const int last_ypos = last_row - KVZ_LUMA_FILTER_OFFSET;
  for (int col = 0; col < width; ++col) {
    const int16_t raw = kvz_eight_tap_filter_ver_generic(fir_half, &src[src_stride * (last_ypos + 1) + col + 1], src_stride) >> shift1;
    const int16_t pix = kvz_fast_clip_16bit_to_pixel((raw + wp_offset1) >> wp_shift1);
    filtered[3][last_row * dst_stride + col] = pix;
  }
}
318 
/**
 * \brief Filter the four diagonal half-pel luma blocks.
 *
 * Reads the half-pel horizontal intermediates in hor_intermediate[1] and the
 * matching extra column in hor_first_cols[2] and filters them vertically
 * with the half-pel filter. Those buffers are not written here; presumably
 * they were filled by kvz_filter_hpel_blocks_hor_ver_luma_generic with
 * fme_level > 1 (verify against the caller).
 *
 * Result blocks, each with stride LCU_WIDTH:
 *   filtered[0] top-left, filtered[1] top-right,
 *   filtered[2] bottom-left, filtered[3] bottom-right.
 *
 * \param encoder           Not used by this implementation.
 * \param src               Not used by this implementation.
 * \param src_stride        Not used by this implementation.
 * \param width             Width of the block.
 * \param height            Height of the block.
 * \param filtered          Output blocks.
 * \param hor_intermediate  Horizontally filtered rows (input).
 * \param fme_level         Not used by this implementation.
 * \param hor_first_cols    Horizontally filtered extra first columns (input).
 * \param hpel_off_x        Not used by this implementation.
 * \param hpel_off_y        Not used by this implementation.
 */
void kvz_filter_hpel_blocks_diag_luma_generic(const encoder_control_t * encoder,
  kvz_pixel *src,
  int16_t src_stride,
  int width,
  int height,
  kvz_pixel filtered[4][LCU_LUMA_SIZE],
  int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
  int8_t fme_level,
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t hpel_off_x, int8_t hpel_off_y)
{
  int x, y;

  // Interpolation filter shift for the pass over 16-bit intermediates
  int32_t shift2 = 6;

  // Weighted prediction offset and shift
  int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH;
  int32_t wp_offset1 = 1 << (wp_shift1 - 1);

  int8_t *fir2 = kvz_g_luma_filter[2];

  int16_t dst_stride = LCU_WIDTH;
  int16_t hor_stride = LCU_WIDTH;

  // Horizontal positions
  int16_t *col_pos2 = hor_first_cols[2];

  // VERTICAL STEP

  // Top-right
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(fir2, &hor_intermediate[1][y * hor_stride + x], hor_stride) >> shift2;
      sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1);
      filtered[1][y * dst_stride + x] = sample;
    }
  }

  // Top-left
  // Copy what can be copied from top-right filtered values. Filter the first column from the column array.
  // (An earlier pass that pre-filled filtered[0] from unfiltered column
  // values was removed: every element it wrote is written again here.)
  for (y = 0; y < height; ++y) {
    x = 0;
    // The column array is contiguous, so the "horizontal" helper walks it
    // in the vertical direction of the picture.
    int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(fir2, &col_pos2[y]) >> shift2;
    sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1);
    filtered[0][y * dst_stride + x] = sample;
    for (x = 1; x < width; ++x) filtered[0][y * dst_stride + x] = filtered[1][y * dst_stride + x - 1];
  }

  // Bottom-right
  // Copy what can be copied from top-right filtered values. Filter the last row.
  for (y = 0; y < height - 1; ++y) {
    for (x = 0; x < width; ++x) filtered[3][y* dst_stride + x] = filtered[1][(y + 1) * dst_stride + x];
  }

  // y keeps the value the copy loop stopped at and is used as the index of
  // the last row from here on.
  for (x = 0; x < width; ++x) {
    int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(fir2, &hor_intermediate[1][(y + 1) * hor_stride + x], hor_stride) >> shift2;
    sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1);
    filtered[3][y * dst_stride + x] = sample;
  }

  // Bottom-left
  // Copy what can be copied from the top-left filtered values.
  // Copy what can be copied from the bottom-right filtered values.
  // Finally filter the last pixel from the column array.
  for (y = 0; y < height - 1; ++y) {
    for (x = 0; x < width; ++x) filtered[2][y * dst_stride + x] = filtered[0][(y + 1) * dst_stride + x];
  }
  for (x = 1; x < width; ++x) filtered[2][y * dst_stride + x] = filtered[3][y * dst_stride + x - 1];
  x = 0;
  int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(fir2, &col_pos2[(y + 1)]) >> shift2;
  sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1);
  filtered[2][y * dst_stride + x] = sample;
}
399 
/**
 * \brief Horizontally filter the extended block into an intermediate buffer
 *        and write its extra first column to contiguous memory.
 *
 * \param src         Pointer into the source picture.
 * \param src_stride  Stride of src in samples.
 * \param width       Width of the block.
 * \param height      Height of the block.
 * \param hor_fir     Eight-tap filter to apply.
 * \param hor_rows    Output rows with stride LCU_WIDTH.
 * \param first_col   Output for the column one sample left of hor_rows.
 */
static void qpel_filter_hor_pass(kvz_pixel *src,
  int16_t src_stride,
  int width,
  int height,
  int8_t *hor_fir,
  int16_t *hor_rows,
  int16_t *first_col)
{
  // Interpolation filter shift for values filtered straight from pixels
  const int16_t shift1 = KVZ_BIT_DEPTH - 8;
  const int16_t hor_stride = LCU_WIDTH;
  const int ext_rows = height + KVZ_EXT_PADDING_LUMA + 1;

  for (int y = 0; y < ext_rows; ++y) {
    const int ypos = y - KVZ_LUMA_FILTER_OFFSET;
    for (int x = 0; x < width; ++x) {
      const int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1;
      hor_rows[y * hor_stride + x] = kvz_eight_tap_filter_hor_generic(hor_fir, &src[src_stride * ypos + xpos]) >> shift1;
    }
  }

  // Write the first column in contiguous memory
  for (int y = 0; y < ext_rows; ++y) {
    const int ypos = y - KVZ_LUMA_FILTER_OFFSET;
    const int xpos = 0 - KVZ_LUMA_FILTER_OFFSET;
    first_col[y] = kvz_eight_tap_filter_hor_generic(hor_fir, &src[src_stride * ypos + xpos]) >> shift1;
  }
}

/**
 * \brief Vertically filter one output block from horizontal intermediates.
 *
 * \param dst            Output block with stride LCU_WIDTH.
 * \param width          Width of the block.
 * \param height         Height of the block.
 * \param ver_fir        Eight-tap filter to apply vertically.
 * \param hor_rows       Horizontally filtered rows with stride LCU_WIDTH.
 * \param first_col      Horizontally filtered extra first column.
 * \param use_first_col  1 to take output column 0 from first_col and read
 *                       the remaining columns from hor_rows shifted left by
 *                       one; 0 to read every column from hor_rows.
 * \param y_off          Row offset added when reading the intermediates.
 */
static void qpel_filter_ver_pass(kvz_pixel *dst,
  int width,
  int height,
  int8_t *ver_fir,
  int16_t *hor_rows,
  int16_t *first_col,
  int use_first_col,
  int y_off)
{
  // Interpolation filter shift for the pass over 16-bit intermediates
  const int32_t shift2 = 6;

  // Weighted prediction offset and shift
  const int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH;
  const int32_t wp_offset1 = 1 << (wp_shift1 - 1);

  const int16_t dst_stride = LCU_WIDTH;
  const int16_t hor_stride = LCU_WIDTH;

  for (int y = 0; y < height; ++y) {
    const int ypos = y + y_off;

    if (use_first_col) {
      // The column array is contiguous, so the "horizontal" helper walks
      // it in the vertical direction of the picture.
      int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(ver_fir, &first_col[ypos]) >> shift2;
      sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1);
      dst[y * dst_stride] = sample;
    }

    for (int x = use_first_col; x < width; ++x) {
      const int xpos = x - use_first_col;
      int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(ver_fir, &hor_rows[ypos * hor_stride + xpos], hor_stride) >> shift2;
      sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1);
      dst[y * dst_stride + x] = sample;
    }
  }
}

/**
 * \brief Filter the four quarter-pel luma blocks that lie directly left,
 *        right, above and below the position selected by hpel_off_x and
 *        hpel_off_y.
 *
 * The horizontal step writes the two quarter-pel horizontal intermediates
 * to hor_intermediate[3] and hor_intermediate[4] and their extra first
 * columns to hor_first_cols[1] and hor_first_cols[3]. The vertical step
 * also reads hor_intermediate[0] or [1] and hor_first_cols[0] or [2],
 * which are not written here and must already be filled.
 *
 * Result blocks, each with stride LCU_WIDTH:
 *   filtered[0] left, filtered[1] right, filtered[2] top, filtered[3] bottom.
 *
 * \param encoder           Not used by this implementation.
 * \param src               Pointer into the source picture.
 * \param src_stride        Stride of src in samples.
 * \param width             Width of the block.
 * \param height            Height of the block.
 * \param filtered          Output blocks.
 * \param hor_intermediate  Horizontally filtered rows (input and output).
 * \param fme_level         Not used by this implementation.
 * \param hor_first_cols    Extra first columns (input and output).
 * \param hpel_off_x        Horizontal offset of the centre position; only
 *                          its sign and whether it is zero are inspected.
 * \param hpel_off_y        Vertical offset of the centre position; only
 *                          its sign and whether it is zero are inspected.
 */
void kvz_filter_qpel_blocks_hor_ver_luma_generic(const encoder_control_t * encoder,
  kvz_pixel *src,
  int16_t src_stride,
  int width,
  int height,
  kvz_pixel filtered[4][LCU_LUMA_SIZE],
  int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
  int8_t fme_level,
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t hpel_off_x, int8_t hpel_off_y)
{
  int8_t *fir0 = kvz_g_luma_filter[0];
  int8_t *fir2 = kvz_g_luma_filter[2];
  int8_t *fir1 = kvz_g_luma_filter[1];
  int8_t *fir3 = kvz_g_luma_filter[3];

  // Horizontal positions. Positions 0 and 2 have already been calculated in filtered.
  int16_t *hor_pos0 = hor_intermediate[0];
  int16_t *hor_pos2 = hor_intermediate[1];
  int16_t *hor_pos_l = hor_intermediate[3];
  int16_t *hor_pos_r = hor_intermediate[4];
  int8_t *hor_fir_l  = hpel_off_x != 0 ? fir1 : fir3;
  int8_t *hor_fir_r  = hpel_off_x != 0 ? fir3 : fir1;
  int16_t *col_pos_l = hor_first_cols[1];
  int16_t *col_pos_r = hor_first_cols[3];

  int16_t *hor_hpel_pos = hpel_off_x != 0 ? hor_pos2 : hor_pos0;
  int16_t *col_pos_hor  = hpel_off_x != 0 ? hor_first_cols[2] : hor_first_cols[0];

  // Specify if integer pixels are filtered from left or/and top integer samples
  int off_x_fir_l = hpel_off_x < 1 ? 0 : 1;
  int off_x_fir_r = hpel_off_x < 0 ? 0 : 1;
  int off_y_fir_t = hpel_off_y < 1 ? 0 : 1;
  int off_y_fir_b = hpel_off_y < 0 ? 0 : 1;

  // HORIZONTAL STEP
  int sample_off_y = hpel_off_y < 0 ? 0 : 1;

  // Left QPEL
  qpel_filter_hor_pass(src, src_stride, width, height, hor_fir_l, hor_pos_l, col_pos_l);

  // Right QPEL
  qpel_filter_hor_pass(src, src_stride, width, height, hor_fir_r, hor_pos_r, col_pos_r);

  // VERTICAL STEP
  int8_t *ver_fir_l = hpel_off_y != 0 ? fir2 : fir0;
  int8_t *ver_fir_r = hpel_off_y != 0 ? fir2 : fir0;
  int8_t *ver_fir_t = hpel_off_y != 0 ? fir1 : fir3;
  int8_t *ver_fir_b = hpel_off_y != 0 ? fir3 : fir1;

  // Left QPEL (1/4 or 3/4 x positions)
  qpel_filter_ver_pass(filtered[0], width, height, ver_fir_l, hor_pos_l, col_pos_l, !off_x_fir_l, sample_off_y);

  // Right QPEL (3/4 or 1/4 x positions)
  qpel_filter_ver_pass(filtered[1], width, height, ver_fir_r, hor_pos_r, col_pos_r, !off_x_fir_r, sample_off_y);

  // Top QPEL (1/4 or 3/4 y positions)
  int sample_off_x = (hpel_off_x > -1 ? 1 : 0);
  qpel_filter_ver_pass(filtered[2], width, height, ver_fir_t, hor_hpel_pos, col_pos_hor, !sample_off_x, off_y_fir_t);

  // Bottom QPEL (3/4 or 1/4 y positions)
  qpel_filter_ver_pass(filtered[3], width, height, ver_fir_b, hor_hpel_pos, col_pos_hor, !sample_off_x, off_y_fir_b);
}

/**
 * \brief Filter the four diagonal quarter-pel luma blocks around the
 *        position selected by hpel_off_x and hpel_off_y.
 *
 * Only a vertical step is done here. It reads the quarter-pel horizontal
 * intermediates in hor_intermediate[3] and hor_intermediate[4] and the extra
 * first columns in hor_first_cols[1] and hor_first_cols[3]; presumably these
 * were filled by kvz_filter_qpel_blocks_hor_ver_luma_generic (verify against
 * the caller).
 *
 * Result blocks, each with stride LCU_WIDTH:
 *   filtered[0] top-left, filtered[1] top-right,
 *   filtered[2] bottom-left, filtered[3] bottom-right.
 *
 * \param encoder           Not used by this implementation.
 * \param src               Not used by this implementation.
 * \param src_stride        Not used by this implementation.
 * \param width             Width of the block.
 * \param height            Height of the block.
 * \param filtered          Output blocks.
 * \param hor_intermediate  Horizontally filtered rows (input).
 * \param fme_level         Not used by this implementation.
 * \param hor_first_cols    Horizontally filtered extra first columns (input).
 * \param hpel_off_x        Horizontal offset of the centre position; only
 *                          its sign is inspected.
 * \param hpel_off_y        Vertical offset of the centre position; only
 *                          its sign and whether it is zero are inspected.
 */
void kvz_filter_qpel_blocks_diag_luma_generic(const encoder_control_t * encoder,
  kvz_pixel *src,
  int16_t src_stride,
  int width,
  int height,
  kvz_pixel filtered[4][LCU_LUMA_SIZE],
  int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
  int8_t fme_level,
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t hpel_off_x, int8_t hpel_off_y)
{
  int8_t *fir1 = kvz_g_luma_filter[1];
  int8_t *fir3 = kvz_g_luma_filter[3];

  // Horizontal positions.
  int16_t *hor_pos_l = hor_intermediate[3];
  int16_t *hor_pos_r = hor_intermediate[4];

  int16_t *col_pos_l = hor_first_cols[1];
  int16_t *col_pos_r = hor_first_cols[3];

  // VERTICAL STEP
  int8_t *ver_fir_t = hpel_off_y != 0 ? fir1 : fir3;
  int8_t *ver_fir_b = hpel_off_y != 0 ? fir3 : fir1;

  // Specify if integer pixels are filtered from left or/and top integer samples
  int off_x_fir_l = hpel_off_x < 1 ? 0 : 1;
  int off_x_fir_r = hpel_off_x < 0 ? 0 : 1;
  int off_y_fir_t = hpel_off_y < 1 ? 0 : 1;
  int off_y_fir_b = hpel_off_y < 0 ? 0 : 1;

  // Top-left QPEL
  qpel_filter_ver_pass(filtered[0], width, height, ver_fir_t, hor_pos_l, col_pos_l, !off_x_fir_l, off_y_fir_t);

  // Top-right QPEL
  qpel_filter_ver_pass(filtered[1], width, height, ver_fir_t, hor_pos_r, col_pos_r, !off_x_fir_r, off_y_fir_t);

  // Bottom-left QPEL
  qpel_filter_ver_pass(filtered[2], width, height, ver_fir_b, hor_pos_l, col_pos_l, !off_x_fir_l, off_y_fir_b);

  // Bottom-right QPEL
  qpel_filter_ver_pass(filtered[3], width, height, ver_fir_b, hor_pos_r, col_pos_r, !off_x_fir_r, off_y_fir_b);
}
671 
/**
 * \brief Interpolate a chroma block at the fractional position given by mv
 *        and write the result as pixels.
 *
 * The block is filtered horizontally into a 16-bit intermediate buffer and
 * then vertically. The source is read from KVZ_CHROMA_FILTER_OFFSET rows and
 * columns before src up to the extent needed by the 4-tap filters.
 *
 * \param encoder     Not used by this implementation.
 * \param src         Pointer to the top-left sample of the source block.
 * \param src_stride  Stride of src in samples.
 * \param width       Width of the block.
 * \param height      Height of the block.
 * \param dst         Destination pixel buffer.
 * \param dst_stride  Stride of dst in pixels.
 * \param hor_flag    Not used by this implementation.
 * \param ver_flag    Not used by this implementation.
 * \param mv          Motion vector; the three lowest bits of each component
 *                    select the horizontal and vertical filters.
 */
void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height,kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
{
  //TODO: horizontal and vertical only filtering

  // Shifts applied after the horizontal and the vertical pass
  const int16_t shift1 = KVZ_BIT_DEPTH - 8;
  const int32_t shift2 = 6;

  // Weighted prediction offset and shift
  const int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH;
  const int32_t wp_offset1 = 1 << (wp_shift1 - 1);

  // The fractional part of each mv component picks the filter
  int8_t *const hor_filter = kvz_g_chroma_filter[mv[0] & 7];
  int8_t *const ver_filter = kvz_g_chroma_filter[mv[1] & 7];

  // Horizontally filtered rows, including the extra rows the vertical
  // filter reads.
  int16_t hor_filtered[KVZ_EXT_BLOCK_W_CHROMA][LCU_WIDTH_C];
  const int16_t hor_stride = LCU_WIDTH_C;

  // Horizontal pass
  const int32_t ext_rows = height + KVZ_EXT_PADDING_CHROMA;
  for (int32_t row = 0; row < ext_rows; ++row) {
    const int ypos = row - KVZ_CHROMA_FILTER_OFFSET;
    for (int32_t col = 0; col < width; ++col) {
      const int xpos = col - KVZ_CHROMA_FILTER_OFFSET;
      const int32_t acc = kvz_four_tap_filter_hor_generic(hor_filter, &src[src_stride * ypos + xpos]);
      hor_filtered[row][col] = acc >> shift1;
    }
  }

  // Vertical pass, followed by rounding and clipping to pixel range
  for (int32_t row = 0; row < height; ++row) {
    for (int32_t col = 0; col < width; ++col) {
      const int32_t acc = kvz_four_tap_filter_ver_16bit_generic(ver_filter, &hor_filtered[row][col], hor_stride) >> shift2;
      dst[row * dst_stride + col] = kvz_fast_clip_32bit_to_pixel((acc + wp_offset1) >> wp_shift1);
    }
  }
}
708 
/**
 * \brief Interpolate a chroma block at the fractional position given by mv
 *        and write the result as 16-bit intermediate samples.
 *
 * Works like the pixel-output variant, but the vertically filtered value is
 * stored after the shift by 6 without the weighted prediction rounding and
 * without clipping.
 *
 * \param encoder     Not used by this implementation.
 * \param src         Pointer to the top-left sample of the source block.
 * \param src_stride  Stride of src in samples.
 * \param width       Width of the block.
 * \param height      Height of the block.
 * \param dst         Destination buffer of 16-bit samples.
 * \param dst_stride  Stride of dst in samples.
 * \param hor_flag    Not used by this implementation.
 * \param ver_flag    Not used by this implementation.
 * \param mv          Motion vector; the three lowest bits of each component
 *                    select the horizontal and vertical filters.
 */
void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
{
  //TODO: horizontal and vertical only filtering

  // Shifts applied after the horizontal and the vertical pass
  const int16_t shift1 = KVZ_BIT_DEPTH - 8;
  const int32_t shift2 = 6;

  // The fractional part of each mv component picks the filter
  int8_t *const hor_filter = kvz_g_chroma_filter[mv[0] & 7];
  int8_t *const ver_filter = kvz_g_chroma_filter[mv[1] & 7];

  // Horizontally filtered rows, including the extra rows the vertical
  // filter reads.
  int16_t hor_filtered[KVZ_EXT_BLOCK_W_CHROMA][LCU_WIDTH_C];
  const int16_t hor_stride = LCU_WIDTH_C;

  // Horizontal pass
  const int32_t ext_rows = height + KVZ_EXT_PADDING_CHROMA;
  for (int32_t row = 0; row < ext_rows; ++row) {
    const int ypos = row - KVZ_CHROMA_FILTER_OFFSET;
    for (int32_t col = 0; col < width; ++col) {
      const int xpos = col - KVZ_CHROMA_FILTER_OFFSET;
      const int32_t acc = kvz_four_tap_filter_hor_generic(hor_filter, &src[src_stride * ypos + xpos]);
      hor_filtered[row][col] = acc >> shift1;
    }
  }

  // Vertical pass; the result stays in the 16-bit intermediate domain
  for (int32_t row = 0; row < height; ++row) {
    for (int32_t col = 0; col < width; ++col) {
      const int32_t acc = kvz_four_tap_filter_ver_16bit_generic(ver_filter, &hor_filtered[row][col], hor_stride);
      dst[row * dst_stride + col] = acc >> shift2;
    }
  }
}
741 
742 
/**
 * \brief Get a block of source samples together with its padding.
 *
 * If the block with all of its padding (SIMD rows included) lies inside the
 * source, the output pointers refer directly to the source samples and the
 * stride is the source stride.
 *
 * Otherwise the padded block is built in args->buf. Samples outside the
 * source are replaced with the nearest sample of the closest valid row, and
 * the pad_b_simd rows at the bottom are filled with zeros instead of being
 * read. In this case the stride equals the padded block width.
 *
 * \param args  Source description, block position, padding sizes, scratch
 *              buffer and the output locations ext, ext_origin and ext_s.
 */
void kvz_get_extended_block_generic(kvz_epol_args *args) {

  // Extent of the requested area in source coordinates. The vertical extent
  // also covers the SIMD rows.
  const int first_row = args->blk_y - args->pad_t;
  const int last_row = args->blk_y + args->blk_h + args->pad_b + args->pad_b_simd - 1;
  const int first_col = args->blk_x - args->pad_l;
  const int last_col = args->blk_x + args->blk_w + args->pad_r - 1;

  const bool rows_inside = (first_row >= 0) && (last_row < args->src_h);
  const bool cols_inside = (first_col >= 0) && (last_col < args->src_w);

  if (rows_inside && cols_inside) {
    // Everything is available in the source, no copying needed.
    *args->ext = args->src + (args->blk_y - args->pad_t) * args->src_s + (args->blk_x - args->pad_l);
    *args->ext_origin = args->src + args->blk_y * args->src_s + args->blk_x;
    *args->ext_s = args->src_s;
    return;
  }

  // Build the block in the caller's buffer. Rows are packed back to back,
  // so the stride is the same as the padded width.
  *args->ext = args->buf;
  *args->ext_s = args->pad_l + args->blk_w + args->pad_r;
  const int ext_w = *args->ext_s;
  *args->ext_origin = args->buf + args->pad_t * ext_w + args->pad_l;

  // Samples per row: replicated left edge, copied from source, replicated
  // right edge.
  const int n_left = CLIP(0, ext_w, -first_col);
  const int n_right = CLIP(0, ext_w, last_col - (args->src_w - 1));
  const int n_copy = CLIP(0, ext_w, ext_w - n_left - n_right);

  // Next row to be written in the buffer.
  kvz_pixel *out_row = args->buf;

  // Rows of the block and its real top and bottom padding.
  for (int dy = -args->pad_t; dy < args->blk_h + args->pad_b; ++dy) {

    // Rows above or below the source repeat the first or the last row.
    const int src_y = CLIP(0, args->src_h - 1, args->blk_y + dy);
    kvz_pixel *src_row = args->src + src_y * args->src_s;
    kvz_pixel *edge_l = src_row;
    kvz_pixel *edge_r = src_row + args->src_w - 1;
    kvz_pixel *copy_from = src_row + MAX(first_col, 0);

    kvz_pixel *out_l = out_row;
    kvz_pixel *out_m = out_l + n_left;
    kvz_pixel *out_r = out_m + n_copy;

    for (int i = 0; i < n_left; ++i) {
      out_l[i] = *edge_l;
    }
    for (int i = 0; i < n_copy; ++i) {
      out_m[i] = copy_from[i];
    }
    for (int i = 0; i < n_right; ++i) {
      out_r[i] = *edge_r;
    }

    out_row += ext_w;
  }

  // The SIMD rows are "don't care" values: zero them without touching the
  // source.
  for (int k = 0; k < args->pad_b_simd; ++k) {
    FILL_ARRAY(out_row, 0, ext_w);
    out_row += ext_w;
  }
}
793 
kvz_strategy_register_ipol_generic(void * opaque,uint8_t bitdepth)794 int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth)
795 {
796   bool success = true;
797 
798   success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_hor_ver_luma", "generic", 0, &kvz_filter_hpel_blocks_hor_ver_luma_generic);
799   success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_diag_luma", "generic", 0, &kvz_filter_hpel_blocks_diag_luma_generic);
800   success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_hor_ver_luma", "generic", 0, &kvz_filter_qpel_blocks_hor_ver_luma_generic);
801   success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_diag_luma", "generic", 0, &kvz_filter_qpel_blocks_diag_luma_generic);
802   success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma", "generic", 0, &kvz_sample_quarterpel_luma_generic);
803   success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma", "generic", 0, &kvz_sample_octpel_chroma_generic);
804   success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma_hi", "generic", 0, &kvz_sample_quarterpel_luma_hi_generic);
805   success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma_hi", "generic", 0, &kvz_sample_octpel_chroma_hi_generic);
806   success &= kvz_strategyselector_register(opaque, "get_extended_block", "generic", 0, &kvz_get_extended_block_generic);
807 
808   return success;
809 }
810