1 /*****************************************************************************
2 * This file is part of Kvazaar HEVC encoder.
3 *
4 * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without modification,
8 * are permitted provided that the following conditions are met:
9 *
10 * * Redistributions of source code must retain the above copyright notice, this
11 * list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright notice, this
14 * list of conditions and the following disclaimer in the documentation and/or
15 * other materials provided with the distribution.
16 *
17 * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
18 * contributors may be used to endorse or promote products derived from
19 * this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
28 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 ****************************************************************************/
32
33 #include "strategies/generic/ipol-generic.h"
34
35 #include <stdio.h>
36 #include <string.h>
37
38 #include "encoder.h"
39 #include "strategies/generic/picture-generic.h"
40 #include "strategies/strategies-ipol.h"
41 #include "strategyselector.h"
42
43 extern int8_t kvz_g_luma_filter[4][8];
44 extern int8_t kvz_g_chroma_filter[8][4];
45
/**
 * \brief Apply an 8-tap FIR filter to eight consecutive pixels.
 *
 * \param filter  Eight filter coefficients.
 * \param data    First of the eight consecutive pixels to filter.
 * \return Sum of coefficient * pixel products, without any rounding or shift.
 */
int32_t kvz_eight_tap_filter_hor_generic(int8_t *filter, kvz_pixel *data)
{
  int32_t sum = 0;
  int tap = 0;

  while (tap < 8) {
    sum += filter[tap] * data[tap];
    ++tap;
  }

  return sum;
}
56
/**
 * \brief Apply an 8-tap FIR filter to eight consecutive 16-bit samples.
 *
 * \param filter  Eight filter coefficients.
 * \param data    First of the eight consecutive samples to filter.
 * \return Sum of coefficient * sample products, without any rounding or shift.
 */
int32_t kvz_eight_tap_filter_hor_16bit_generic(int8_t *filter, int16_t *data)
{
  int32_t sum = 0;
  int tap = 0;

  do {
    const int coeff = filter[tap];
    const int sample = data[tap];
    sum += coeff * sample;
    ++tap;
  } while (tap < 8);

  return sum;
}
67
/**
 * \brief Apply an 8-tap FIR filter to eight pixels spaced `stride` elements apart.
 *
 * \param filter  Eight filter coefficients.
 * \param data    First pixel to filter; tap i reads data[stride * i].
 * \param stride  Distance in elements between successive taps.
 * \return Sum of coefficient * pixel products, without any rounding or shift.
 */
int32_t kvz_eight_tap_filter_ver_generic(int8_t *filter, kvz_pixel *data, int16_t stride)
{
  int32_t sum = 0;
  int offset = 0; // Equals stride * tap at the top of every iteration.

  for (int tap = 0; tap < 8; ++tap) {
    sum += filter[tap] * data[offset];
    offset += stride;
  }

  return sum;
}
78
/**
 * \brief Apply an 8-tap FIR filter to eight 16-bit samples spaced `stride` elements apart.
 *
 * \param filter  Eight filter coefficients.
 * \param data    First sample to filter; tap i reads data[stride * i].
 * \param stride  Distance in elements between successive taps.
 * \return Sum of coefficient * sample products, without any rounding or shift.
 */
int32_t kvz_eight_tap_filter_ver_16bit_generic(int8_t *filter, int16_t *data, int16_t stride)
{
  int32_t sum = 0;
  int offset = 0; // Equals stride * tap at the top of every iteration.

  for (int tap = 0; tap < 8; ++tap) {
    sum += filter[tap] * data[offset];
    offset += stride;
  }

  return sum;
}
89
/**
 * \brief Apply a 4-tap FIR filter to four consecutive pixels.
 *
 * \param filter  Four filter coefficients.
 * \param data    First of the four consecutive pixels to filter.
 * \return Sum of coefficient * pixel products, without any rounding or shift.
 */
int32_t kvz_four_tap_filter_hor_generic(int8_t *filter, kvz_pixel *data)
{
  int32_t sum = 0;
  int tap = 0;

  while (tap < 4) {
    sum += filter[tap] * data[tap];
    ++tap;
  }

  return sum;
}
100
/**
 * \brief Apply a 4-tap FIR filter to four consecutive 16-bit samples.
 *
 * \param filter  Four filter coefficients.
 * \param data    First of the four consecutive samples to filter.
 * \return Sum of coefficient * sample products, without any rounding or shift.
 */
int32_t kvz_four_tap_filter_hor_16bit_generic(int8_t *filter, int16_t *data)
{
  int32_t sum = 0;
  int tap = 0;

  do {
    const int coeff = filter[tap];
    const int sample = data[tap];
    sum += coeff * sample;
    ++tap;
  } while (tap < 4);

  return sum;
}
111
/**
 * \brief Apply a 4-tap FIR filter to four pixels spaced `stride` elements apart.
 *
 * \param filter  Four filter coefficients.
 * \param data    First pixel to filter; tap i reads data[stride * i].
 * \param stride  Distance in elements between successive taps.
 * \return Sum of coefficient * pixel products, without any rounding or shift.
 */
int32_t kvz_four_tap_filter_ver_generic(int8_t *filter, kvz_pixel *data, int16_t stride)
{
  int32_t sum = 0;
  int offset = 0; // Equals stride * tap at the top of every iteration.

  for (int tap = 0; tap < 4; ++tap) {
    sum += filter[tap] * data[offset];
    offset += stride;
  }

  return sum;
}
122
/**
 * \brief Apply a 4-tap FIR filter to four 16-bit samples spaced `stride` elements apart.
 *
 * \param filter  Four filter coefficients.
 * \param data    First sample to filter; tap i reads data[stride * i].
 * \param stride  Distance in elements between successive taps.
 * \return Sum of coefficient * sample products, without any rounding or shift.
 */
int32_t kvz_four_tap_filter_ver_16bit_generic(int8_t *filter, int16_t *data, int16_t stride)
{
  int32_t sum = 0;
  int offset = 0; // Equals stride * tap at the top of every iteration.

  for (int tap = 0; tap < 4; ++tap) {
    sum += filter[tap] * data[offset];
    offset += stride;
  }

  return sum;
}
133
/**
 * \brief Interpolate a luma block at a fractional position and write clipped pixels.
 *
 * The 8-tap filters are picked from kvz_g_luma_filter by the two low bits of
 * each mv component. The block is filtered horizontally into a temporary
 * 16-bit buffer and then vertically; the result is rounded, shifted and
 * clipped before being stored in dst.
 *
 * \param encoder     Not used by this implementation.
 * \param src         Top-left sample of the block. Samples starting
 *                    KVZ_LUMA_FILTER_OFFSET rows above and columns to the left
 *                    of it are read.
 * \param src_stride  Stride of src in samples.
 * \param width       Block width.
 * \param height      Block height.
 * \param dst         Output pixels.
 * \param dst_stride  Stride of dst in samples.
 * \param hor_flag    Not used by this implementation.
 * \param ver_flag    Not used by this implementation.
 * \param mv          Motion vector; only mv[0] & 3 and mv[1] & 3 are used.
 */
void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
{
  //TODO: horizontal and vertical only filtering

  // Shifts applied after the horizontal and the vertical pass, respectively.
  const int16_t shift1 = KVZ_BIT_DEPTH - 8;
  const int32_t shift2 = 6;

  // Rounding offset and shift used when converting back to pixels.
  const int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH;
  const int32_t wp_offset1 = 1 << (wp_shift1 - 1);

  // Filters for the fractional x and y positions.
  int8_t *const fir_x = kvz_g_luma_filter[mv[0] & 3];
  int8_t *const fir_y = kvz_g_luma_filter[mv[1] & 3];

  // Horizontally filtered rows, including the extra rows the vertical taps need.
  int16_t rows[KVZ_EXT_BLOCK_W_LUMA][LCU_WIDTH];
  const int16_t rows_stride = LCU_WIDTH;

  // Horizontal pass
  const int32_t ext_height = height + KVZ_EXT_PADDING_LUMA;
  for (int32_t row = 0; row < ext_height; ++row) {
    const int src_row = src_stride * (row - KVZ_LUMA_FILTER_OFFSET);
    for (int32_t col = 0; col < width; ++col) {
      const int src_pos = src_row + (col - KVZ_LUMA_FILTER_OFFSET);
      rows[row][col] = kvz_eight_tap_filter_hor_generic(fir_x, &src[src_pos]) >> shift1;
    }
  }

  // Vertical pass, rounding and clipping
  for (int32_t row = 0; row < height; ++row) {
    const int dst_row = row * dst_stride;
    for (int32_t col = 0; col < width; ++col) {
      const int32_t im = kvz_eight_tap_filter_ver_16bit_generic(fir_y, &rows[row][col], rows_stride) >> shift2;
      dst[dst_row + col] = kvz_fast_clip_32bit_to_pixel((im + wp_offset1) >> wp_shift1);
    }
  }
}
170
/**
 * \brief Interpolate a luma block at a fractional position, keeping 16-bit precision.
 *
 * Works like a two-pass separable filter: horizontal 8-tap filtering into a
 * temporary buffer, then vertical 8-tap filtering. The vertically filtered
 * value is shifted by 6 and stored as is; no rounding offset or clipping to
 * pixel range is applied.
 *
 * \param encoder     Not used by this implementation.
 * \param src         Top-left sample of the block. Samples starting
 *                    KVZ_LUMA_FILTER_OFFSET rows above and columns to the left
 *                    of it are read.
 * \param src_stride  Stride of src in samples.
 * \param width       Block width.
 * \param height      Block height.
 * \param dst         Output 16-bit samples.
 * \param dst_stride  Stride of dst in samples.
 * \param hor_flag    Not used by this implementation.
 * \param ver_flag    Not used by this implementation.
 * \param mv          Motion vector; only mv[0] & 3 and mv[1] & 3 are used.
 */
void kvz_sample_quarterpel_luma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
{
  //TODO: horizontal and vertical only filtering

  // Shifts applied after the horizontal and the vertical pass, respectively.
  const int16_t shift1 = KVZ_BIT_DEPTH - 8;
  const int32_t shift2 = 6;

  // Filters for the fractional x and y positions.
  int8_t *const fir_x = kvz_g_luma_filter[mv[0] & 3];
  int8_t *const fir_y = kvz_g_luma_filter[mv[1] & 3];

  // Horizontally filtered rows, including the extra rows the vertical taps need.
  int16_t rows[KVZ_EXT_BLOCK_W_LUMA][LCU_WIDTH];
  const int16_t rows_stride = LCU_WIDTH;

  // Horizontal pass
  const int32_t ext_height = height + KVZ_EXT_PADDING_LUMA;
  for (int32_t row = 0; row < ext_height; ++row) {
    const int src_row = src_stride * (row - KVZ_LUMA_FILTER_OFFSET);
    for (int32_t col = 0; col < width; ++col) {
      const int src_pos = src_row + (col - KVZ_LUMA_FILTER_OFFSET);
      rows[row][col] = kvz_eight_tap_filter_hor_generic(fir_x, &src[src_pos]) >> shift1;
    }
  }

  // Vertical pass
  for (int32_t row = 0; row < height; ++row) {
    const int dst_row = row * dst_stride;
    for (int32_t col = 0; col < width; ++col) {
      dst[dst_row + col] = kvz_eight_tap_filter_ver_16bit_generic(fir_y, &rows[row][col], rows_stride) >> shift2;
    }
  }
}
203
/**
 * \brief Filter the four half-pel luma blocks left, right, above and below a position.
 *
 * First fills the horizontal intermediate buffers: hor_intermediate[0] and
 * hor_first_cols[0] with kvz_g_luma_filter[0], hor_intermediate[1] and
 * hor_first_cols[2] with kvz_g_luma_filter[2]. Then writes the output blocks:
 * filtered[0] = left, filtered[1] = right, filtered[2] = top,
 * filtered[3] = bottom.
 *
 * \param encoder           Not used by this implementation.
 * \param src               Source samples; rows and columns before the pointed
 *                          sample are read (up to KVZ_LUMA_FILTER_OFFSET away).
 * \param src_stride        Stride of src in samples.
 * \param width             Block width.
 * \param height            Block height.
 * \param filtered          Output blocks, stored with a stride of LCU_WIDTH.
 * \param hor_intermediate  Horizontally filtered planes, stride LCU_WIDTH.
 * \param fme_level         When not above 1, row 0 of the half-pel plane and
 *                          half-pel first column is left unwritten.
 * \param hor_first_cols    Horizontally filtered values for the column one
 *                          sample left of the planes, stored contiguously.
 * \param hpel_off_x        Not used by this implementation.
 * \param hpel_off_y        Not used by this implementation.
 */
void kvz_filter_hpel_blocks_hor_ver_luma_generic(const encoder_control_t * encoder,
  kvz_pixel *src,
  int16_t src_stride,
  int width,
  int height,
  kvz_pixel filtered[4][LCU_LUMA_SIZE],
  int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
  int8_t fme_level,
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t hpel_off_x, int8_t hpel_off_y)
{
  // Interpolation filter shift
  const int16_t shift1 = KVZ_BIT_DEPTH - 8;

  // Weighted prediction offset and shift
  const int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH;
  const int32_t wp_offset1 = 1 << (wp_shift1 - 1);

  int8_t *const fir_full = kvz_g_luma_filter[0];
  int8_t *const fir_half = kvz_g_luma_filter[2];

  const int16_t out_stride = LCU_WIDTH;
  const int16_t im_stride = LCU_WIDTH;
  // Offset of the intermediate row that lines up with output row 0.
  const int32_t im_block_start = (KVZ_LUMA_FILTER_OFFSET + 1) * im_stride;

  int16_t *const im_full = hor_intermediate[0];
  int16_t *const im_half = hor_intermediate[1];
  int16_t *const col_full = hor_first_cols[0];
  int16_t *const col_half = hor_first_cols[2];

  kvz_pixel *const out_left = filtered[0];
  kvz_pixel *const out_right = filtered[1];
  kvz_pixel *const out_top = filtered[2];
  kvz_pixel *const out_bottom = filtered[3];

  const int ext_rows = height + KVZ_EXT_PADDING_LUMA + 1;

  // Horizontally filtered samples from the top row are
  // not needed unless samples for diagonal positions are filtered later.
  const int half_first_row = fme_level > 1 ? 0 : 1;

  // HORIZONTAL STEP
  // Integer pixels: the plane plus the extra first column.
  for (int row = 0; row < ext_rows; ++row) {
    const int src_row = src_stride * (row - KVZ_LUMA_FILTER_OFFSET);

    col_full[row] = kvz_eight_tap_filter_hor_generic(fir_full, &src[src_row - KVZ_LUMA_FILTER_OFFSET]) >> shift1;

    for (int col = 0; col < width; ++col) {
      const int src_pos = src_row + col - KVZ_LUMA_FILTER_OFFSET + 1;
      im_full[row * im_stride + col] = kvz_eight_tap_filter_hor_generic(fir_full, &src[src_pos]) >> shift1;
    }
  }

  // Half pixels: the plane plus the extra first column.
  for (int row = half_first_row; row < ext_rows; ++row) {
    const int src_row = src_stride * (row - KVZ_LUMA_FILTER_OFFSET);

    col_half[row] = kvz_eight_tap_filter_hor_generic(fir_half, &src[src_row - KVZ_LUMA_FILTER_OFFSET]) >> shift1;

    for (int col = 0; col < width; ++col) {
      const int src_pos = src_row + col - KVZ_LUMA_FILTER_OFFSET + 1;
      im_half[row * im_stride + col] = kvz_eight_tap_filter_hor_generic(fir_half, &src[src_pos]) >> shift1;
    }
  }

  // VERTICAL STEP

  for (int row = 0; row < height; ++row) {
    const int out_row = row * out_stride;

    // Right: only the horizontal filter is needed.
    for (int col = 0; col < width; ++col) {
      const int im_pos = im_block_start + row * im_stride + col;
      out_right[out_row + col] = kvz_fast_clip_16bit_to_pixel((im_half[im_pos] + wp_offset1) >> wp_shift1);
    }

    // Left: the first sample comes from the extra column, the rest is the
    // right block shifted by one sample.
    out_left[out_row] = kvz_fast_clip_16bit_to_pixel((col_half[row + KVZ_LUMA_FILTER_OFFSET + 1] + wp_offset1) >> wp_shift1);
    for (int col = 1; col < width; ++col) {
      out_left[out_row + col] = out_right[out_row + col - 1];
    }
  }

  // Top: only the vertical filter is needed, applied straight to src.
  for (int row = 0; row < height; ++row) {
    const int src_row = src_stride * (row - KVZ_LUMA_FILTER_OFFSET);
    for (int col = 0; col < width; ++col) {
      const int16_t im = kvz_eight_tap_filter_ver_generic(fir_half, &src[src_row + col + 1], src_stride) >> shift1;
      out_top[row * out_stride + col] = kvz_fast_clip_16bit_to_pixel((im + wp_offset1) >> wp_shift1);
    }
  }

  // Bottom: rows above the last one are the top block shifted up by one row.
  int last_row = 0;
  for (; last_row < height - 1; ++last_row) {
    for (int col = 0; col < width; ++col) {
      out_bottom[last_row * out_stride + col] = out_top[(last_row + 1) * out_stride + col];
    }
  }

  // The last bottom row has no counterpart in the top block; filter it from src.
  const int last_src_row = src_stride * ((last_row - KVZ_LUMA_FILTER_OFFSET) + 1);
  for (int col = 0; col < width; ++col) {
    const int16_t im = kvz_eight_tap_filter_ver_generic(fir_half, &src[last_src_row + col + 1], src_stride) >> shift1;
    out_bottom[last_row * out_stride + col] = kvz_fast_clip_16bit_to_pixel((im + wp_offset1) >> wp_shift1);
  }
}
318
/**
 * \brief Filter the four diagonal half-pel luma blocks around a position.
 *
 * Applies the vertical half-pel filter (kvz_g_luma_filter[2]) to the
 * horizontally filtered half-pel samples in hor_intermediate[1] and
 * hor_first_cols[2]; these are the buffers that
 * kvz_filter_hpel_blocks_hor_ver_luma_generic fills. Output blocks:
 * filtered[0] = top-left, filtered[1] = top-right,
 * filtered[2] = bottom-left, filtered[3] = bottom-right.
 *
 * Only the top-right block is filtered in full. The other blocks reuse its
 * samples shifted by one column and/or one row, and only the samples with no
 * shifted counterpart are filtered separately.
 *
 * \param encoder           Not used by this implementation.
 * \param src               Not used by this implementation.
 * \param src_stride        Not used by this implementation.
 * \param width             Block width.
 * \param height            Block height.
 * \param filtered          Output blocks, stored with a stride of LCU_WIDTH.
 * \param hor_intermediate  Horizontally filtered planes, stride LCU_WIDTH;
 *                          plane 1 is read.
 * \param fme_level         Not used by this implementation.
 * \param hor_first_cols    Horizontally filtered first columns; column 2 is read.
 * \param hpel_off_x        Not used by this implementation.
 * \param hpel_off_y        Not used by this implementation.
 */
void kvz_filter_hpel_blocks_diag_luma_generic(const encoder_control_t * encoder,
  kvz_pixel *src,
  int16_t src_stride,
  int width,
  int height,
  kvz_pixel filtered[4][LCU_LUMA_SIZE],
  int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
  int8_t fme_level,
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t hpel_off_x, int8_t hpel_off_y)
{
  int x, y;

  // Interpolation filter shifts
  int32_t shift2 = 6;

  // Weighted prediction offset and shift
  int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH;
  int32_t wp_offset1 = 1 << (wp_shift1 - 1);

  int8_t *fir2 = kvz_g_luma_filter[2];

  int16_t dst_stride = LCU_WIDTH;
  int16_t hor_stride = LCU_WIDTH;

  // Horizontal positions
  int16_t *col_pos2 = hor_first_cols[2];

  // VERTICAL STEP

  // Top-right
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(fir2, &hor_intermediate[1][y * hor_stride + x], hor_stride) >> shift2;
      sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1);
      filtered[1][y * dst_stride + x] = sample;
    }
  }

  // Top-left
  // Copy what can be copied from top-right filtered values. Filter the first column from the column array.
  // The column array is contiguous, so the horizontal helper walks it vertically.
  // (A former extra pass that pre-filled filtered[0] with unfiltered column
  // values was removed: every sample it wrote is overwritten by this loop.)
  for (y = 0; y < height; ++y) {
    x = 0;
    int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(fir2, &col_pos2[y]) >> shift2;
    sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1);
    filtered[0][y * dst_stride + x] = sample;
    for (x = 1; x < width; ++x) filtered[0][y * dst_stride + x] = filtered[1][y * dst_stride + x - 1];
  }

  // Bottom-right
  // Copy what can be copied from top-right filtered values. Filter the last row.
  for (y = 0; y < height - 1; ++y) {
    for (x = 0; x < width; ++x) filtered[3][y * dst_stride + x] = filtered[1][(y + 1) * dst_stride + x];
  }

  // y is left pointing at the last output row by the loop above.
  for (x = 0; x < width; ++x) {
    int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(fir2, &hor_intermediate[1][(y + 1) * hor_stride + x], hor_stride) >> shift2;
    sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1);
    filtered[3][y * dst_stride + x] = sample;
  }

  // Bottom-left
  // Copy what can be copied from the top-left filtered values.
  // Copy what can be copied from the bottom-right filtered values.
  // Finally filter the last pixel from the column array.
  for (y = 0; y < height - 1; ++y) {
    for (x = 0; x < width; ++x) filtered[2][y * dst_stride + x] = filtered[0][(y + 1) * dst_stride + x];
  }
  // y again indexes the last output row.
  for (x = 1; x < width; ++x) filtered[2][y * dst_stride + x] = filtered[3][y * dst_stride + x - 1];
  x = 0;
  int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(fir2, &col_pos2[(y + 1)]) >> shift2;
  sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1);
  filtered[2][y * dst_stride + x] = sample;
}
399
/**
 * \brief Filter the four quarter-pel luma blocks left, right, above and below a position.
 *
 * First fills hor_intermediate[3] / hor_first_cols[1] ("left") and
 * hor_intermediate[4] / hor_first_cols[3] ("right") with horizontally
 * quarter-pel filtered samples. Then runs four vertical passes that write
 * filtered[0] = left, filtered[1] = right, filtered[2] = top and
 * filtered[3] = bottom. The top and bottom passes read the planes and first
 * columns with index 0 (hpel_off_x == 0) or 1 / 2 (hpel_off_x != 0), which
 * this function does not write.
 *
 * \param encoder           Not used by this implementation.
 * \param src               Source samples; rows and columns before the pointed
 *                          sample are read (up to KVZ_LUMA_FILTER_OFFSET away).
 * \param src_stride        Stride of src in samples.
 * \param width             Block width.
 * \param height            Block height.
 * \param filtered          Output blocks, stored with a stride of LCU_WIDTH.
 * \param hor_intermediate  Horizontally filtered planes, stride LCU_WIDTH.
 * \param fme_level         Not used by this implementation.
 * \param hor_first_cols    Horizontally filtered first columns, contiguous.
 * \param hpel_off_x        Horizontal offset of the half-pel position the
 *                          search is centred on; its sign and zero-ness pick
 *                          filters and sample offsets.
 * \param hpel_off_y        Vertical counterpart of hpel_off_x.
 */
void kvz_filter_qpel_blocks_hor_ver_luma_generic(const encoder_control_t * encoder,
  kvz_pixel *src,
  int16_t src_stride,
  int width,
  int height,
  kvz_pixel filtered[4][LCU_LUMA_SIZE],
  int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
  int8_t fme_level,
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t hpel_off_x, int8_t hpel_off_y)
{
  // Interpolation filter shifts
  const int16_t shift1 = KVZ_BIT_DEPTH - 8;
  const int32_t shift2 = 6;

  // Weighted prediction offset and shift
  const int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH;
  const int32_t wp_offset1 = 1 << (wp_shift1 - 1);

  int8_t *const fir0 = kvz_g_luma_filter[0];
  int8_t *const fir1 = kvz_g_luma_filter[1];
  int8_t *const fir2 = kvz_g_luma_filter[2];
  int8_t *const fir3 = kvz_g_luma_filter[3];

  const int16_t dst_stride = LCU_WIDTH;
  const int16_t hor_stride = LCU_WIDTH;

  const int hor_is_offset = hpel_off_x != 0;
  const int ver_is_offset = hpel_off_y != 0;

  // Quarter-pel planes and first columns written by the horizontal step.
  int16_t *const hor_pos_l = hor_intermediate[3];
  int16_t *const hor_pos_r = hor_intermediate[4];
  int16_t *const col_pos_l = hor_first_cols[1];
  int16_t *const col_pos_r = hor_first_cols[3];
  int8_t *const hor_fir_l = hor_is_offset ? fir1 : fir3;
  int8_t *const hor_fir_r = hor_is_offset ? fir3 : fir1;

  // Integer or half-pel plane and first column, calculated before this call.
  int16_t *const hor_hpel_pos = hor_is_offset ? hor_intermediate[1] : hor_intermediate[0];
  int16_t *const col_pos_hor = hor_is_offset ? hor_first_cols[2] : hor_first_cols[0];

  // HORIZONTAL STEP
  // Both quarter-pel planes and their first columns read the same source taps.
  const int ext_rows = height + KVZ_EXT_PADDING_LUMA + 1;
  for (int row = 0; row < ext_rows; ++row) {
    const int src_row = src_stride * (row - KVZ_LUMA_FILTER_OFFSET);
    kvz_pixel *const col_taps = &src[src_row - KVZ_LUMA_FILTER_OFFSET];

    // First column, written to contiguous memory
    col_pos_l[row] = kvz_eight_tap_filter_hor_generic(hor_fir_l, col_taps) >> shift1;
    col_pos_r[row] = kvz_eight_tap_filter_hor_generic(hor_fir_r, col_taps) >> shift1;

    for (int col = 0; col < width; ++col) {
      kvz_pixel *const taps = &src[src_row + col - KVZ_LUMA_FILTER_OFFSET + 1];
      hor_pos_l[row * hor_stride + col] = kvz_eight_tap_filter_hor_generic(hor_fir_l, taps) >> shift1;
      hor_pos_r[row * hor_stride + col] = kvz_eight_tap_filter_hor_generic(hor_fir_r, taps) >> shift1;
    }
  }

  // VERTICAL STEP
  int8_t *const ver_fir_lr = ver_is_offset ? fir2 : fir0;
  int8_t *const ver_fir_t = ver_is_offset ? fir1 : fir3;
  int8_t *const ver_fir_b = ver_is_offset ? fir3 : fir1;

  // 1 when output column 0 has to be taken from the first-column array,
  // which also shifts the remaining columns one sample to the left in the plane.
  const int lead_l = hpel_off_x < 1;
  const int lead_r = hpel_off_x < 0;
  const int lead_tb = hpel_off_x < 0;

  // Row offsets into the intermediate data.
  const int row_off_lr = hpel_off_y >= 0;
  const int row_off_t = hpel_off_y >= 1;
  const int row_off_b = hpel_off_y >= 0;

  // One entry per output block, in output order.
  const struct {
    kvz_pixel *out;       // Output block
    int8_t *fir;          // Vertical filter
    int16_t *first_col;   // Contiguous first column
    int16_t *plane;       // Horizontally filtered plane
    int col_lead;         // 1 if column 0 comes from first_col
    int row_off;          // Row offset into first_col and plane
  } passes[4] = {
    { filtered[0], ver_fir_lr, col_pos_l,   hor_pos_l,    lead_l,  row_off_lr }, // Left QPEL
    { filtered[1], ver_fir_lr, col_pos_r,   hor_pos_r,    lead_r,  row_off_lr }, // Right QPEL
    { filtered[2], ver_fir_t,  col_pos_hor, hor_hpel_pos, lead_tb, row_off_t  }, // Top QPEL
    { filtered[3], ver_fir_b,  col_pos_hor, hor_hpel_pos, lead_tb, row_off_b  }, // Bottom QPEL
  };

  for (int pass = 0; pass < 4; ++pass) {
    kvz_pixel *const out = passes[pass].out;
    int8_t *const fir = passes[pass].fir;
    const int col_lead = passes[pass].col_lead;

    for (int row = 0; row < height; ++row) {
      const int out_row = row * dst_stride;
      const int im_row = row + passes[pass].row_off;

      if (col_lead) {
        // The first column is contiguous, so the horizontal helper walks it vertically.
        const int16_t im = kvz_eight_tap_filter_hor_16bit_generic(fir, &passes[pass].first_col[im_row]) >> shift2;
        out[out_row] = kvz_fast_clip_16bit_to_pixel((im + wp_offset1) >> wp_shift1);
      }

      for (int col = col_lead; col < width; ++col) {
        const int im_pos = im_row * hor_stride + (col - col_lead);
        const int16_t im = kvz_eight_tap_filter_ver_16bit_generic(fir, &passes[pass].plane[im_pos], hor_stride) >> shift2;
        out[out_row + col] = kvz_fast_clip_16bit_to_pixel((im + wp_offset1) >> wp_shift1);
      }
    }
  }
}
559
/**
 * \brief Filter the four diagonal quarter-pel luma blocks around a position.
 *
 * Applies a vertical quarter-pel filter to the horizontally quarter-pel
 * filtered samples in hor_intermediate[3] / hor_first_cols[1] ("left") and
 * hor_intermediate[4] / hor_first_cols[3] ("right"); these are the buffers
 * that kvz_filter_qpel_blocks_hor_ver_luma_generic fills. Output blocks:
 * filtered[0] = top-left, filtered[1] = top-right,
 * filtered[2] = bottom-left, filtered[3] = bottom-right.
 *
 * \param encoder           Not used by this implementation.
 * \param src               Not used by this implementation.
 * \param src_stride        Not used by this implementation.
 * \param width             Block width.
 * \param height            Block height.
 * \param filtered          Output blocks, stored with a stride of LCU_WIDTH.
 * \param hor_intermediate  Horizontally filtered planes, stride LCU_WIDTH;
 *                          planes 3 and 4 are read.
 * \param fme_level         Not used by this implementation.
 * \param hor_first_cols    Horizontally filtered first columns; columns 1 and
 *                          3 are read.
 * \param hpel_off_x        Horizontal offset of the half-pel position the
 *                          search is centred on; its sign picks sample offsets.
 * \param hpel_off_y        Vertical counterpart; also picks the vertical filters.
 */
void kvz_filter_qpel_blocks_diag_luma_generic(const encoder_control_t * encoder,
  kvz_pixel *src,
  int16_t src_stride,
  int width,
  int height,
  kvz_pixel filtered[4][LCU_LUMA_SIZE],
  int16_t hor_intermediate[5][KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD],
  int8_t fme_level,
  int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1],
  int8_t hpel_off_x, int8_t hpel_off_y)
{
  // Interpolation filter shifts
  const int32_t shift2 = 6;

  // Weighted prediction offset and shift
  const int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH;
  const int32_t wp_offset1 = 1 << (wp_shift1 - 1);

  int8_t *const fir1 = kvz_g_luma_filter[1];
  int8_t *const fir3 = kvz_g_luma_filter[3];

  const int16_t dst_stride = LCU_WIDTH;
  const int16_t hor_stride = LCU_WIDTH;

  // Vertical filters for the top and bottom blocks.
  int8_t *const ver_fir_t = hpel_off_y != 0 ? fir1 : fir3;
  int8_t *const ver_fir_b = hpel_off_y != 0 ? fir3 : fir1;

  // 1 when output column 0 has to be taken from the first-column array,
  // which also shifts the remaining columns one sample to the left in the plane.
  const int lead_l = hpel_off_x < 1;
  const int lead_r = hpel_off_x < 0;

  // Row offsets into the intermediate data for the top and bottom blocks.
  const int row_off_t = hpel_off_y >= 1;
  const int row_off_b = hpel_off_y >= 0;

  // Quadrants in output order: 0 top-left, 1 top-right, 2 bottom-left, 3 bottom-right.
  for (int quadrant = 0; quadrant < 4; ++quadrant) {
    const int is_right = quadrant & 1;
    const int is_bottom = quadrant >> 1;

    int16_t *const plane = is_right ? hor_intermediate[4] : hor_intermediate[3];
    int16_t *const first_col = is_right ? hor_first_cols[3] : hor_first_cols[1];
    int8_t *const fir = is_bottom ? ver_fir_b : ver_fir_t;
    const int col_lead = is_right ? lead_r : lead_l;
    const int row_off = is_bottom ? row_off_b : row_off_t;
    kvz_pixel *const out = filtered[quadrant];

    for (int row = 0; row < height; ++row) {
      const int out_row = row * dst_stride;
      const int im_row = row + row_off;

      if (col_lead) {
        // The first column is contiguous, so the horizontal helper walks it vertically.
        const int16_t im = kvz_eight_tap_filter_hor_16bit_generic(fir, &first_col[im_row]) >> shift2;
        out[out_row] = kvz_fast_clip_16bit_to_pixel((im + wp_offset1) >> wp_shift1);
      }

      for (int col = col_lead; col < width; ++col) {
        const int im_pos = im_row * hor_stride + (col - col_lead);
        const int16_t im = kvz_eight_tap_filter_ver_16bit_generic(fir, &plane[im_pos], hor_stride) >> shift2;
        out[out_row + col] = kvz_fast_clip_16bit_to_pixel((im + wp_offset1) >> wp_shift1);
      }
    }
  }
}
671
/**
 * \brief Interpolate a chroma block at a fractional position and write clipped pixels.
 *
 * The 4-tap filters are picked from kvz_g_chroma_filter by the three low bits
 * of each mv component. The block is filtered horizontally into a temporary
 * 16-bit buffer and then vertically; the result is rounded, shifted and
 * clipped before being stored in dst.
 *
 * \param encoder     Not used by this implementation.
 * \param src         Top-left sample of the block. Samples starting
 *                    KVZ_CHROMA_FILTER_OFFSET rows above and columns to the
 *                    left of it are read.
 * \param src_stride  Stride of src in samples.
 * \param width       Block width.
 * \param height      Block height.
 * \param dst         Output pixels.
 * \param dst_stride  Stride of dst in samples.
 * \param hor_flag    Not used by this implementation.
 * \param ver_flag    Not used by this implementation.
 * \param mv          Motion vector; only mv[0] & 7 and mv[1] & 7 are used.
 */
void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
{
  //TODO: horizontal and vertical only filtering

  // Shifts applied after the horizontal and the vertical pass, respectively.
  const int16_t shift1 = KVZ_BIT_DEPTH - 8;
  const int32_t shift2 = 6;

  // Rounding offset and shift used when converting back to pixels.
  const int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH;
  const int32_t wp_offset1 = 1 << (wp_shift1 - 1);

  // Filters for the fractional x and y positions.
  int8_t *const fir_x = kvz_g_chroma_filter[mv[0] & 7];
  int8_t *const fir_y = kvz_g_chroma_filter[mv[1] & 7];

  // Horizontally filtered rows, including the extra rows the vertical taps need.
  int16_t rows[KVZ_EXT_BLOCK_W_CHROMA][LCU_WIDTH_C];
  const int16_t rows_stride = LCU_WIDTH_C;

  // Horizontal pass
  const int32_t ext_height = height + KVZ_EXT_PADDING_CHROMA;
  for (int32_t row = 0; row < ext_height; ++row) {
    const int src_row = src_stride * (row - KVZ_CHROMA_FILTER_OFFSET);
    for (int32_t col = 0; col < width; ++col) {
      const int src_pos = src_row + (col - KVZ_CHROMA_FILTER_OFFSET);
      rows[row][col] = kvz_four_tap_filter_hor_generic(fir_x, &src[src_pos]) >> shift1;
    }
  }

  // Vertical pass, rounding and clipping
  for (int32_t row = 0; row < height; ++row) {
    const int dst_row = row * dst_stride;
    for (int32_t col = 0; col < width; ++col) {
      const int32_t im = kvz_four_tap_filter_ver_16bit_generic(fir_y, &rows[row][col], rows_stride) >> shift2;
      dst[dst_row + col] = kvz_fast_clip_32bit_to_pixel((im + wp_offset1) >> wp_shift1);
    }
  }
}
708
/**
 * \brief Interpolate a chroma block at a fractional position, keeping 16-bit precision.
 *
 * Horizontal 4-tap filtering into a temporary buffer is followed by vertical
 * 4-tap filtering. The vertically filtered value is shifted by 6 and stored
 * as is; no rounding offset or clipping to pixel range is applied.
 *
 * \param encoder     Not used by this implementation.
 * \param src         Top-left sample of the block. Samples starting
 *                    KVZ_CHROMA_FILTER_OFFSET rows above and columns to the
 *                    left of it are read.
 * \param src_stride  Stride of src in samples.
 * \param width       Block width.
 * \param height      Block height.
 * \param dst         Output 16-bit samples.
 * \param dst_stride  Stride of dst in samples.
 * \param hor_flag    Not used by this implementation.
 * \param ver_flag    Not used by this implementation.
 * \param mv          Motion vector; only mv[0] & 7 and mv[1] & 7 are used.
 */
void kvz_sample_octpel_chroma_hi_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2])
{
  //TODO: horizontal and vertical only filtering

  // Shifts applied after the horizontal and the vertical pass, respectively.
  const int16_t shift1 = KVZ_BIT_DEPTH - 8;
  const int32_t shift2 = 6;

  // Filters for the fractional x and y positions.
  int8_t *const fir_x = kvz_g_chroma_filter[mv[0] & 7];
  int8_t *const fir_y = kvz_g_chroma_filter[mv[1] & 7];

  // Horizontally filtered rows, including the extra rows the vertical taps need.
  int16_t rows[KVZ_EXT_BLOCK_W_CHROMA][LCU_WIDTH_C];
  const int16_t rows_stride = LCU_WIDTH_C;

  // Horizontal pass
  const int32_t ext_height = height + KVZ_EXT_PADDING_CHROMA;
  for (int32_t row = 0; row < ext_height; ++row) {
    const int src_row = src_stride * (row - KVZ_CHROMA_FILTER_OFFSET);
    for (int32_t col = 0; col < width; ++col) {
      const int src_pos = src_row + (col - KVZ_CHROMA_FILTER_OFFSET);
      rows[row][col] = kvz_four_tap_filter_hor_generic(fir_x, &src[src_pos]) >> shift1;
    }
  }

  // Vertical pass
  for (int32_t row = 0; row < height; ++row) {
    const int dst_row = row * dst_stride;
    for (int32_t col = 0; col < width; ++col) {
      dst[dst_row + col] = kvz_four_tap_filter_ver_16bit_generic(fir_y, &rows[row][col], rows_stride) >> shift2;
    }
  }
}
741
742
/**
 * \brief Provide a block together with its padding, extrapolating past frame edges if needed.
 *
 * If the block with all of its padding (SIMD padding included) lies inside the
 * source, the outputs point straight into args->src and the stride is the
 * source stride. Otherwise the padded block is copied into args->buf: samples
 * outside the source are replaced by the nearest edge sample of the clamped
 * row, and the SIMD padding rows below are zeroed instead of being read.
 *
 * \param args  In: src, src_w, src_h, src_s, blk_x, blk_y, blk_w, blk_h,
 *              pad_l, pad_r, pad_t, pad_b, pad_b_simd and buf.
 *              Out: *ext (first sample of the padded block), *ext_origin
 *              (sample at the block's own top-left corner) and *ext_s (stride).
 */
void kvz_get_extended_block_generic(kvz_epol_args *args) {

  const int first_row = args->blk_y - args->pad_t;
  const int last_row = args->blk_y + args->blk_h + args->pad_b + args->pad_b_simd - 1;
  const bool rows_fit = (first_row >= 0) && (last_row < args->src_h);

  const int first_col = args->blk_x - args->pad_l;
  const int last_col = args->blk_x + args->blk_w + args->pad_r - 1;
  const bool cols_fit = (first_col >= 0) && (last_col < args->src_w);

  if (rows_fit && cols_fit) {
    // Everything is inside the source: hand out pointers into it.
    *args->ext = args->src + (args->blk_y - args->pad_t) * args->src_s + (args->blk_x - args->pad_l);
    *args->ext_origin = args->src + args->blk_y * args->src_s + args->blk_x;
    *args->ext_s = args->src_s;
    return;
  }

  *args->ext = args->buf;
  *args->ext_s = args->pad_l + args->blk_w + args->pad_r;
  *args->ext_origin = args->buf + args->pad_t * (*args->ext_s) + args->pad_l;

  // Number of samples per row taken from the left edge, from the source
  // itself, and from the right edge. Note that stride equals width here.
  const int n_left = CLIP(0, *args->ext_s, -first_col);
  const int n_right = CLIP(0, *args->ext_s, last_col - (args->src_w - 1));
  const int n_mid = CLIP(0, *args->ext_s, *args->ext_s - n_left - n_right);

  // For each row including real padding.
  // Don't read "don't care" values (SIMD padding). Zero them out.
  int y = -args->pad_t;
  while (y < args->blk_h + args->pad_b) {

    // Rows above or below the source repeat its first or last row.
    const int src_y = CLIP(0, args->src_h - 1, args->blk_y + y);
    kvz_pixel *edge_l = args->src + src_y * args->src_s;
    kvz_pixel *edge_r = args->src + src_y * args->src_s + args->src_w - 1;
    kvz_pixel *mid_src = args->src + src_y * args->src_s + MAX(first_col, 0);

    kvz_pixel *out_l = args->buf + (y + args->pad_t) * (*args->ext_s);
    kvz_pixel *out_m = out_l + n_left;
    kvz_pixel *out_r = out_m + n_mid;

    for (int i = 0; i < n_left; ++i) {
      out_l[i] = *edge_l;
    }
    for (int i = 0; i < n_mid; ++i) {
      out_m[i] = mid_src[i];
    }
    for (int i = 0; i < n_right; ++i) {
      out_r[i] = *edge_r;
    }

    ++y;
  }

  // y now indexes the first row after the real padding.
  for (int y_simd = 0; y_simd < args->pad_b_simd; ++y_simd) {
    kvz_pixel *zero_row = args->buf + (y + args->pad_t + y_simd) * (*args->ext_s);
    FILL_ARRAY(zero_row, 0, *args->ext_s);
  }
}
793
kvz_strategy_register_ipol_generic(void * opaque,uint8_t bitdepth)794 int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth)
795 {
796 bool success = true;
797
798 success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_hor_ver_luma", "generic", 0, &kvz_filter_hpel_blocks_hor_ver_luma_generic);
799 success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_diag_luma", "generic", 0, &kvz_filter_hpel_blocks_diag_luma_generic);
800 success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_hor_ver_luma", "generic", 0, &kvz_filter_qpel_blocks_hor_ver_luma_generic);
801 success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_diag_luma", "generic", 0, &kvz_filter_qpel_blocks_diag_luma_generic);
802 success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma", "generic", 0, &kvz_sample_quarterpel_luma_generic);
803 success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma", "generic", 0, &kvz_sample_octpel_chroma_generic);
804 success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma_hi", "generic", 0, &kvz_sample_quarterpel_luma_hi_generic);
805 success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma_hi", "generic", 0, &kvz_sample_octpel_chroma_hi_generic);
806 success &= kvz_strategyselector_register(opaque, "get_extended_block", "generic", 0, &kvz_get_extended_block_generic);
807
808 return success;
809 }
810