1 /*
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2018, Two Orioles, LLC
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 * list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 * this list of conditions and the following disclaimer in the documentation
14 * and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "config.h"
29
30 #include <stdlib.h>
31 #include <string.h>
32
33 #include "common/attributes.h"
34 #include "common/intops.h"
35
36 #include "src/mc.h"
37 #include "src/tables.h"
38
// Precision of the int16_t intermediates used between filter passes, and the
// bias subtracted from "prep" output so that it fits in int16_t.
#if BITDEPTH != 8
// 4 for 10 bits/component, 2 for 12 bits/component
#define get_intermediate_bits(bitdepth_max) (14 - bitdepth_from_max(bitdepth_max))
// Output in interval [-20588, 36956] (10-bit), [-20602, 36983] (12-bit)
// Subtract a bias to ensure the output fits in int16_t
#define PREP_BIAS 8192
#else
// 8 bits/component: the argument is ignored, always 4 intermediate bits
#define get_intermediate_bits(bitdepth_max) 4
// Output in interval [-5132, 9212], fits in int16_t as is
#define PREP_BIAS 0
#endif
50
51 static NOINLINE void
put_c(pixel * dst,const ptrdiff_t dst_stride,const pixel * src,const ptrdiff_t src_stride,const int w,int h)52 put_c(pixel *dst, const ptrdiff_t dst_stride,
53 const pixel *src, const ptrdiff_t src_stride, const int w, int h)
54 {
55 do {
56 pixel_copy(dst, src, w);
57
58 dst += dst_stride;
59 src += src_stride;
60 } while (--h);
61 }
62
63 static NOINLINE void
prep_c(int16_t * tmp,const pixel * src,const ptrdiff_t src_stride,const int w,int h HIGHBD_DECL_SUFFIX)64 prep_c(int16_t *tmp, const pixel *src, const ptrdiff_t src_stride,
65 const int w, int h HIGHBD_DECL_SUFFIX)
66 {
67 const int intermediate_bits = get_intermediate_bits(bitdepth_max);
68 do {
69 for (int x = 0; x < w; x++)
70 tmp[x] = (src[x] << intermediate_bits) - PREP_BIAS;
71
72 tmp += w;
73 src += src_stride;
74 } while (--h);
75 }
76
/*
 * 8-tap filter helpers.
 *
 * FILTER_8TAP evaluates the 8-tap sum of F[0..7] over the samples
 * src[x - 3 * stride] .. src[x + 4 * stride]. Every macro argument is
 * parenthesized in the expansion so that expression arguments (e.g. `a + b`
 * or `i & 3`) index the intended sample; arguments are still evaluated more
 * than once, so they must be free of side effects.
 */
#define FILTER_8TAP(src, x, F, stride) \
    ((F)[0] * (src)[(x) + -3 * (stride)] + \
     (F)[1] * (src)[(x) + -2 * (stride)] + \
     (F)[2] * (src)[(x) + -1 * (stride)] + \
     (F)[3] * (src)[(x) + +0 * (stride)] + \
     (F)[4] * (src)[(x) + +1 * (stride)] + \
     (F)[5] * (src)[(x) + +2 * (stride)] + \
     (F)[6] * (src)[(x) + +3 * (stride)] + \
     (F)[7] * (src)[(x) + +4 * (stride)])

// Filter sum, rounded to nearest and shifted right by sh (no rounding term
// when sh is 0).
#define DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh) \
    ((FILTER_8TAP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh))

// Filter sum plus a caller-supplied rounding constant, shifted right by sh.
#define DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh) \
    ((FILTER_8TAP(src, x, F, stride) + (rnd)) >> (sh))

// Same as the two macros above, with the result passed through iclip_pixel().
#define DAV1D_FILTER_8TAP_CLIP(src, x, F, stride, sh) \
    iclip_pixel(DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh))

#define DAV1D_FILTER_8TAP_CLIP2(src, x, F, stride, rnd, sh) \
    iclip_pixel(DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh))

/*
 * Declares `fh`, the horizontal filter for subpel position mx, or NULL when
 * mx is 0. Reads `w` and `filter_type` from the enclosing scope: blocks with
 * w > 4 use table `filter_type & 3`, narrower ones use table
 * `3 + (filter_type & 1)`.
 */
#define GET_H_FILTER(mx) \
    const int8_t *const fh = !(mx) ? NULL : w > 4 ? \
        dav1d_mc_subpel_filters[filter_type & 3][(mx) - 1] : \
        dav1d_mc_subpel_filters[3 + (filter_type & 1)][(mx) - 1]

/*
 * Declares `fv`, the vertical filter for subpel position my, or NULL when
 * my is 0. Reads `h` and `filter_type` from the enclosing scope: blocks with
 * h > 4 use table `filter_type >> 2`, shorter ones use table
 * `3 + ((filter_type >> 2) & 1)`.
 */
#define GET_V_FILTER(my) \
    const int8_t *const fv = !(my) ? NULL : h > 4 ? \
        dav1d_mc_subpel_filters[filter_type >> 2][(my) - 1] : \
        dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][(my) - 1]

// Declares both `fh` and `fv` from the enclosing scope's `mx` and `my`.
#define GET_FILTERS() \
    GET_H_FILTER(mx); \
    GET_V_FILTER(my)
112
113 static NOINLINE void
put_8tap_c(pixel * dst,ptrdiff_t dst_stride,const pixel * src,ptrdiff_t src_stride,const int w,int h,const int mx,const int my,const int filter_type HIGHBD_DECL_SUFFIX)114 put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
115 const pixel *src, ptrdiff_t src_stride,
116 const int w, int h, const int mx, const int my,
117 const int filter_type HIGHBD_DECL_SUFFIX)
118 {
119 const int intermediate_bits = get_intermediate_bits(bitdepth_max);
120 const int intermediate_rnd = 32 + ((1 << (6 - intermediate_bits)) >> 1);
121
122 GET_FILTERS();
123 dst_stride = PXSTRIDE(dst_stride);
124 src_stride = PXSTRIDE(src_stride);
125
126 if (fh) {
127 if (fv) {
128 int tmp_h = h + 7;
129 int16_t mid[128 * 135], *mid_ptr = mid;
130
131 src -= src_stride * 3;
132 do {
133 for (int x = 0; x < w; x++)
134 mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
135 6 - intermediate_bits);
136
137 mid_ptr += 128;
138 src += src_stride;
139 } while (--tmp_h);
140
141 mid_ptr = mid + 128 * 3;
142 do {
143 for (int x = 0; x < w; x++)
144 dst[x] = DAV1D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 128,
145 6 + intermediate_bits);
146
147 mid_ptr += 128;
148 dst += dst_stride;
149 } while (--h);
150 } else {
151 do {
152 for (int x = 0; x < w; x++) {
153 dst[x] = DAV1D_FILTER_8TAP_CLIP2(src, x, fh, 1,
154 intermediate_rnd, 6);
155 }
156
157 dst += dst_stride;
158 src += src_stride;
159 } while (--h);
160 }
161 } else if (fv) {
162 do {
163 for (int x = 0; x < w; x++)
164 dst[x] = DAV1D_FILTER_8TAP_CLIP(src, x, fv, src_stride, 6);
165
166 dst += dst_stride;
167 src += src_stride;
168 } while (--h);
169 } else
170 put_c(dst, dst_stride, src, src_stride, w, h);
171 }
172
173 static NOINLINE void
put_8tap_scaled_c(pixel * dst,const ptrdiff_t dst_stride,const pixel * src,ptrdiff_t src_stride,const int w,int h,const int mx,int my,const int dx,const int dy,const int filter_type HIGHBD_DECL_SUFFIX)174 put_8tap_scaled_c(pixel *dst, const ptrdiff_t dst_stride,
175 const pixel *src, ptrdiff_t src_stride,
176 const int w, int h, const int mx, int my,
177 const int dx, const int dy, const int filter_type
178 HIGHBD_DECL_SUFFIX)
179 {
180 const int intermediate_bits = get_intermediate_bits(bitdepth_max);
181 const int intermediate_rnd = (1 << intermediate_bits) >> 1;
182 int tmp_h = (((h - 1) * dy + my) >> 10) + 8;
183 int16_t mid[128 * (256 + 7)], *mid_ptr = mid;
184 src_stride = PXSTRIDE(src_stride);
185
186 src -= src_stride * 3;
187 do {
188 int x;
189 int imx = mx, ioff = 0;
190
191 for (x = 0; x < w; x++) {
192 GET_H_FILTER(imx >> 6);
193 mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1,
194 6 - intermediate_bits) :
195 src[ioff] << intermediate_bits;
196 imx += dx;
197 ioff += imx >> 10;
198 imx &= 0x3ff;
199 }
200
201 mid_ptr += 128;
202 src += src_stride;
203 } while (--tmp_h);
204
205 mid_ptr = mid + 128 * 3;
206 for (int y = 0; y < h; y++) {
207 int x;
208 GET_V_FILTER(my >> 6);
209
210 for (x = 0; x < w; x++)
211 dst[x] = fv ? DAV1D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 128,
212 6 + intermediate_bits) :
213 iclip_pixel((mid_ptr[x] + intermediate_rnd) >>
214 intermediate_bits);
215
216 my += dy;
217 mid_ptr += (my >> 10) * 128;
218 my &= 0x3ff;
219 dst += PXSTRIDE(dst_stride);
220 }
221 }
222
223 static NOINLINE void
prep_8tap_c(int16_t * tmp,const pixel * src,ptrdiff_t src_stride,const int w,int h,const int mx,const int my,const int filter_type HIGHBD_DECL_SUFFIX)224 prep_8tap_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
225 const int w, int h, const int mx, const int my,
226 const int filter_type HIGHBD_DECL_SUFFIX)
227 {
228 const int intermediate_bits = get_intermediate_bits(bitdepth_max);
229 GET_FILTERS();
230 src_stride = PXSTRIDE(src_stride);
231
232 if (fh) {
233 if (fv) {
234 int tmp_h = h + 7;
235 int16_t mid[128 * 135], *mid_ptr = mid;
236
237 src -= src_stride * 3;
238 do {
239 for (int x = 0; x < w; x++)
240 mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
241 6 - intermediate_bits);
242
243 mid_ptr += 128;
244 src += src_stride;
245 } while (--tmp_h);
246
247 mid_ptr = mid + 128 * 3;
248 do {
249 for (int x = 0; x < w; x++) {
250 int t = DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6) -
251 PREP_BIAS;
252 assert(t >= INT16_MIN && t <= INT16_MAX);
253 tmp[x] = t;
254 }
255
256 mid_ptr += 128;
257 tmp += w;
258 } while (--h);
259 } else {
260 do {
261 for (int x = 0; x < w; x++)
262 tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
263 6 - intermediate_bits) -
264 PREP_BIAS;
265
266 tmp += w;
267 src += src_stride;
268 } while (--h);
269 }
270 } else if (fv) {
271 do {
272 for (int x = 0; x < w; x++)
273 tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fv, src_stride,
274 6 - intermediate_bits) -
275 PREP_BIAS;
276
277 tmp += w;
278 src += src_stride;
279 } while (--h);
280 } else
281 prep_c(tmp, src, src_stride, w, h HIGHBD_TAIL_SUFFIX);
282 }
283
284 static NOINLINE void
prep_8tap_scaled_c(int16_t * tmp,const pixel * src,ptrdiff_t src_stride,const int w,int h,const int mx,int my,const int dx,const int dy,const int filter_type HIGHBD_DECL_SUFFIX)285 prep_8tap_scaled_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
286 const int w, int h, const int mx, int my,
287 const int dx, const int dy, const int filter_type
288 HIGHBD_DECL_SUFFIX)
289 {
290 const int intermediate_bits = get_intermediate_bits(bitdepth_max);
291 int tmp_h = (((h - 1) * dy + my) >> 10) + 8;
292 int16_t mid[128 * (256 + 7)], *mid_ptr = mid;
293 src_stride = PXSTRIDE(src_stride);
294
295 src -= src_stride * 3;
296 do {
297 int x;
298 int imx = mx, ioff = 0;
299
300 for (x = 0; x < w; x++) {
301 GET_H_FILTER(imx >> 6);
302 mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1,
303 6 - intermediate_bits) :
304 src[ioff] << intermediate_bits;
305 imx += dx;
306 ioff += imx >> 10;
307 imx &= 0x3ff;
308 }
309
310 mid_ptr += 128;
311 src += src_stride;
312 } while (--tmp_h);
313
314 mid_ptr = mid + 128 * 3;
315 for (int y = 0; y < h; y++) {
316 int x;
317 GET_V_FILTER(my >> 6);
318
319 for (x = 0; x < w; x++)
320 tmp[x] = (fv ? DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6)
321 : mid_ptr[x]) - PREP_BIAS;
322
323 my += dy;
324 mid_ptr += (my >> 10) * 128;
325 my &= 0x3ff;
326 tmp += w;
327 }
328 }
329
330 #define filter_fns(type, type_h, type_v) \
331 static void put_8tap_##type##_c(pixel *const dst, \
332 const ptrdiff_t dst_stride, \
333 const pixel *const src, \
334 const ptrdiff_t src_stride, \
335 const int w, const int h, \
336 const int mx, const int my \
337 HIGHBD_DECL_SUFFIX) \
338 { \
339 put_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, \
340 type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
341 } \
342 static void put_8tap_##type##_scaled_c(pixel *const dst, \
343 const ptrdiff_t dst_stride, \
344 const pixel *const src, \
345 const ptrdiff_t src_stride, \
346 const int w, const int h, \
347 const int mx, const int my, \
348 const int dx, const int dy \
349 HIGHBD_DECL_SUFFIX) \
350 { \
351 put_8tap_scaled_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \
352 type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
353 } \
354 static void prep_8tap_##type##_c(int16_t *const tmp, \
355 const pixel *const src, \
356 const ptrdiff_t src_stride, \
357 const int w, const int h, \
358 const int mx, const int my \
359 HIGHBD_DECL_SUFFIX) \
360 { \
361 prep_8tap_c(tmp, src, src_stride, w, h, mx, my, \
362 type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
363 } \
364 static void prep_8tap_##type##_scaled_c(int16_t *const tmp, \
365 const pixel *const src, \
366 const ptrdiff_t src_stride, \
367 const int w, const int h, \
368 const int mx, const int my, \
369 const int dx, const int dy \
370 HIGHBD_DECL_SUFFIX) \
371 { \
372 prep_8tap_scaled_c(tmp, src, src_stride, w, h, mx, my, dx, dy, \
373 type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
374 }
375
filter_fns(regular,DAV1D_FILTER_8TAP_REGULAR,DAV1D_FILTER_8TAP_REGULAR)376 filter_fns(regular, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_REGULAR)
377 filter_fns(regular_sharp, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SHARP)
378 filter_fns(regular_smooth, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SMOOTH)
379 filter_fns(smooth, DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SMOOTH)
380 filter_fns(smooth_regular, DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_REGULAR)
381 filter_fns(smooth_sharp, DAV1D_FILTER_8TAP_SMOOTH, DAV1D_FILTER_8TAP_SHARP)
382 filter_fns(sharp, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SHARP)
383 filter_fns(sharp_regular, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_REGULAR)
384 filter_fns(sharp_smooth, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SMOOTH)
385
/*
 * Bilinear filter helpers.
 *
 * FILTER_BILIN blends src[x] and src[x + stride] with weight mxy out of 16:
 *   16 * src[x] + mxy * (src[x + stride] - src[x]).
 * All macro arguments are parenthesized in the expansion so that expression
 * arguments (e.g. `i & 3`) address the intended samples; arguments are
 * evaluated more than once and must be free of side effects.
 */
#define FILTER_BILIN(src, x, mxy, stride) \
    (16 * (src)[(x)] + ((mxy) * ((src)[(x) + (stride)] - (src)[(x)])))

// Bilinear sum, rounded to nearest and shifted right by sh (no rounding term
// when sh is 0).
#define FILTER_BILIN_RND(src, x, mxy, stride, sh) \
    ((FILTER_BILIN(src, x, mxy, stride) + ((1 << (sh)) >> 1)) >> (sh))

// Rounded bilinear sum passed through iclip_pixel().
#define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \
    iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh))
394
395 static void put_bilin_c(pixel *dst, ptrdiff_t dst_stride,
396 const pixel *src, ptrdiff_t src_stride,
397 const int w, int h, const int mx, const int my
398 HIGHBD_DECL_SUFFIX)
399 {
400 const int intermediate_bits = get_intermediate_bits(bitdepth_max);
401 const int intermediate_rnd = (1 << intermediate_bits) >> 1;
402 dst_stride = PXSTRIDE(dst_stride);
403 src_stride = PXSTRIDE(src_stride);
404
405 if (mx) {
406 if (my) {
407 int16_t mid[128 * 129], *mid_ptr = mid;
408 int tmp_h = h + 1;
409
410 do {
411 for (int x = 0; x < w; x++)
412 mid_ptr[x] = FILTER_BILIN_RND(src, x, mx, 1,
413 4 - intermediate_bits);
414
415 mid_ptr += 128;
416 src += src_stride;
417 } while (--tmp_h);
418
419 mid_ptr = mid;
420 do {
421 for (int x = 0; x < w; x++)
422 dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my, 128,
423 4 + intermediate_bits);
424
425 mid_ptr += 128;
426 dst += dst_stride;
427 } while (--h);
428 } else {
429 do {
430 for (int x = 0; x < w; x++) {
431 const int px = FILTER_BILIN_RND(src, x, mx, 1,
432 4 - intermediate_bits);
433 dst[x] = iclip_pixel((px + intermediate_rnd) >> intermediate_bits);
434 }
435
436 dst += dst_stride;
437 src += src_stride;
438 } while (--h);
439 }
440 } else if (my) {
441 do {
442 for (int x = 0; x < w; x++)
443 dst[x] = FILTER_BILIN_CLIP(src, x, my, src_stride, 4);
444
445 dst += dst_stride;
446 src += src_stride;
447 } while (--h);
448 } else
449 put_c(dst, dst_stride, src, src_stride, w, h);
450 }
451
/*
 * Bilinear "put" with scaling: the source position advances by dx per
 * output column and dy per output row. Positions carry 10 fractional bits
 * (integer step `>> 10`, fraction masked with 0x3ff); the bilinear weight is
 * `fraction >> 6`.
 *
 * Pass 1 filters horizontally into 128-wide int16_t rows of mid; pass 2
 * filters vertically over mid and writes clipped pixels to dst.
 */
static void put_bilin_scaled_c(pixel *dst, ptrdiff_t dst_stride,
                               const pixel *src, ptrdiff_t src_stride,
                               const int w, int h, const int mx, int my,
                               const int dx, const int dy
                               HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    int16_t mid[128 * (256 + 1)];
    int16_t *mid_row = mid;
    // source rows reached by the last output row, plus the 2-tap footprint
    int rows_left = (((h - 1) * dy + my) >> 10) + 2;

    do {
        int frac = mx;    // horizontal fractional position
        int col = 0;      // integer source column

        for (int x = 0; x < w; x++) {
            mid_row[x] = FILTER_BILIN_RND(src, col, frac >> 6, 1,
                                          4 - intermediate_bits);
            frac += dx;
            col += frac >> 10;
            frac &= 0x3ff;
        }

        mid_row += 128;
        src += PXSTRIDE(src_stride);
    } while (--rows_left);

    mid_row = mid;
    do {
        for (int x = 0; x < w; x++)
            dst[x] = FILTER_BILIN_CLIP(mid_row, x, my >> 6, 128,
                                       4 + intermediate_bits);

        my += dy;
        mid_row += (my >> 10) * 128;
        my &= 0x3ff;
        dst += PXSTRIDE(dst_stride);
    } while (--h);
}
492
/*
 * Bilinear subpel "prep": like put_bilin_c, but writes packed int16_t
 * intermediates (row stride w) with PREP_BIAS subtracted instead of clipped
 * pixels. mx / my are the bilinear weights (0 = no filtering in that
 * direction); src_stride is converted with PXSTRIDE() on entry.
 */
static void prep_bilin_c(int16_t *tmp,
                         const pixel *src, ptrdiff_t src_stride,
                         const int w, int h, const int mx, const int my
                         HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    src_stride = PXSTRIDE(src_stride);

    if (!mx && !my) {
        prep_c(tmp, src, src_stride, w, h HIGHBD_TAIL_SUFFIX);
        return;
    }

    if (!mx) {
        // vertical filtering only
        do {
            for (int x = 0; x < w; x++)
                tmp[x] = FILTER_BILIN_RND(src, x, my, src_stride,
                                          4 - intermediate_bits) - PREP_BIAS;

            tmp += w;
            src += src_stride;
        } while (--h);
        return;
    }

    if (!my) {
        // horizontal filtering only
        do {
            for (int x = 0; x < w; x++)
                tmp[x] = FILTER_BILIN_RND(src, x, mx, 1,
                                          4 - intermediate_bits) -
                         PREP_BIAS;

            tmp += w;
            src += src_stride;
        } while (--h);
        return;
    }

    // 2D: horizontal pass over h + 1 rows into 128-wide rows of mid,
    // then vertical pass over mid
    int16_t mid[128 * 129];
    int16_t *mid_row = mid;
    int rows_left = h + 1;

    do {
        for (int x = 0; x < w; x++)
            mid_row[x] = FILTER_BILIN_RND(src, x, mx, 1,
                                          4 - intermediate_bits);

        mid_row += 128;
        src += src_stride;
    } while (--rows_left);

    mid_row = mid;
    do {
        for (int x = 0; x < w; x++)
            tmp[x] = FILTER_BILIN_RND(mid_row, x, my, 128, 4) -
                     PREP_BIAS;

        mid_row += 128;
        tmp += w;
    } while (--h);
}
547
/*
 * Bilinear "prep" with scaling: same two-pass scheme as put_bilin_scaled_c
 * (positions advance by dx / dy with 10 fractional bits, weight is
 * `fraction >> 6`), but the result is written as packed int16_t
 * intermediates (row stride w) with PREP_BIAS subtracted.
 */
static void prep_bilin_scaled_c(int16_t *tmp,
                                const pixel *src, ptrdiff_t src_stride,
                                const int w, int h, const int mx, int my,
                                const int dx, const int dy HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    int16_t mid[128 * (256 + 1)];
    int16_t *mid_row = mid;
    // source rows reached by the last output row, plus the 2-tap footprint
    int rows_left = (((h - 1) * dy + my) >> 10) + 2;

    do {
        int frac = mx;    // horizontal fractional position
        int col = 0;      // integer source column

        for (int x = 0; x < w; x++) {
            mid_row[x] = FILTER_BILIN_RND(src, col, frac >> 6, 1,
                                          4 - intermediate_bits);
            frac += dx;
            col += frac >> 10;
            frac &= 0x3ff;
        }

        mid_row += 128;
        src += PXSTRIDE(src_stride);
    } while (--rows_left);

    mid_row = mid;
    do {
        for (int x = 0; x < w; x++)
            tmp[x] = FILTER_BILIN_RND(mid_row, x, my >> 6, 128, 4) - PREP_BIAS;

        my += dy;
        mid_row += (my >> 10) * 128;
        my &= 0x3ff;
        tmp += w;
    } while (--h);
}
586
/*
 * Averages two int16_t intermediate blocks (packed, row stride w) into dst
 * pixels: (tmp1 + tmp2 + rnd) >> (intermediate_bits + 1), clipped with
 * iclip_pixel(). The rounding constant also adds back PREP_BIAS once per
 * input.
 */
static void avg_c(pixel *dst, const ptrdiff_t dst_stride,
                  const int16_t *tmp1, const int16_t *tmp2, const int w, int h
                  HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    const int shift = intermediate_bits + 1;
    const int round = (1 << intermediate_bits) + PREP_BIAS * 2;

    do {
        int x = 0;
        while (x < w) {
            const int sum = tmp1[x] + tmp2[x] + round;
            dst[x] = iclip_pixel(sum >> shift);
            x++;
        }

        dst += PXSTRIDE(dst_stride);
        tmp1 += w;
        tmp2 += w;
    } while (--h);
}
603
/*
 * Weighted average of two int16_t intermediate blocks (packed, row stride
 * w): tmp1 is weighted by `weight`, tmp2 by `16 - weight`; the sum is
 * rounded, shifted by intermediate_bits + 4 and clipped with iclip_pixel().
 * The rounding constant also adds back PREP_BIAS (scaled by the total
 * weight of 16).
 */
static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
                    const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
                    const int weight HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    const int shift = intermediate_bits + 4;
    const int round = (8 << intermediate_bits) + PREP_BIAS * 16;
    const int weight2 = 16 - weight;

    do {
        int x = 0;
        while (x < w) {
            const int sum = tmp1[x] * weight + tmp2[x] * weight2 + round;
            dst[x] = iclip_pixel(sum >> shift);
            x++;
        }

        dst += PXSTRIDE(dst_stride);
        tmp1 += w;
        tmp2 += w;
    } while (--h);
}
621
/*
 * Per-pixel masked blend of two int16_t intermediate blocks: tmp1 is
 * weighted by mask[x], tmp2 by 64 - mask[x]; the sum is rounded, shifted by
 * intermediate_bits + 6 and clipped with iclip_pixel(). tmp1, tmp2 and mask
 * are all packed with row stride w. The rounding constant also adds back
 * PREP_BIAS (scaled by the total weight of 64).
 */
static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
                   const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
                   const uint8_t *mask HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    const int shift = intermediate_bits + 6;
    const int round = (32 << intermediate_bits) + PREP_BIAS * 64;

    do {
        int x = 0;
        while (x < w) {
            const int m = mask[x];
            const int sum = tmp1[x] * m + tmp2[x] * (64 - m) + round;
            dst[x] = iclip_pixel(sum >> shift);
            x++;
        }

        dst += PXSTRIDE(dst_stride);
        tmp1 += w;
        tmp2 += w;
        mask += w;
    } while (--h);
}
640
// Blends a and b with weight m out of 64 (m == 0 gives a, m == 64 gives b),
// rounding to nearest. Arguments are parenthesized so expression arguments
// are weighted as a whole; they are evaluated more than once and must be
// free of side effects.
#define blend_px(a, b, m) ((((a) * (64 - (m)) + (b) * (m)) + 32) >> 6)
/*
 * Blends tmp into dst in place using a per-pixel mask:
 * dst[x] = blend_px(dst[x], tmp[x], mask[x]).
 * tmp and mask are packed with row stride w; dst advances by
 * PXSTRIDE(dst_stride) per row. h must be at least 1.
 */
static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                    const int w, int h, const uint8_t *mask)
{
    do {
        int x = 0;
        while (x < w) {
            dst[x] = blend_px(dst[x], tmp[x], mask[x]);
            x++;
        }

        mask += w;
        tmp += w;
        dst += PXSTRIDE(dst_stride);
    } while (--h);
}
654
/*
 * Blends tmp into dst in place with a per-column mask taken from
 * dav1d_obmc_masks, starting at index w. Only the first (w * 3) >> 2
 * columns of each row are blended; the remaining columns of dst are left
 * untouched. tmp is packed with row stride w.
 */
static void blend_v_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                      const int w, int h)
{
    const uint8_t *const col_mask = &dav1d_obmc_masks[w];
    const int blend_w = (w * 3) >> 2;

    do {
        int x = 0;
        while (x < blend_w) {
            dst[x] = blend_px(dst[x], tmp[x], col_mask[x]);
            x++;
        }

        tmp += w;
        dst += PXSTRIDE(dst_stride);
    } while (--h);
}
667
/*
 * Blends tmp into dst in place with a per-row mask taken from
 * dav1d_obmc_masks, starting at index h. Only the first (h * 3) >> 2 rows
 * are processed; each row uses a single mask value for all w columns.
 * tmp is packed with row stride w.
 */
static void blend_h_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                      const int w, int h)
{
    const uint8_t *row_mask = &dav1d_obmc_masks[h];
    int rows_left = (h * 3) >> 2;

    do {
        const int m = *row_mask;
        row_mask++;

        int x = 0;
        while (x < w) {
            dst[x] = blend_px(dst[x], tmp[x], m);
            x++;
        }

        tmp += w;
        dst += PXSTRIDE(dst_stride);
    } while (--rows_left);
}
682
/*
 * Blends two int16_t intermediate blocks with a weight derived from their
 * per-pixel difference, and stores that weight in `mask`.
 *
 * The weight is 38 plus the scaled absolute difference, capped at 64; tmp1
 * gets the weight, tmp2 gets 64 minus it. ss_hor / ss_ver select mask
 * subsampling:
 *  - no horizontal subsampling: one mask byte per pixel;
 *  - horizontal only: one byte per pixel pair, (m + n + 1 - sign) >> 1;
 *  - both: the mask is stored at 2x2 resolution, i.e. the 2x1 sum is stored
 *    on one row of each pair and then loaded on the other row to calculate
 *    the final value (m + n + stored + 2 - sign) >> 2. Which row does which
 *    is decided by the parity of the remaining-row counter h.
 */
static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
                     const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
                     uint8_t *mask, const int sign,
                     const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    const int bitdepth = bitdepth_from_max(bitdepth_max);
    const int sh = intermediate_bits + 6;
    const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64;
    const int mask_sh = bitdepth + intermediate_bits - 4;
    const int mask_rnd = 1 << (mask_sh - 5);

    do {
        if (ss_hor) {
            // pixels are handled in horizontal pairs sharing one mask byte
            for (int x = 0; x < w; x += 2) {
                const int x1 = x + 1;
                const int m = imin(38 + ((abs(tmp1[x] - tmp2[x]) + mask_rnd) >> mask_sh), 64);
                dst[x] = iclip_pixel((tmp1[x] * m +
                                      tmp2[x] * (64 - m) + rnd) >> sh);

                const int n = imin(38 + ((abs(tmp1[x1] - tmp2[x1]) + mask_rnd) >> mask_sh), 64);
                dst[x1] = iclip_pixel((tmp1[x1] * n +
                                       tmp2[x1] * (64 - n) + rnd) >> sh);

                if (h & ss_ver) {
                    // combine with the 2x1 sum stored by the previous row
                    mask[x1 >> 1] = (m + n + mask[x1 >> 1] + 2 - sign) >> 2;
                } else if (ss_ver) {
                    // stash the 2x1 sum for the next row
                    mask[x1 >> 1] = m + n;
                } else {
                    mask[x1 >> 1] = (m + n + 1 - sign) >> 1;
                }
            }
        } else {
            for (int x = 0; x < w; x++) {
                const int m = imin(38 + ((abs(tmp1[x] - tmp2[x]) + mask_rnd) >> mask_sh), 64);
                dst[x] = iclip_pixel((tmp1[x] * m +
                                      tmp2[x] * (64 - m) + rnd) >> sh);
                mask[x] = m;
            }
        }

        tmp1 += w;
        tmp2 += w;
        dst += PXSTRIDE(dst_stride);
        // with vertical subsampling the mask row only advances every other row
        if (!ss_ver || (h & 1)) mask += w >> ss_hor;
    } while (--h);
}
727
728 #define w_mask_fns(ssn, ss_hor, ss_ver) \
729 static void w_mask_##ssn##_c(pixel *const dst, const ptrdiff_t dst_stride, \
730 const int16_t *const tmp1, const int16_t *const tmp2, \
731 const int w, const int h, uint8_t *mask, \
732 const int sign HIGHBD_DECL_SUFFIX) \
733 { \
734 w_mask_c(dst, dst_stride, tmp1, tmp2, w, h, mask, sign, ss_hor, ss_ver \
735 HIGHBD_TAIL_SUFFIX); \
736 }
737
738 w_mask_fns(444, 0, 0);
739 w_mask_fns(422, 1, 0);
740 w_mask_fns(420, 1, 1);
741
742 #undef w_mask_fns
743
/*
 * Warp filter helpers.
 *
 * FILTER_WARP_RND evaluates the 8-tap sum of F[0..7] over
 * src[x - 3 * stride] .. src[x + 4 * stride], rounds to nearest and shifts
 * right by sh. All macro arguments are parenthesized in the expansion so
 * that expression arguments index the intended sample; arguments are
 * evaluated more than once and must be free of side effects.
 */
#define FILTER_WARP_RND(src, x, F, stride, sh) \
    (((F)[0] * (src)[(x) - 3 * (stride)] + \
      (F)[1] * (src)[(x) - 2 * (stride)] + \
      (F)[2] * (src)[(x) - 1 * (stride)] + \
      (F)[3] * (src)[(x) + 0 * (stride)] + \
      (F)[4] * (src)[(x) + 1 * (stride)] + \
      (F)[5] * (src)[(x) + 2 * (stride)] + \
      (F)[6] * (src)[(x) + 3 * (stride)] + \
      (F)[7] * (src)[(x) + 4 * (stride)] + \
      ((1 << (sh)) >> 1)) >> (sh))

// Rounded warp filter sum passed through iclip_pixel().
#define FILTER_WARP_CLIP(src, x, F, stride, sh) \
    iclip_pixel(FILTER_WARP_RND(src, x, F, stride, sh))
757
/*
 * Affine warp of one 8x8 block, written to dst pixels.
 *
 * abcd holds the four per-step increments of the filter position: abcd[0]
 * per column and abcd[1] per row for the horizontal position mx, abcd[2] per
 * column and abcd[3] per row for the vertical position my. The filter for a
 * position p is dav1d_mc_warp_filter[64 + ((p + 512) >> 10)].
 *
 * Pass 1 filters 15 source rows (3 above to 4 below the block) horizontally
 * into mid (8 values per row); pass 2 filters mid vertically into dst.
 */
static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
                              const pixel *src, const ptrdiff_t src_stride,
                              const int16_t *const abcd, int mx, int my
                              HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    int16_t mid[15 * 8];
    int16_t *mid_row = mid;

    src -= 3 * PXSTRIDE(src_stride);
    for (int y = 0; y < 15; y++) {
        int tmx = mx;

        for (int x = 0; x < 8; x++) {
            const int8_t *const filter =
                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];

            mid_row[x] = FILTER_WARP_RND(src, x, filter, 1,
                                         7 - intermediate_bits);
            tmx += abcd[0];
        }

        src += PXSTRIDE(src_stride);
        mid_row += 8;
        mx += abcd[1];
    }

    // skip the 3 lead-in rows
    mid_row = &mid[3 * 8];
    for (int y = 0; y < 8; y++) {
        int tmy = my;

        for (int x = 0; x < 8; x++) {
            const int8_t *const filter =
                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];

            dst[x] = FILTER_WARP_CLIP(mid_row, x, filter, 8,
                                      7 + intermediate_bits);
            tmy += abcd[2];
        }

        mid_row += 8;
        dst += PXSTRIDE(dst_stride);
        my += abcd[3];
    }
}
792
/*
 * Affine warp of one 8x8 block, written as int16_t intermediates with
 * PREP_BIAS subtracted ("prep" counterpart of warp_affine_8x8_c).
 *
 * abcd holds the per-column / per-row increments of the horizontal position
 * mx (abcd[0], abcd[1]) and of the vertical position my (abcd[2], abcd[3]).
 * tmp advances by tmp_stride (in int16_t elements) per output row.
 */
static void warp_affine_8x8t_c(int16_t *tmp, const ptrdiff_t tmp_stride,
                               const pixel *src, const ptrdiff_t src_stride,
                               const int16_t *const abcd, int mx, int my
                               HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    int16_t mid[15 * 8];
    int16_t *mid_row = mid;

    // horizontal pass over 15 source rows (3 above to 4 below the block)
    src -= 3 * PXSTRIDE(src_stride);
    for (int y = 0; y < 15; y++) {
        int tmx = mx;

        for (int x = 0; x < 8; x++) {
            const int8_t *const filter =
                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];

            mid_row[x] = FILTER_WARP_RND(src, x, filter, 1,
                                         7 - intermediate_bits);
            tmx += abcd[0];
        }

        src += PXSTRIDE(src_stride);
        mid_row += 8;
        mx += abcd[1];
    }

    // vertical pass, starting past the 3 lead-in rows
    mid_row = &mid[3 * 8];
    for (int y = 0; y < 8; y++) {
        int tmy = my;

        for (int x = 0; x < 8; x++) {
            const int8_t *const filter =
                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];

            tmp[x] = FILTER_WARP_RND(mid_row, x, filter, 8, 7) - PREP_BIAS;
            tmy += abcd[2];
        }

        mid_row += 8;
        tmp += tmp_stride;
        my += abcd[3];
    }
}
826
/*
 * Builds a bw x bh block in dst for a block positioned at (x, y) in a
 * reference picture of size iw x ih, replicating edge pixels for the parts
 * of the block that fall outside the picture.
 *
 * The part of the block that overlaps the picture is copied row by row and
 * extended to the left/right as it is copied; the first and last written
 * rows are then replicated upwards/downwards.
 */
static void emu_edge_c(const intptr_t bw, const intptr_t bh,
                       const intptr_t iw, const intptr_t ih,
                       const intptr_t x, const intptr_t y,
                       pixel *dst, const ptrdiff_t dst_stride,
                       const pixel *ref, const ptrdiff_t ref_stride)
{
    // find offset in reference of visible block to copy
    ref += iclip((int) y, 0, (int) ih - 1) * PXSTRIDE(ref_stride) +
           iclip((int) x, 0, (int) iw - 1);

    // number of pixels to extend on each side, each clamped to [0, size - 1]
    const int left_ext = iclip((int) -x, 0, (int) bw - 1);
    const int right_ext = iclip((int) (x + bw - iw), 0, (int) bw - 1);
    assert(left_ext + right_ext < bw);
    const int top_ext = iclip((int) -y, 0, (int) bh - 1);
    const int bottom_ext = iclip((int) (y + bh - ih), 0, (int) bh - 1);
    assert(top_ext + bottom_ext < bh);

    // size of the portion copied straight from the reference
    const int center_w = (int) (bw - left_ext - right_ext);
    const int center_h = (int) (bh - top_ext - bottom_ext);

    // copy visible portion first, extending each line sideways as we go
    pixel *blk = dst + top_ext * PXSTRIDE(dst_stride);
    for (int row = 0; row < center_h; row++) {
        pixel_copy(blk + left_ext, ref, center_w);
        // extend left edge for this line
        if (left_ext)
            pixel_set(blk, blk[left_ext], left_ext);
        // extend right edge for this line
        if (right_ext)
            pixel_set(blk + left_ext + center_w, blk[left_ext + center_w - 1],
                      right_ext);
        ref += PXSTRIDE(ref_stride);
        blk += PXSTRIDE(dst_stride);
    }

    // copy top: replicate the first written row into the rows above it
    blk = dst + top_ext * PXSTRIDE(dst_stride);
    for (int row = 0; row < top_ext; row++) {
        pixel_copy(dst, blk, bw);
        dst += PXSTRIDE(dst_stride);
    }

    // copy bottom: each new row duplicates the row directly above it
    dst += center_h * PXSTRIDE(dst_stride);
    for (int row = 0; row < bottom_ext; row++) {
        pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], bw);
        dst += PXSTRIDE(dst_stride);
    }
}
876
/*
 * Horizontal resize of h rows from src (src_w pixels wide) to dst (dst_w
 * pixels wide).
 *
 * The source position advances by dx per output pixel and carries 14
 * fractional bits (integer step `>> 14`, fraction masked with 0x3fff); mx0
 * is the initial fraction and the integer position starts at -1. Each output
 * is an 8-tap sum over source columns src_x - 3 .. src_x + 4, with column
 * indices clamped to [0, src_w - 1]. The filter, dav1d_resize_filter[mx >> 8],
 * is applied negated, then the sum is rounded, shifted by 7 and clipped.
 */
static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
                     const pixel *src, const ptrdiff_t src_stride,
                     const int dst_w, int h, const int src_w,
                     const int dx, const int mx0 HIGHBD_DECL_SUFFIX)
{
    do {
        int frac = mx0;
        int src_x = -1;

        for (int x = 0; x < dst_w; x++) {
            const int8_t *const F = dav1d_resize_filter[frac >> 8];

            int sum = 0;
            for (int k = 0; k < 8; k++)
                sum += F[k] * src[iclip(src_x - 3 + k, 0, src_w - 1)];

            dst[x] = iclip_pixel((-sum + 64) >> 7);

            frac += dx;
            src_x += frac >> 14;
            frac &= 0x3fff;
        }

        dst += PXSTRIDE(dst_stride);
        src += PXSTRIDE(src_stride);
    } while (--h);
}
904
bitfn(dav1d_mc_dsp_init)905 COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
906 #define init_mc_fns(type, name) do { \
907 c->mc [type] = put_##name##_c; \
908 c->mc_scaled [type] = put_##name##_scaled_c; \
909 c->mct [type] = prep_##name##_c; \
910 c->mct_scaled[type] = prep_##name##_scaled_c; \
911 } while (0)
912
913 init_mc_fns(FILTER_2D_8TAP_REGULAR, 8tap_regular);
914 init_mc_fns(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth);
915 init_mc_fns(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp);
916 init_mc_fns(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular);
917 init_mc_fns(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth);
918 init_mc_fns(FILTER_2D_8TAP_SHARP, 8tap_sharp);
919 init_mc_fns(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular);
920 init_mc_fns(FILTER_2D_8TAP_SMOOTH, 8tap_smooth);
921 init_mc_fns(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp);
922 init_mc_fns(FILTER_2D_BILINEAR, bilin);
923
924 c->avg = avg_c;
925 c->w_avg = w_avg_c;
926 c->mask = mask_c;
927 c->blend = blend_c;
928 c->blend_v = blend_v_c;
929 c->blend_h = blend_h_c;
930 c->w_mask[0] = w_mask_444_c;
931 c->w_mask[1] = w_mask_422_c;
932 c->w_mask[2] = w_mask_420_c;
933 c->warp8x8 = warp_affine_8x8_c;
934 c->warp8x8t = warp_affine_8x8t_c;
935 c->emu_edge = emu_edge_c;
936 c->resize = resize_c;
937
938 #if HAVE_ASM
939 #if ARCH_AARCH64 || ARCH_ARM
940 bitfn(dav1d_mc_dsp_init_arm)(c);
941 #elif ARCH_X86
942 bitfn(dav1d_mc_dsp_init_x86)(c);
943 #endif
944 #endif
945 }
946